// Copyright 2017 The Dawn & Tint Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dawn/native/vulkan/BufferVk.h"
#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>
#include <span>
#include <utility>
#include <vector>
#include "dawn/common/Assert.h"
#include "dawn/common/GPUInfo.h"
#include "dawn/native/ChainUtils.h"
#include "dawn/native/CommandBuffer.h"
#include "dawn/native/PhysicalDevice.h"
#include "dawn/native/Queue.h"
#include "dawn/native/vulkan/DeviceVk.h"
#include "dawn/native/vulkan/FencedDeleter.h"
#include "dawn/native/vulkan/QueueVk.h"
#include "dawn/native/vulkan/ResourceHeapVk.h"
#include "dawn/native/vulkan/ResourceMemoryAllocatorVk.h"
#include "dawn/native/vulkan/UtilsVulkan.h"
#include "dawn/native/vulkan/VulkanError.h"
#include "partition_alloc/pointers/raw_ptr.h"
namespace dawn::native::vulkan {
namespace {
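// Converts a WebGPU buffer usage (including Dawn-internal usages) into the corresponding
// VkBufferUsageFlags. QueryResolve maps to TRANSFER_DST because resolving queries writes into the
// buffer as a transfer operation (see the matching stage and access mappings below).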
VkBufferUsageFlags VulkanBufferUsage(wgpu::BufferUsage usage) {
VkBufferUsageFlags flags = 0;
if (usage & (wgpu::BufferUsage::CopySrc | kInternalCopySrcBuffer)) {
flags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
}
if (usage & wgpu::BufferUsage::CopyDst) {
flags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
}
if (usage & wgpu::BufferUsage::Index) {
flags |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
}
if (usage & wgpu::BufferUsage::Vertex) {
flags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
}
if (usage & wgpu::BufferUsage::Uniform) {
flags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
}
if (usage & (wgpu::BufferUsage::Storage | kInternalStorageBuffer | kReadOnlyStorageBuffer)) {
flags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
}
if (usage & wgpu::BufferUsage::Indirect) {
flags |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
}
if (usage & wgpu::BufferUsage::QueryResolve) {
flags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
}
return flags;
}
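// Computes the Vulkan pipeline stages that may touch a buffer bound with `usage`, restricted to
// the shader stages it is actually visible to. Mappable usages add the HOST stage so barriers can
// synchronize with CPU reads and writes.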
VkPipelineStageFlags VulkanPipelineStage(wgpu::BufferUsage usage, wgpu::ShaderStage shaderStage) {
VkPipelineStageFlags flags = 0;
if (usage & kMappableBufferUsages) {
flags |= VK_PIPELINE_STAGE_HOST_BIT;
}
if (usage &
(wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | kInternalCopySrcBuffer)) {
flags |= VK_PIPELINE_STAGE_TRANSFER_BIT;
}
if (usage & (wgpu::BufferUsage::Index | wgpu::BufferUsage::Vertex)) {
flags |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
}
if (usage & kShaderBufferUsages) {
if (shaderStage & wgpu::ShaderStage::Vertex) {
flags |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT;
}
if (shaderStage & wgpu::ShaderStage::Fragment) {
flags |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
}
if (shaderStage & wgpu::ShaderStage::Compute) {
flags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
}
}
if (usage & kIndirectBufferForBackendResourceTracking) {
flags |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
}
if (usage & wgpu::BufferUsage::QueryResolve) {
flags |= VK_PIPELINE_STAGE_TRANSFER_BIT;
}
return flags;
}
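// Computes the Vulkan access mask corresponding to a WebGPU buffer usage. These masks are used as
// the source and destination access masks of the buffer barriers built in
// TrackUsageAndGetResourceBarrier().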
VkAccessFlags VulkanAccessFlags(wgpu::BufferUsage usage) {
VkAccessFlags flags = 0;
if (usage & wgpu::BufferUsage::MapRead) {
flags |= VK_ACCESS_HOST_READ_BIT;
}
if (usage & wgpu::BufferUsage::MapWrite) {
flags |= VK_ACCESS_HOST_WRITE_BIT;
}
if (usage & (wgpu::BufferUsage::CopySrc | kInternalCopySrcBuffer)) {
flags |= VK_ACCESS_TRANSFER_READ_BIT;
}
if (usage & wgpu::BufferUsage::CopyDst) {
flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
}
if (usage & wgpu::BufferUsage::Index) {
flags |= VK_ACCESS_INDEX_READ_BIT;
}
if (usage & wgpu::BufferUsage::Vertex) {
flags |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
}
if (usage & wgpu::BufferUsage::Uniform) {
flags |= VK_ACCESS_UNIFORM_READ_BIT;
}
if (usage & (wgpu::BufferUsage::Storage | kInternalStorageBuffer)) {
flags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
}
if (usage & kReadOnlyStorageBuffer) {
flags |= VK_ACCESS_SHADER_READ_BIT;
}
if (usage & kIndirectBufferForBackendResourceTracking) {
flags |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
}
if (usage & wgpu::BufferUsage::QueryResolve) {
flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
}
return flags;
}
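// Chooses the MemoryKind requested from the resource memory allocator: MapRead/MapWrite request
// host-readable/host-writable memory, and usages consumed by the GPU pipeline additionally prefer
// device-local memory.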
MemoryKind GetMemoryKindFor(wgpu::BufferUsage bufferUsage) {
MemoryKind requestKind = MemoryKind::Linear;
if (bufferUsage & wgpu::BufferUsage::MapRead) {
requestKind |= MemoryKind::ReadMappable;
}
if (bufferUsage & wgpu::BufferUsage::MapWrite) {
requestKind |= MemoryKind::WriteMappable;
}
// `kDeviceLocalBufferUsages` covers all the buffer usages that prefer the memory type
// `VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT`.
constexpr wgpu::BufferUsage kDeviceLocalBufferUsages =
wgpu::BufferUsage::Index | wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::Storage |
wgpu::BufferUsage::Uniform | wgpu::BufferUsage::Vertex | kInternalStorageBuffer |
kReadOnlyStorageBuffer | kIndirectBufferForBackendResourceTracking;
if (bufferUsage & kDeviceLocalBufferUsages) {
requestKind |= MemoryKind::DeviceLocal;
}
return requestKind;
}
// Returns a Vulkan spec compliant memory range by aligning `offset` down and `size` up to multiples
// of `nonCoherentAtomSize`.
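// As an illustrative example: with an allocation offset of 128, offset == 100, size == 10 and
// nonCoherentAtomSize == 64, fullOffset is 228, alignedOffset is AlignDown(228, 64) == 192 and
// alignedSize is Align(10 + 36, 64) == 64, so the returned range covers bytes [192, 256) of the
// VkDeviceMemory (clamped to the allocation size if needed).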
VkMappedMemoryRange GetMappedMemoryRange(const ResourceMemoryAllocation& allocation,
size_t offset,
size_t size,
size_t nonCoherentAtomSize) {
DAWN_ASSERT(IsAligned(allocation.GetOffset(), nonCoherentAtomSize));
// `offset` must always be a multiple of nonCoherentAtomSize. `size` must either be a multiple
// of nonCoherentAtomSize or offset+size must be equal to the size of the allocation.
size_t fullOffset = allocation.GetOffset() + offset;
size_t alignedOffset = AlignDown(fullOffset, nonCoherentAtomSize);
size_t alignedSize = Align(size + (fullOffset - alignedOffset), nonCoherentAtomSize);
size_t allocationSize = allocation.GetInfo().mRequestedSize;
if (alignedOffset + alignedSize > allocationSize) {
alignedSize = allocationSize - alignedOffset;
}
return VkMappedMemoryRange{.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
.pNext = nullptr,
.memory = ToBackend(allocation.GetResourceHeap())->GetMemory(),
.offset = alignedOffset,
.size = alignedSize};
}
} // namespace
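// Buffer creation has two paths: regular buffers get their memory from the device's
// ResourceMemoryAllocator in Initialize(), while buffers created from a BufferHostMappedPointer
// import the application-provided allocation in InitializeHostMapped().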
// static
ResultOrError<Ref<Buffer>> Buffer::Create(Device* device,
const UnpackedPtr<BufferDescriptor>& descriptor) {
Ref<Buffer> buffer = AcquireRef(new Buffer(device, descriptor));
if (auto* hostMappedDesc = descriptor.Get<BufferHostMappedPointer>()) {
DAWN_TRY(buffer->InitializeHostMapped(hostMappedDesc));
} else {
DAWN_TRY(buffer->Initialize(descriptor->mappedAtCreation));
}
return std::move(buffer);
}
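// Regular initialization: computes the allocated size (with padding for vertex/index binding,
// clamped accesses and vkCmdFillBuffer alignment), creates the VkBuffer, allocates and binds
// sub-allocated memory, and performs the clears required by the nonzero-clear and lazy-clear
// toggles.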
MaybeError Buffer::Initialize(bool mappedAtCreation) {
// vkCmdFillBuffer requires the size to be a multiple of 4.
constexpr size_t kAlignment = 4u;
uint32_t extraBytes = 0u;
if (GetInternalUsage() & (wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Index)) {
// vkCmdBindIndexBuffer and vkCmdBindVertexBuffers are invalid if the offset
// is equal to the whole buffer size. Allocate at least one more byte so it
// is valid to call setVertexBuffer/setIndexBuffer with a zero-sized range at
// the end of the buffer with (offset=buffer.size, size=0).
extraBytes = 1u;
}
uint64_t size = GetSize();
if (size > std::numeric_limits<uint64_t>::max() - extraBytes) {
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
}
size += extraBytes;
// Allocate at least 4 bytes so clamped accesses are always in bounds.
// Also, Vulkan requires the size to be non-zero.
size = std::max(size, uint64_t(4u));
if (size > std::numeric_limits<uint64_t>::max() - kAlignment) {
// Alignment would overflow.
return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
}
mAllocatedSize = Align(size, kAlignment);
// Round uniform buffer sizes up to a multiple of 16 bytes since Tint will polyfill them as
// array<vec4u, ...>.
if (GetUsage() & wgpu::BufferUsage::Uniform) {
mAllocatedSize = Align(size, 16u);
}
// Avoid passing ludicrously large sizes to drivers because it causes issues: drivers add
// some constants to the size passed and align it, but for values close to the maximum
// VkDeviceSize this can cause overflows and make drivers crash or return bad sizes in the
// VkMemoryRequirements. See https://gitlab.khronos.org/vulkan/vulkan/issues/1904
// Any size with one of the two top bits of VkDeviceSize set is a HUGE allocation and we can
// safely return an OOM error.
if (mAllocatedSize & (uint64_t(3) << uint64_t(62))) {
return DAWN_OUT_OF_MEMORY_ERROR("Buffer size is HUGE and could cause overflows");
}
VkBufferCreateInfo createInfo;
createInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
createInfo.pNext = nullptr;
createInfo.flags = 0;
createInfo.size = mAllocatedSize;
// Add CopyDst for non-mappable buffer initialization with mappedAtCreation
// and robust resource initialization.
createInfo.usage = VulkanBufferUsage(GetInternalUsage() | wgpu::BufferUsage::CopyDst);
createInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
createInfo.queueFamilyIndexCount = 0;
createInfo.pQueueFamilyIndices = nullptr;
Device* device = ToBackend(GetDevice());
DAWN_TRY(CheckVkOOMThenSuccess(
device->fn.CreateBuffer(device->GetVkDevice(), &createInfo, nullptr, &*mHandle),
"vkCreateBuffer"));
// Gather requirements for the buffer's memory and allocate it.
VkMemoryRequirements requirements;
device->fn.GetBufferMemoryRequirements(device->GetVkDevice(), mHandle, &requirements);
MemoryKind requestKind = GetMemoryKindFor(GetInternalUsage());
DAWN_TRY_ASSIGN(mMemoryAllocation,
device->GetResourceMemoryAllocator()->Allocate(requirements, requestKind));
// Finally associate it with the buffer.
DAWN_TRY(CheckVkSuccess(
device->fn.BindBufferMemory(device->GetVkDevice(), mHandle,
ToBackend(mMemoryAllocation.GetResourceHeap())->GetMemory(),
mMemoryAllocation.GetOffset()),
"vkBindBufferMemory"));
// Determine whether the buffer is host visible and coherent. This can be the case even if the
// buffer was not created with map usages, as on integrated GPUs all memory will typically be host
// visible.
const size_t memoryType = ToBackend(mMemoryAllocation.GetResourceHeap())->GetMemoryType();
const VkMemoryPropertyFlags memoryPropertyFlags =
device->GetDeviceInfo().memoryTypes[memoryType].propertyFlags;
mHostVisible = IsSubset(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, memoryPropertyFlags);
mHostCoherent = IsSubset(VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, memoryPropertyFlags);
// The buffers with mappedAtCreation == true will be initialized in BufferBase::MapAtCreation().
if (!mappedAtCreation) {
uint32_t paddingClearSize = Align(GetAllocatedSize() - GetSize(), 4);
uint64_t paddingClearOffset = GetAllocatedSize() - paddingClearSize;
if (mHostVisible && GetSize() > 0) {
// For host visible buffers do initialization on CPU to avoid a GPU write that
// interferes with using the UploadData() fast path.
if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
DAWN_TRY(MapMemoryAndPerformOperation(
0, mAllocatedSize,
[](std::span<uint8_t> mapped) { std::ranges::fill(mapped, 0x01); }));
}
if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) &&
paddingClearSize > 0) {
DAWN_TRY(
MapMemoryAndPerformOperation(paddingClearOffset, paddingClearSize,
[&paddingClearSize](std::span<uint8_t> mapped) {
DAWN_ASSERT(mapped.size() == paddingClearSize);
std::ranges::fill(mapped, 0x0);
}));
}
} else {
if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
ClearBuffer(ToBackend(device->GetQueue())->GetPendingRecordingContext(),
0x01010101);
}
if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) &&
paddingClearSize > 0) {
CommandRecordingContext* recordingContext =
ToBackend(device->GetQueue())->GetPendingRecordingContext();
ClearBuffer(recordingContext, 0, paddingClearOffset, paddingClearSize);
}
}
}
SetLabelImpl();
return {};
}
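// Host-mapped initialization: wraps an application-provided pointer by creating the VkBuffer with
// external-memory usage, importing the pointer as a dedicated VkDeviceMemory through
// VkImportMemoryHostPointerInfoEXT, and binding the two together. The data is considered
// initialized since it comes from the application.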
MaybeError Buffer::InitializeHostMapped(const BufferHostMappedPointer* hostMappedDesc) {
static constexpr auto kHandleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
mAllocatedSize = GetSize();
VkExternalMemoryBufferCreateInfo externalMemoryCreateInfo;
externalMemoryCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO;
externalMemoryCreateInfo.pNext = nullptr;
externalMemoryCreateInfo.handleTypes = kHandleType;
VkBufferCreateInfo createInfo;
createInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
createInfo.pNext = &externalMemoryCreateInfo;
createInfo.flags = 0;
createInfo.size = mAllocatedSize;
createInfo.usage = VulkanBufferUsage(GetInternalUsage());
createInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
createInfo.queueFamilyIndexCount = 0;
createInfo.pQueueFamilyIndices = nullptr;
Device* device = ToBackend(GetDevice());
DAWN_TRY(CheckVkOOMThenSuccess(
device->fn.CreateBuffer(device->GetVkDevice(), &createInfo, nullptr, &*mHandle),
"vkCreateBuffer"));
// Gather requirements for the buffer's memory and allocate it.
VkMemoryRequirements requirements;
device->fn.GetBufferMemoryRequirements(device->GetVkDevice(), mHandle, &requirements);
// Gather memory requirements from the pointer.
VkMemoryHostPointerPropertiesEXT hostPointerProperties;
hostPointerProperties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
hostPointerProperties.pNext = nullptr;
DAWN_TRY(CheckVkSuccess(
device->fn.GetMemoryHostPointerPropertiesEXT(
device->GetVkDevice(), VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
hostMappedDesc->pointer, &hostPointerProperties),
"vkGetHostPointerPropertiesEXT"));
// Merge the memory type requirements from buffer and the host pointer.
// Don't do this on SwiftShader which reports incompatible memory types even though there
// is no real Device/Host distinction.
if (!gpu_info::IsGoogleSwiftshader(GetDevice()->GetPhysicalDevice()->GetVendorId(),
GetDevice()->GetPhysicalDevice()->GetDeviceId())) {
requirements.memoryTypeBits &= hostPointerProperties.memoryTypeBits;
}
// We can choose memory type with `requirements.memoryTypeBits` only because host-mapped memory
// - is CPU-visible
// - is device-local on UMA
// - cannot be non-device-local on non-UMA
MemoryKind requestKind = MemoryKind::Linear;
int memoryTypeIndex =
device->GetResourceMemoryAllocator()->FindBestTypeIndex(requirements, requestKind);
DAWN_INVALID_IF(memoryTypeIndex < 0, "Failed to find suitable memory type.");
// Make a device memory wrapping the host pointer.
VkMemoryAllocateInfo allocateInfo;
allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocateInfo.pNext = nullptr;
allocateInfo.allocationSize = mAllocatedSize;
allocateInfo.memoryTypeIndex = memoryTypeIndex;
VkImportMemoryHostPointerInfoEXT importMemoryHostPointerInfo;
importMemoryHostPointerInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
importMemoryHostPointerInfo.pNext = nullptr;
importMemoryHostPointerInfo.handleType = kHandleType;
importMemoryHostPointerInfo.pHostPointer = hostMappedDesc->pointer;
allocateInfo.pNext = &importMemoryHostPointerInfo;
DAWN_TRY(CheckVkSuccess(device->fn.AllocateMemory(device->GetVkDevice(), &allocateInfo, nullptr,
&*mDedicatedDeviceMemory),
"vkAllocateMemory"));
// Finally associate it with the buffer.
DAWN_TRY(CheckVkSuccess(
device->fn.BindBufferMemory(device->GetVkDevice(), mHandle, mDedicatedDeviceMemory, 0),
"vkBindBufferMemory"));
mHostMappedDisposeCallback = hostMappedDesc->disposeCallback;
mHostMappedDisposeUserdata = hostMappedDesc->userdata;
SetLabelImpl();
// Assume the data is initialized since an external pointer was provided.
SetInitialized(true);
return {};
}
Buffer::~Buffer() = default;
VkBuffer Buffer::GetHandle() const {
return mHandle;
}
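// Emits the barrier (if any) needed before the buffer can be used with `usage` from `shaderStage`
// in the commands recorded in `recordingContext`.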
void Buffer::TransitionUsageNow(CommandRecordingContext* recordingContext,
wgpu::BufferUsage usage,
wgpu::ShaderStage shaderStage) {
recordingContext->CheckBufferNeedsEagerTransition(this, usage);
BufferBarrier barrier = TrackUsageAndGetResourceBarrier(usage, shaderStage);
recordingContext->EmitBufferBarrierIfNecessary(ToBackend(GetDevice()), barrier);
}
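// Usage tracking state machine: the buffer remembers the last write usage/stage and the read
// usages/stages accumulated since that write. Read-after-read needs no barrier, a new read waits
// on the last write, and a new write waits on the reads recorded since the last write (or on the
// last write itself if there were none).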
BufferBarrier Buffer::TrackUsageAndGetResourceBarrier(wgpu::BufferUsage usage,
wgpu::ShaderStage shaderStage) {
if (shaderStage == wgpu::ShaderStage::None) {
// If the buffer isn't used in any shader stage, ignore shader usages, e.g. a uniform
// buffer that isn't actually read in any shader.
usage &= ~kShaderBufferUsages;
}
const bool isMapUsage = usage & kMappableBufferUsages;
if (isMapUsage) {
DAWN_ASSERT(shaderStage == wgpu::ShaderStage::None);
DAWN_ASSERT(IsSubset(usage, kMappableBufferUsages));
// HOST->GPU barriers aren't required. For MapRead, vkQueueSubmit() happens-after all CPU
// reads are complete. For MapWrite, the writes are made available to the "Host domain" with
// vkFlushMappedMemory() (or buffers are coherent) and there is an implicit "Host domain" ->
// "Device domain" barrier in vkQueueSubmit().
} else {
// The requested usage is not a CPU (map) usage, so assume the buffer will be used in pending
// commands.
MarkUsedInPendingCommands();
}
const bool readOnly = IsSubset(usage, kReadOnlyBufferUsages);
VkAccessFlags srcAccess = 0;
VkPipelineStageFlags srcStage = 0;
if (readOnly) {
if ((shaderStage & wgpu::ShaderStage::Fragment) &&
(mReadShaderStages & wgpu::ShaderStage::Vertex)) {
// There is an implicit vertex->fragment dependency, so if the vertex stage has already
// waited, there is no need for fragment to wait. Add the fragment usage so we know to
// wait for it before the next write.
mReadShaderStages |= wgpu::ShaderStage::Fragment;
}
if (IsSubset(usage, mReadUsage) && IsSubset(shaderStage, mReadShaderStages)) {
// This usage and shader stage has already waited for the last write.
// No need for another barrier.
return {};
}
if (usage & kReadOnlyShaderBufferUsages) {
// Preemptively transition to all read-only shader buffer usages if one is used to
// avoid unnecessary barriers later.
usage |= GetInternalUsage() & kReadOnlyShaderBufferUsages;
}
if (!isMapUsage) {
mReadUsage |= usage;
mReadShaderStages |= shaderStage;
}
if (mLastWriteUsage == wgpu::BufferUsage::None) {
// Read dependency with no prior writes. No barrier needed.
return {};
}
// Write -> read barrier.
srcAccess = VulkanAccessFlags(mLastWriteUsage);
srcStage = VulkanPipelineStage(mLastWriteUsage, mLastWriteShaderStage);
} else {
bool skipBarrier = false;
if (mLastWriteUsage == wgpu::BufferUsage::None && mReadUsage == wgpu::BufferUsage::None) {
// The buffer has never been used so we don't need a barrier.
skipBarrier = true;
} else if (mReadUsage == wgpu::BufferUsage::None) {
// No reads since the last write.
// Write -> write barrier.
srcAccess = VulkanAccessFlags(mLastWriteUsage);
srcStage = VulkanPipelineStage(mLastWriteUsage, mLastWriteShaderStage);
} else {
// Read -> write barrier.
srcAccess = VulkanAccessFlags(mReadUsage);
srcStage = VulkanPipelineStage(mReadUsage, mReadShaderStages);
}
if (!isMapUsage) {
mLastWriteUsage = usage;
mLastWriteShaderStage = shaderStage;
mReadUsage = wgpu::BufferUsage::None;
mReadShaderStages = wgpu::ShaderStage::None;
}
if (skipBarrier) {
return {};
}
}
return BufferBarrier{.srcAccessMask = srcAccess,
.dstAccessMask = VulkanAccessFlags(usage),
.srcStages = srcStage,
.dstStages = VulkanPipelineStage(usage, shaderStage)};
}
bool Buffer::IsCPUWritableAtCreation() const {
// TODO(enga): Handle CPU-visible memory on UMA
return mMemoryAllocation.GetMappedPointer() != nullptr;
}
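// Host-visible allocations are persistently mapped by the memory allocator, so mapping itself is
// a no-op here; GetMappedPointerImpl() returns the persistent pointer and the required flushes and
// invalidations happen in FinalizeMapImpl() and UnmapImpl().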
MaybeError Buffer::MapAtCreationImpl() {
return {};
}
MaybeError Buffer::MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) {
return {};
}
MaybeError Buffer::FinalizeMapImpl(BufferState newState) {
Device* device = ToBackend(GetDevice());
if (NeedsInitialization() && GetSize() > 0 && newState == BufferState::Mapped) {
// Clear full allocated size, including padding bytes, except for zero sized buffers. For
// zero sized buffers GetMappedPointerImpl() points to const data which we can't clear.
std::memset(GetMappedPointerImpl(), 0, GetAllocatedSize());
GetDevice()->IncrementLazyClearCountForTesting();
SetInitialized(true);
// If the buffer is non-coherent then make sure either the whole buffer will be flushed
// later or flush it now.
if (!mHostCoherent &&
(MapMode() == wgpu::MapMode::Read || MapOffset() != 0 || MapSize() != GetSize())) {
VkDeviceSize nonCoherentAtomSize =
device->GetDeviceInfo().properties.limits.nonCoherentAtomSize;
VkMappedMemoryRange range =
GetMappedMemoryRange(mMemoryAllocation, 0, GetAllocatedSize(), nonCoherentAtomSize);
device->fn.FlushMappedMemoryRanges(device->GetVkDevice(), 1, &range);
}
}
if (!mHostCoherent) {
// Map reads always require invalidation. Map writes only require invalidation if the buffer
// contents could have been modified by the GPU previously, eg. it's not being mapped on
// creation and the buffer is GPU writable.
if (MapMode() == wgpu::MapMode::Read ||
(newState != BufferState::MappedAtCreation &&
!IsSubset(GetInternalUsage(), kReadOnlyBufferUsages | wgpu::BufferUsage::MapWrite))) {
VkDeviceSize nonCoherentAtomSize =
device->GetDeviceInfo().properties.limits.nonCoherentAtomSize;
VkMappedMemoryRange range = GetMappedMemoryRange(mMemoryAllocation, MapOffset(),
MapSize(), nonCoherentAtomSize);
device->fn.InvalidateMappedMemoryRanges(device->GetVkDevice(), 1, &range);
}
}
return {};
}
void Buffer::UnmapImpl(BufferState oldState, BufferState newState) {
// We keep CPU-visible memory mapped at all times but need to flush writes to GPU memory here.
if (!mHostCoherent && IsMappedState(oldState) && MapMode() == wgpu::MapMode::Write) {
Device* device = ToBackend(GetDevice());
VkDeviceSize nonCoherentAtomSize =
device->GetDeviceInfo().properties.limits.nonCoherentAtomSize;
VkMappedMemoryRange range =
GetMappedMemoryRange(mMemoryAllocation, MapOffset(), MapSize(), nonCoherentAtomSize);
device->fn.FlushMappedMemoryRanges(device->GetVkDevice(), 1, &range);
}
}
void* Buffer::GetMappedPointerImpl() {
uint8_t* memory = mMemoryAllocation.GetMappedPointer();
DAWN_ASSERT(memory != nullptr);
return memory;
}
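// Fast path for data uploads: when the buffer is host visible, not in use on the GPU and has no
// writes that still need a barrier, the data is written through a CPU mapping directly instead of
// going through the scratch-buffer path in BufferBase::UploadData().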
MaybeError Buffer::UploadData(uint64_t bufferOffset, const void* data, size_t size) {
if (size == 0) {
return {};
}
Device* device = ToBackend(GetDevice());
const bool isInUse = GetLastUsageSerial() > device->GetQueue()->GetCompletedCommandSerial();
// Check if the buffer might have pending writes on the GPU. Even if the write workload has
// finished, the write may still need a barrier to make the write available. For MapWrite
// buffers we know the GPU->HOST barrier was eagerly inserted. For other buffers we don't know
// if the right barrier was inserted and assume it wasn't.
const bool hasPendingWrites = !(GetInternalUsage() & wgpu::BufferUsage::MapWrite ||
mLastWriteUsage == wgpu::BufferUsage::None);
if (isInUse || hasPendingWrites || !mHostVisible) {
// Write to scratch buffer and copy into final destination buffer.
return BufferBase::UploadData(bufferOffset, data, size);
}
// Buffer does not have any pending uses and is CPU writable. We can map the buffer directly
// and write the contents, skipping the scratch buffer.
// If the buffer needs initialization, request that the full buffer be mapped so it can be zeroed.
bool needsZeroInitialization = NeedsInitialization() && size < GetSize();
uint64_t mapSize = needsZeroInitialization ? mAllocatedSize : size;
uint64_t mapOffset = needsZeroInitialization ? 0 : bufferOffset;
return MapMemoryAndPerformOperation(mapOffset, mapSize, [&](std::span<uint8_t> mapped) {
uint64_t dstOffset = 0;
if (needsZeroInitialization) {
DAWN_ASSERT(mapped.size() == mAllocatedSize);
std::ranges::fill(mapped, 0x0);
GetDevice()->IncrementLazyClearCountForTesting();
dstOffset = bufferOffset;
}
// The buffer is always initialized here, either by explicit zero initialization
// above or memcpy below.
SetInitialized(true);
DAWN_ASSERT(mapped.size() >= dstOffset + size);
memcpy(mapped.data() + dstOffset, data, size);
});
}
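// Maps [requestedOffset, requestedOffset + requestedSize) on the CPU, invokes `op` with a span
// covering exactly that range, and performs the invalidate/flush required for non-coherent memory.
// Mappable buffers reuse their persistent mapping; other host-visible buffers are temporarily
// mapped with vkMapMemory and unmapped afterwards.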
template <typename F>
MaybeError Buffer::MapMemoryAndPerformOperation(uint64_t requestedOffset,
size_t requestedSize,
F&& op) {
Device* device = ToBackend(GetDevice());
const bool isMappable = GetInternalUsage() & kMappableBufferUsages;
DAWN_ASSERT(mHostVisible);
DAWN_ASSERT(GetLastUsageSerial() <= device->GetQueue()->GetCompletedCommandSerial());
VkDeviceMemory deviceMemory = ToBackend(mMemoryAllocation.GetResourceHeap())->GetMemory();
uint8_t* memory = nullptr;
uint64_t realOffset = requestedOffset;
if (isMappable) {
// Mappable buffers are already persistently mapped.
memory = mMemoryAllocation.GetMappedPointer();
} else {
// TODO(crbug.com/dawn/774): Persistently map frequently updated buffers instead of
// mapping/unmapping each time.
VkDeviceSize offset = mMemoryAllocation.GetOffset();
VkDeviceSize mapSize = mAllocatedSize;
if (mHostCoherent) {
// We can map only the part of the buffer we need to upload the data.
// We avoid this for non-coherent memory as the mapping needs to be aligned to
// nonCoherentAtomSize.
offset += requestedOffset;
mapSize = requestedSize;
realOffset = 0;
}
void* mappedPointer;
DAWN_TRY(CheckVkSuccess(device->fn.MapMemory(device->GetVkDevice(), deviceMemory, offset,
mapSize, 0, &mappedPointer),
"vkMapMemory"));
memory = static_cast<uint8_t*>(mappedPointer);
}
VkMappedMemoryRange mappedMemoryRange = {};
mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
mappedMemoryRange.memory = deviceMemory;
mappedMemoryRange.offset = mMemoryAllocation.GetOffset();
mappedMemoryRange.size = mAllocatedSize;
if (!mHostCoherent) {
// For non-coherent memory we need to explicitly invalidate the memory range to make
// available GPU writes visible.
device->fn.InvalidateMappedMemoryRanges(device->GetVkDevice(), 1, &mappedMemoryRange);
}
// Pass a span that is exactly the offset/size requested even if a larger range was mapped.
op(std::span(memory + realOffset, requestedSize));
if (!mHostCoherent) {
// For non-coherent memory we need to explicitly flush the memory range to make the host
// write visible.
// TODO(crbug.com/dawn/774): Batch the flush calls instead of doing one per writeBuffer.
device->fn.FlushMappedMemoryRanges(device->GetVkDevice(), 1, &mappedMemoryRange);
}
if (!isMappable) {
device->fn.UnmapMemory(device->GetVkDevice(), deviceMemory);
}
return {};
}
void Buffer::DestroyImpl() {
// TODO(crbug.com/dawn/831): DestroyImpl is called from two places.
// - It may be called if the buffer is explicitly destroyed with APIDestroy.
// This case is NOT thread-safe and needs proper synchronization with other
// simultaneous uses of the buffer.
// - It may be called when the last ref to the buffer is dropped and the buffer
// is implicitly destroyed. This case is thread-safe because there are no
// other threads using the buffer since there are no other live refs.
BufferBase::DestroyImpl();
ToBackend(GetDevice())->GetResourceMemoryAllocator()->Deallocate(&mMemoryAllocation);
if (mHandle != VK_NULL_HANDLE) {
ToBackend(GetDevice())->GetFencedDeleter()->DeleteWhenUnused(mHandle);
mHandle = VK_NULL_HANDLE;
}
if (mDedicatedDeviceMemory != VK_NULL_HANDLE) {
ToBackend(GetDevice())->GetFencedDeleter()->DeleteWhenUnused(mDedicatedDeviceMemory);
mDedicatedDeviceMemory = VK_NULL_HANDLE;
}
if (mHostMappedDisposeCallback) {
struct DisposeTask : TrackTaskCallback {
explicit DisposeTask(wgpu::Callback callback, void* userdata)
: TrackTaskCallback(nullptr), callback(callback), userdata(userdata) {}
~DisposeTask() override = default;
void FinishImpl() override { callback(userdata); }
void HandleDeviceLossImpl() override { callback(userdata); }
void HandleShutDownImpl() override { callback(userdata); }
wgpu::Callback callback;
raw_ptr<void, DisableDanglingPtrDetection> userdata;
};
std::unique_ptr<DisposeTask> request =
std::make_unique<DisposeTask>(mHostMappedDisposeCallback, mHostMappedDisposeUserdata);
mHostMappedDisposeCallback = nullptr;
GetDevice()->GetQueue()->TrackPendingTask(std::move(request));
}
}
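// Lazy zero-initialization: the helpers below clear the buffer before its first GPU use unless the
// upcoming operation is guaranteed to overwrite the entire buffer range.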
bool Buffer::EnsureDataInitialized(CommandRecordingContext* recordingContext) {
if (!NeedsInitialization()) {
return false;
}
InitializeToZero(recordingContext);
return true;
}
bool Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* recordingContext,
uint64_t offset,
uint64_t size) {
if (!NeedsInitialization()) {
return false;
}
if (IsFullBufferRange(offset, size)) {
SetInitialized(true);
return false;
}
InitializeToZero(recordingContext);
return true;
}
bool Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* recordingContext,
const CopyTextureToBufferCmd* copy) {
if (!NeedsInitialization()) {
return false;
}
if (IsFullBufferOverwrittenInTextureToBufferCopy(copy)) {
SetInitialized(true);
return false;
}
InitializeToZero(recordingContext);
return true;
}
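// Transitions all the given mappable buffers to their map usage, merging the individual barriers
// into one, so the GPU->HOST synchronization is already recorded before the buffers are mapped.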
// static
void Buffer::TransitionMappableBuffersEagerly(Device* device,
CommandRecordingContext* recordingContext,
const absl::flat_hash_set<Ref<Buffer>>& buffers) {
DAWN_ASSERT(!buffers.empty());
size_t originalBufferCount = buffers.size();
BufferBarrier barrier;
for (const Ref<Buffer>& buffer : buffers) {
wgpu::BufferUsage mapUsage = buffer->GetInternalUsage() & kMappableBufferUsages;
barrier.Merge(buffer->TrackUsageAndGetResourceBarrier(mapUsage, wgpu::ShaderStage::None));
}
// TrackUsageAndGetResourceBarrier() should not modify recordingContext for map usages.
DAWN_ASSERT(buffers.size() == originalBufferCount);
recordingContext->EmitBufferBarrierIfNecessary(device, barrier);
}
void Buffer::SetLabelImpl() {
SetDebugName(ToBackend(GetDevice()), mHandle, "Dawn_Buffer", GetLabel());
}
void Buffer::InitializeToZero(CommandRecordingContext* recordingContext) {
DAWN_ASSERT(NeedsInitialization());
ClearBuffer(recordingContext, 0u);
GetDevice()->IncrementLazyClearCountForTesting();
SetInitialized(true);
}
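// Records a fill of `clearValue` over [offset, offset + size) with vkCmdFillBuffer, defaulting to
// the whole allocated size when `size` is 0, after transitioning the buffer to CopyDst since the
// fill is a transfer operation.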
void Buffer::ClearBuffer(CommandRecordingContext* recordingContext,
uint32_t clearValue,
uint64_t offset,
uint64_t size) {
DAWN_ASSERT(recordingContext != nullptr);
size = size > 0 ? size : GetAllocatedSize();
DAWN_ASSERT(size > 0);
TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);
Device* device = ToBackend(GetDevice());
// VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
// Note: Allocated size must be a multiple of 4.
DAWN_ASSERT(size % 4 == 0);
device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, offset, size, clearValue);
}
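// BufferBarrier accumulates the access and stage masks of one or more buffer transitions so the
// recording context can emit them as a single barrier. It is considered empty while either stage
// mask is still unset.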
bool BufferBarrier::IsEmpty() const {
return srcStages == 0 || dstStages == 0;
}
void BufferBarrier::Merge(const BufferBarrier& other) {
srcAccessMask |= other.srcAccessMask;
dstAccessMask |= other.dstAccessMask;
srcStages |= other.srcStages;
dstStages |= other.dstStages;
}
} // namespace dawn::native::vulkan