D3D11: Unmap buffers before GPU use via TrackUsage() This change ensures that D3D11 buffers are properly unmapped before being used by the GPU. A new TrackUsage() method is added to Buffer that checks if the buffer is already tracked for the current command serial, and if not, unmaps the buffer and marks it as used. Key changes: - Added Buffer::TrackUsage() which unmaps the buffer if needed before marking it as used in pending commands - TrackUsage() is called before any GPU operation on buffers - Removed deferred unmap mechanism (DeferUnmap, PerformDeferredUnmaps) - Added isInitialWrite parameter to WriteInternal() to enable mapping optimization for clear operations Bug: chromium:422741977 Change-Id: Icd7dd2333f8576d47a43575120fd9d86b8f26705 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/288115 Reviewed-by: Corentin Wallez <cwallez@chromium.org> Commit-Queue: Quyen Le <lehoangquyen@chromium.org>
diff --git a/src/dawn/native/Buffer.cpp b/src/dawn/native/Buffer.cpp index 6e75513..2c29f56 100644 --- a/src/dawn/native/Buffer.cpp +++ b/src/dawn/native/Buffer.cpp
@@ -1026,11 +1026,16 @@ } void BufferBase::MarkUsedInPendingCommands() { + // TODO(crbug.com/422741977): Consider storing the pending serial once, perhaps in a + // CommandRecordingContextBase, so that we don't need to load the atomic value repeatedly. + MarkUsedInPendingCommands(GetDevice()->GetQueue()->GetPendingCommandSerial()); +} + +void BufferBase::MarkUsedInPendingCommands(ExecutionSerial pendingSerial) { DAWN_ASSERT(!GetDevice()->IsValidationEnabled() || mState.load(std::memory_order::relaxed) == BufferState::InUse); - ExecutionSerial serial = GetDevice()->GetQueue()->GetPendingCommandSerial(); - DAWN_ASSERT(serial >= mLastUsageSerial); - mLastUsageSerial = serial; + DAWN_ASSERT(pendingSerial >= mLastUsageSerial); + mLastUsageSerial = pendingSerial; } ExecutionSerial BufferBase::GetLastUsageSerial() const {
diff --git a/src/dawn/native/Buffer.h b/src/dawn/native/Buffer.h index 58d5f2a..91070b5 100644 --- a/src/dawn/native/Buffer.h +++ b/src/dawn/native/Buffer.h
@@ -153,6 +153,7 @@ bool IsFullBufferRange(uint64_t offset, uint64_t size) const; bool NeedsInitialization() const; void MarkUsedInPendingCommands(); + void MarkUsedInPendingCommands(ExecutionSerial pendingSerial); virtual MaybeError UploadData(uint64_t bufferOffset, const void* data, size_t size); // SharedResource impl.
diff --git a/src/dawn/native/d3d11/BufferD3D11.cpp b/src/dawn/native/d3d11/BufferD3D11.cpp index 29b15c1..0a69e62 100644 --- a/src/dawn/native/d3d11/BufferD3D11.cpp +++ b/src/dawn/native/d3d11/BufferD3D11.cpp
@@ -215,7 +215,8 @@ Buffer* destination, uint64_t destinationOffset) override { return destination->WriteInternal(commandContext, destinationOffset, - mUploadData.get() + sourceOffset, size); + mUploadData.get() + sourceOffset, size, + /*isInitialWrite=*/false); } MaybeError CopyFromD3DInternal(const ScopedCommandRecordingContext* commandContext, @@ -231,7 +232,8 @@ MaybeError WriteInternal(const ScopedCommandRecordingContext* commandContext, uint64_t offset, const void* data, - size_t size) override { + size_t size, + bool isInitialWrite) override { const auto* src = static_cast<const uint8_t*>(data); std::copy(src, src + size, mUploadData.get() + offset); return {}; @@ -321,6 +323,7 @@ size_t size, Buffer* destination, uint64_t destinationOffset) override { + DAWN_TRY(TrackUsage(commandContext, GetDevice()->GetQueue()->GetPendingCommandSerial())); return destination->CopyFromD3DInternal(commandContext, mD3d11Buffer.Get(), sourceOffset, size, destinationOffset); } @@ -330,6 +333,8 @@ uint64_t sourceOffset, size_t size, uint64_t destinationOffset) override { + DAWN_TRY(TrackUsage(commandContext, GetDevice()->GetQueue()->GetPendingCommandSerial())); + D3D11_BOX srcBox; srcBox.left = static_cast<UINT>(sourceOffset); srcBox.top = 0; @@ -352,7 +357,8 @@ MaybeError WriteInternal(const ScopedCommandRecordingContext* commandContext, uint64_t offset, const void* data, - size_t size) override { + size_t size, + bool isInitialWrite) override { if (size == 0) { return {}; } @@ -524,6 +530,17 @@ UnmapInternal(commandContext); } +MaybeError Buffer::TrackUsage(const ScopedCommandRecordingContext* commandContext, + ExecutionSerial pendingSerial) { + if (GetLastUsageSerial() == pendingSerial) { + return {}; + } + // We need to unmap buffer before it can be used in the queue. + UnmapIfNeeded(commandContext); + MarkUsedInPendingCommands(pendingSerial); + return {}; +} + MaybeError Buffer::MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) { DAWN_ASSERT((mode == wgpu::MapMode::Write && IsCPUWritable()) || (mode == wgpu::MapMode::Read && IsCPUReadable())); @@ -556,6 +573,11 @@ wgpu::MapMode mode) { // Needn't map the buffer if this is for a previous mapAsync that was cancelled. if (completedSerial >= mMapReadySerial) { + // Trigger any deferred unmaps. + // TODO(crbug.com/345471009): Consider reuse the mapped pointer and skip mapping again if + // the previous map mode is the same as the current map mode. + UnmapIfNeeded(commandContext); + // Map then initialize data using mapped pointer. // The mapped pointer is always writable because: // - If mode is Write, then it's already writable. @@ -574,9 +596,11 @@ DAWN_ASSERT(IsMappable(GetInternalUsage())); mMapReadySerial = kMaxExecutionSerial; - if (mMappedData && newState != BufferState::Destroyed) { - ToBackend(GetDevice()->GetQueue())->DeferUnmap(this); - } + + // The actual unmap will be deferred until the buffer is used by the queue or we need to map + // again. This avoids the need to lock the CommandContext here just to call D3D11's Unmap + // function, and instead defers the call to a moment where the CommandContext is already + // acquired. } void* Buffer::GetMappedPointerImpl() { @@ -595,6 +619,9 @@ // other threads using the buffer since there are no other live refs. BufferBase::DestroyImpl(reason); + // If buffer is still mapped, unmap it now. If we don't do that, there might be some issues + // on certain drivers such as Intel's. + // TODO(crbug.com/422741977): Consider a way to unmap without acquiring the context lock. if (mMappedData != nullptr) { auto commandContext = ToBackend(GetDevice()->GetQueue()) ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Passive); @@ -693,7 +720,8 @@ // TODO(dawn:1705): use a reusable zero staging buffer to clear the buffer to avoid this CPU to // GPU copy. std::vector<uint8_t> clearData(size, clearValue); - return WriteInternal(commandContext, offset, clearData.data(), size); + return WriteInternal(commandContext, offset, clearData.data(), size, + /*isInitialWrite=*/true); } MaybeError Buffer::ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext) { @@ -717,7 +745,7 @@ // For non-staging buffers, we can use UpdateSubresource to write the data. DAWN_TRY(EnsureDataInitializedAsDestination(commandContext, offset, size)); - return WriteInternal(commandContext, offset, data, size); + return WriteInternal(commandContext, offset, data, size, /*isInitialWrite=*/false); } // static @@ -791,7 +819,7 @@ } uint8_t* Buffer::ScopedMap::GetMappedData() const { - return mBuffer ? mBuffer->mMappedData.get() : nullptr; + return mBuffer ? static_cast<uint8_t*>(mBuffer->mMappedData) : nullptr; } // GPUUsableBuffer::Storage @@ -1059,6 +1087,9 @@ DAWN_ASSERT(commandContext); + // Must not have pending unmap. + DAWN_CHECK(!mMappedData); + if (dstStorage->SupportsCopyDst()) { commandContext->CopyResource(dstStorage->GetD3D11Buffer(), mLastUpdatedStorage->GetD3D11Buffer()); @@ -1440,16 +1471,13 @@ DAWN_TRY_ASSIGN(stagingBuffer, ToBackend(GetDevice())->GetStagingBuffer(commandContext, size)); { auto scopedUseStaging = stagingBuffer->UseInternal(); - DAWN_TRY(ToBackend(stagingBuffer)->WriteInternal(commandContext, 0, data, size)); + DAWN_TRY(ToBackend(stagingBuffer) + ->WriteInternal(commandContext, 0, data, size, + /*isInitialWrite=*/true)); DAWN_TRY(ToBackend(stagingBuffer.Get()) ->CopyToInternal(commandContext, /*sourceOffset=*/0, /*size=*/size, this, offset)); - // WriteInternal() might not call MarkUsedInPendingCommands() if the staging buffer is - // mappable. But we need to mark buffer as being used for CopyToInternal(). - // TODO(crbug.com/345471009): Consider whether it's OK to change CopyToInternal() to - // automatically trigger MarkUsedInPendingCommands(). - stagingBuffer->MarkUsedInPendingCommands(); } ToBackend(GetDevice())->ReturnStagingBuffer(std::move(stagingBuffer)); @@ -1459,26 +1487,33 @@ MaybeError GPUUsableBuffer::WriteInternal(const ScopedCommandRecordingContext* commandContext, uint64_t offset, const void* data, - size_t size) { + size_t size, + bool isInitialWrite) { if (size == 0) { return {}; } - // Map the buffer if it is possible, so WriteInternal() can write the mapped memory + // Map the buffer if it is possible, so WriteInternal() can write to the mapped memory // directly. - if (IsCPUWritable() && - GetLastUsageSerial() <= GetDevice()->GetQueue()->GetCompletedCommandSerial()) { + // TODO(crbug.com/345471009): Consider mapping the buffer for non-clearing writes when + // it's not in use by the GPU. This would avoid allocating additional GPU storage. + // However, checking GetLastUsageSerial() is unreliable here because Queue::Submit() + // may have already updated it before entering this function. In practice, this is + // uncommon for mappable buffers since users typically update them via MapAsync when + // they know the buffer is idle. + if (IsCPUWritable() && isInitialWrite) { ScopedMap scopedMap; DAWN_TRY_ASSIGN(scopedMap, ScopedMap::Create(commandContext, this, wgpu::MapMode::Write)); DAWN_ASSERT(scopedMap.GetMappedData()); memcpy(scopedMap.GetMappedData() + offset, data, size); + return {}; } // Mark the buffer as used in pending commands if the mapping path above wasn't taken. // Mapped writes complete synchronously and don't require tracking. - MarkUsedInPendingCommands(); + DAWN_TRY(TrackUsage(commandContext, GetDevice()->GetQueue()->GetPendingCommandSerial())); // WriteInternal() can be called with GetAllocatedSize(). We treat it as a full buffer write // as well. @@ -1538,6 +1573,8 @@ size_t size, Buffer* destination, uint64_t destinationOffset) { + DAWN_TRY(TrackUsage(commandContext, GetDevice()->GetQueue()->GetPendingCommandSerial())); + ID3D11Buffer* d3d11SourceBuffer = mLastUpdatedStorage->GetD3D11Buffer(); return destination->CopyFromD3DInternal(commandContext, d3d11SourceBuffer, sourceOffset, size, @@ -1549,6 +1586,8 @@ uint64_t sourceOffset, size_t size, uint64_t destinationOffset) { + DAWN_TRY(TrackUsage(commandContext, GetDevice()->GetQueue()->GetPendingCommandSerial())); + D3D11_BOX srcBox; srcBox.left = static_cast<UINT>(sourceOffset); srcBox.top = 0;
diff --git a/src/dawn/native/d3d11/BufferD3D11.h b/src/dawn/native/d3d11/BufferD3D11.h index 8eb240a..583f812 100644 --- a/src/dawn/native/d3d11/BufferD3D11.h +++ b/src/dawn/native/d3d11/BufferD3D11.h
@@ -34,6 +34,7 @@ #include <utility> #include "absl/container/flat_hash_map.h" +#include "dawn/common/Atomic.h" #include "dawn/common/ityp_array.h" #include "dawn/native/Buffer.h" #include "dawn/native/d3d/d3d_platform.h" @@ -93,6 +94,9 @@ void UnmapIfNeeded(const ScopedCommandRecordingContext* commandContext); + MaybeError TrackUsage(const ScopedCommandRecordingContext* commandContext, + ExecutionSerial pendingSerial); + // This performs GPU Clear. Unlike Clear(), this will always be affected by ID3D11Predicate. // Whereas Clear() might be unaffected by ID3D11Predicate if it's pure CPU clear. virtual MaybeError PredicatedClear(const ScopedSwapStateCommandRecordingContext* commandContext, @@ -105,7 +109,8 @@ virtual MaybeError WriteInternal(const ScopedCommandRecordingContext* commandContext, uint64_t bufferOffset, const void* data, - size_t size) = 0; + size_t size, + bool isInitialWrite) = 0; // Copy this buffer to the destination without checking if the buffer is initialized. virtual MaybeError CopyToInternal(const ScopedCommandRecordingContext* commandContext, uint64_t sourceOffset, @@ -172,7 +177,7 @@ virtual MaybeError ClearPaddingInternal(const ScopedCommandRecordingContext* commandContext); - raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr; + Atomic<uint8_t*, std::memory_order::relaxed> mMappedData{nullptr}; private: MaybeError Initialize(bool mappedAtCreation, @@ -257,7 +262,8 @@ MaybeError WriteInternal(const ScopedCommandRecordingContext* commandContext, uint64_t bufferOffset, const void* data, - size_t size) override; + size_t size, + bool isInitialWrite) override; ResultOrError<ComPtr<ID3D11ShaderResourceView>> CreateD3D11ShaderResourceViewFromD3DBuffer( ID3D11Buffer* d3d11Buffer,
diff --git a/src/dawn/native/d3d11/CommandBufferD3D11.cpp b/src/dawn/native/d3d11/CommandBufferD3D11.cpp index d938eb3..8b06047 100644 --- a/src/dawn/native/d3d11/CommandBufferD3D11.cpp +++ b/src/dawn/native/d3d11/CommandBufferD3D11.cpp
@@ -42,6 +42,7 @@ #include "dawn/native/Commands.h" #include "dawn/native/ExternalTexture.h" #include "dawn/native/ImmediateConstantsTracker.h" +#include "dawn/native/Queue.h" #include "dawn/native/RenderBundle.h" #include "dawn/native/d3d/D3DError.h" #include "dawn/native/d3d11/BindGroupTrackerD3D11.h" @@ -262,7 +263,10 @@ } MaybeError CommandBuffer::Execute(const ScopedSwapStateCommandRecordingContext* commandContext) { - auto LazyClearSyncScope = [&commandContext](const SyncScopeResourceUsage& scope) -> MaybeError { + ExecutionSerial pendingSerial = GetDevice()->GetQueue()->GetPendingCommandSerial(); + + auto LazyClearSyncScope = [&commandContext, + pendingSerial](const SyncScopeResourceUsage& scope) -> MaybeError { for (size_t i = 0; i < scope.textures.size(); i++) { Texture* texture = ToBackend(scope.textures[i]); @@ -283,8 +287,8 @@ } for (BufferBase* buffer : scope.buffers) { + DAWN_TRY(ToBackend(buffer)->TrackUsage(commandContext, pendingSerial)); DAWN_TRY(ToBackend(buffer)->EnsureDataInitialized(commandContext)); - buffer->MarkUsedInPendingCommands(); } return {}; @@ -298,14 +302,6 @@ switch (type) { case Command::BeginComputePass: { mCommands.NextCommand<BeginComputePassCmd>(); - for (BufferBase* buffer : - GetResourceUsages().computePasses[nextComputePassNumber].referencedBuffers) { - buffer->MarkUsedInPendingCommands(); - } - for (TextureBase* texture : - GetResourceUsages().computePasses[nextComputePassNumber].referencedTextures) { - DAWN_TRY(ToBackend(texture)->SynchronizeTextureBeforeUse(commandContext)); - } for (const SyncScopeResourceUsage& scope : GetResourceUsages().computePasses[nextComputePassNumber].dispatchUsages) { for (TextureBase* texture : scope.textures) { @@ -352,11 +348,12 @@ Buffer* source = ToBackend(copy->source.Get()); Buffer* destination = ToBackend(copy->destination.Get()); + DAWN_TRY(source->TrackUsage(commandContext, pendingSerial)); + DAWN_TRY(destination->TrackUsage(commandContext, pendingSerial)); + // Buffer::Copy() will ensure the source and destination buffers are initialized. DAWN_TRY(Buffer::Copy(commandContext, source, copy->sourceOffset, copy->size, destination, copy->destinationOffset)); - source->MarkUsedInPendingCommands(); - destination->MarkUsedInPendingCommands(); break; } @@ -371,6 +368,8 @@ auto& dst = copy->destination; Buffer* buffer = ToBackend(src.buffer.Get()); + DAWN_TRY(buffer->TrackUsage(commandContext, pendingSerial)); + uint64_t bufferOffset = src.offset; Ref<BufferBase> stagingBuffer; const TypedTexelBlockInfo& blockInfo = GetBlockInfo(dst); @@ -411,8 +410,6 @@ copy->copySize.ToExtent3D(), data, static_cast<uint32_t>(bytesPerRow), static_cast<uint32_t>(src.rowsPerImage))); - - buffer->MarkUsedInPendingCommands(); break; } @@ -426,13 +423,15 @@ auto& src = copy->source; auto& dst = copy->destination; + Buffer* buffer = ToBackend(dst.buffer.Get()); + DAWN_TRY(buffer->TrackUsage(commandContext, pendingSerial)); + SubresourceRange subresources = GetSubresourcesAffectedByCopy(src, copy->copySize); Texture* texture = ToBackend(src.texture.Get()); DAWN_TRY(texture->SynchronizeTextureBeforeUse(commandContext)); DAWN_TRY( texture->EnsureSubresourceContentInitialized(commandContext, subresources)); - Buffer* buffer = ToBackend(dst.buffer.Get()); Buffer::ScopedMap scopedDstMap; DAWN_TRY_ASSIGN(scopedDstMap, Buffer::ScopedMap::Create(commandContext, buffer, wgpu::MapMode::Write)); @@ -453,8 +452,6 @@ ->Read(commandContext, subresources, src.origin.ToOrigin3D(), copy->copySize.ToExtent3D(), static_cast<uint32_t>(bytesPerRow), static_cast<uint32_t>(dst.rowsPerImage), callback)); - - dst.buffer->MarkUsedInPendingCommands(); break; } @@ -480,8 +477,8 @@ break; } Buffer* buffer = ToBackend(cmd->buffer.Get()); + DAWN_TRY(buffer->TrackUsage(commandContext, pendingSerial)); DAWN_TRY(buffer->Clear(commandContext, 0, cmd->offset, cmd->size)); - buffer->MarkUsedInPendingCommands(); break; } @@ -493,9 +490,9 @@ Buffer* destination = ToBackend(cmd->destination.Get()); uint64_t destinationOffset = cmd->destinationOffset; + DAWN_TRY(destination->TrackUsage(commandContext, pendingSerial)); DAWN_TRY(querySet->Resolve(commandContext, firstQuery, queryCount, destination, destinationOffset)); - destination->MarkUsedInPendingCommands(); break; } @@ -511,9 +508,9 @@ } Buffer* dstBuffer = ToBackend(cmd->buffer.Get()); + DAWN_TRY(dstBuffer->TrackUsage(commandContext, pendingSerial)); uint8_t* data = mCommands.NextData<uint8_t>(cmd->size); DAWN_TRY(dstBuffer->Write(commandContext, cmd->offset, data, cmd->size)); - dstBuffer->MarkUsedInPendingCommands(); break; }
diff --git a/src/dawn/native/d3d11/DeviceD3D11.cpp b/src/dawn/native/d3d11/DeviceD3D11.cpp index 39ef254..8c394f8 100644 --- a/src/dawn/native/d3d11/DeviceD3D11.cpp +++ b/src/dawn/native/d3d11/DeviceD3D11.cpp
@@ -399,6 +399,8 @@ // D3D11 requires that buffers are unmapped before being used in a copy. DAWN_TRY(source->Unmap()); + auto scopedUseBuffer = source->UseInternal(); + auto commandContext = ToBackend(GetQueue())->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal); return Buffer::Copy(&commandContext, ToBackend(source), sourceOffset, size,
diff --git a/src/dawn/native/d3d11/QueueD3D11.cpp b/src/dawn/native/d3d11/QueueD3D11.cpp index 328d438..ba05ca95 100644 --- a/src/dawn/native/d3d11/QueueD3D11.cpp +++ b/src/dawn/native/d3d11/QueueD3D11.cpp
@@ -271,9 +271,7 @@ if (submitMode == SubmitMode::Normal) { mPendingCommandsNeedSubmit.store(true, std::memory_order_release); } - auto context = ScopedCommandRecordingContext(std::move(commands), lockD3D11Scope); - PerformDeferredUnmaps(&context); - return std::move(context); + return ScopedCommandRecordingContext(std::move(commands), lockD3D11Scope); }); } @@ -283,9 +281,7 @@ if (submitMode == SubmitMode::Normal) { mPendingCommandsNeedSubmit.store(true, std::memory_order_release); } - auto context = ScopedSwapStateCommandRecordingContext(std::move(commands)); - PerformDeferredUnmaps(&context); - return std::move(context); + return ScopedSwapStateCommandRecordingContext(std::move(commands)); }); } @@ -356,19 +352,6 @@ mPendingMapBuffers->Enqueue({buffer, mode}, readySerial); } -void Queue::DeferUnmap(Ref<Buffer>&& buffer) { - mPendingUnmapBuffers->push_back(std::move(buffer)); -} - -void Queue::PerformDeferredUnmaps(const ScopedCommandRecordingContext* commandContext) { - mPendingUnmapBuffers.Use([&](auto pendingUnmapBuffers) { - for (auto& buffer : *pendingUnmapBuffers) { - buffer->UnmapIfNeeded(commandContext); - } - pendingUnmapBuffers->clear(); - }); -} - MaybeError Queue::WriteBufferImpl(BufferBase* buffer, uint64_t bufferOffset, const void* data,
diff --git a/src/dawn/native/d3d11/QueueD3D11.h b/src/dawn/native/d3d11/QueueD3D11.h index 504dd81..620d445 100644 --- a/src/dawn/native/d3d11/QueueD3D11.h +++ b/src/dawn/native/d3d11/QueueD3D11.h
@@ -60,10 +60,6 @@ wgpu::MapMode mode, ExecutionSerial readySerial); - // Defer buffer unmap until next time GetScopedPendingCommandContext() or - // GetScopedSwapStatePendingCommandContext() is called. - void DeferUnmap(Ref<Buffer>&& buffer); - const Ref<SharedFence>& GetSharedFence() const { return mSharedFence; } protected: @@ -99,9 +95,6 @@ // Check all pending map buffers, and actually map the ready ones. MaybeError CheckAndMapReadyBuffers(ExecutionSerial completedSerial); - // Perform deferred unmaps on pending buffers. - void PerformDeferredUnmaps(const ScopedCommandRecordingContext* commandContext); - ComPtr<ID3D11Fence> mFence; Ref<SharedFence> mSharedFence; MutexProtected<CommandRecordingContext, CommandRecordingContextGuard> mPendingCommands; @@ -112,7 +105,6 @@ wgpu::MapMode mode; }; MutexProtected<SerialMap<ExecutionSerial, BufferMapEntry>> mPendingMapBuffers; - MutexProtected<std::vector<Ref<Buffer>>> mPendingUnmapBuffers; }; } // namespace dawn::native::d3d11