d3d11: Emulate upload buffers using system memory
Upload buffers (CopySrc|MapWrite) are kept in system memory so that
copies from them can use D3D11 UpdateSubresource1 rather than
CopySubresourceRegion, which can be slow on Intel Gen12.
Bug: dawn:1732
Bug: chromium:1485789
Change-Id: I3c16ecde10543aaef27a2d36510bed2844b8e3ed
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/168940
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Peng Huang <penghuang@chromium.org>
Commit-Queue: Jie A Chen <jie.a.chen@intel.com>
Reviewed-by: Austin Eng <enga@chromium.org>
diff --git a/src/dawn/native/d3d11/BufferD3D11.cpp b/src/dawn/native/d3d11/BufferD3D11.cpp
index f8b2395..79e9971 100644
--- a/src/dawn/native/d3d11/BufferD3D11.cpp
+++ b/src/dawn/native/d3d11/BufferD3D11.cpp
@@ -32,6 +32,7 @@
#include <utility>
#include <vector>
+#include "dawn/common/Alloc.h"
#include "dawn/common/Assert.h"
#include "dawn/common/Constants.h"
#include "dawn/common/Math.h"
@@ -69,6 +70,10 @@
return usage & kMappableBufferUsages;
}
+bool IsUpload(wgpu::BufferUsage usage) {
+ return usage == (wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
+}
+
D3D11_USAGE D3D11BufferUsage(wgpu::BufferUsage usage) {
if (IsMappable(usage)) {
return D3D11_USAGE_STAGING;
@@ -152,11 +157,41 @@
} // namespace
+// CPU-to-GPU upload buffers (CopySrc|MapWrite) can be emulated in system memory and then
+// written into the destination GPU buffer via ID3D11DeviceContext1::UpdateSubresource1().
+class UploadBuffer final : public Buffer {
+ using Buffer::Buffer;
+ ~UploadBuffer() override;
+
+ MaybeError InitializeInternal() override;
+ MaybeError MapInternal(const ScopedCommandRecordingContext* commandContext) override;
+ void UnmapInternal(const ScopedCommandRecordingContext* commandContext) override;
+
+ MaybeError ClearInternal(const ScopedCommandRecordingContext* commandContext,
+ uint8_t clearValue,
+ uint64_t offset = 0,
+ uint64_t size = 0) override;
+
+ uint8_t* GetUploadData() override;
+
+ std::unique_ptr<uint8_t[]> mUploadData;
+};
+
// static
ResultOrError<Ref<Buffer>> Buffer::Create(Device* device,
const UnpackedPtr<BufferDescriptor>& descriptor,
- const ScopedCommandRecordingContext* commandContext) {
- Ref<Buffer> buffer = AcquireRef(new Buffer(device, descriptor));
+ const ScopedCommandRecordingContext* commandContext,
+ bool allowUploadBufferEmulation) {
+ bool useUploadBuffer = allowUploadBufferEmulation;
+ useUploadBuffer &= IsUpload(descriptor->usage);
+ constexpr uint64_t kMaxUploadBufferSize = 4 * 1024 * 1024;
+ useUploadBuffer &= descriptor->size <= kMaxUploadBufferSize;
+ Ref<Buffer> buffer;
+ if (useUploadBuffer) {
+ buffer = AcquireRef(new UploadBuffer(device, descriptor));
+ } else {
+ buffer = AcquireRef(new Buffer(device, descriptor));
+ }
DAWN_TRY(buffer->Initialize(descriptor->mappedAtCreation, commandContext));
return buffer;
}
@@ -180,6 +215,45 @@
}
mAllocatedSize = Align(size, alignment);
+ DAWN_TRY(InitializeInternal());
+
+ SetLabelImpl();
+
+ if (!mappedAtCreation) {
+ if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
+ if (commandContext) {
+ DAWN_TRY(ClearInternal(commandContext, 1u));
+ } else {
+ auto tmpCommandContext =
+ ToBackend(GetDevice()->GetQueue())
+ ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
+ DAWN_TRY(ClearInternal(&tmpCommandContext, 1u));
+ }
+ }
+
+ // Initialize the padding bytes to zero.
+ if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
+ uint32_t paddingBytes = GetAllocatedSize() - GetSize();
+ if (paddingBytes > 0) {
+ uint32_t clearSize = paddingBytes;
+ uint64_t clearOffset = GetSize();
+ if (commandContext) {
+ DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));
+
+ } else {
+ auto tmpCommandContext =
+ ToBackend(GetDevice()->GetQueue())
+ ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
+ DAWN_TRY(ClearInternal(&tmpCommandContext, 0, clearOffset, clearSize));
+ }
+ }
+ }
+ }
+
+ return {};
+}
+
+MaybeError Buffer::InitializeInternal() {
bool needsConstantBuffer = GetUsage() & wgpu::BufferUsage::Uniform;
bool onlyNeedsConstantBuffer =
needsConstantBuffer && IsSubset(GetUsage(), kD3D11AllowedUniformBufferUsages);
@@ -221,39 +295,6 @@
DAWN_ASSERT(mD3d11NonConstantBuffer || mD3d11ConstantBuffer);
- SetLabelImpl();
-
- if (!mappedAtCreation) {
- if (GetDevice()->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting)) {
- if (commandContext) {
- DAWN_TRY(ClearInternal(commandContext, 1u));
- } else {
- auto tmpCommandContext =
- ToBackend(GetDevice()->GetQueue())
- ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
- DAWN_TRY(ClearInternal(&tmpCommandContext, 1u));
- }
- }
-
- // Initialize the padding bytes to zero.
- if (GetDevice()->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse)) {
- uint32_t paddingBytes = GetAllocatedSize() - GetSize();
- if (paddingBytes > 0) {
- uint32_t clearSize = paddingBytes;
- uint64_t clearOffset = GetSize();
- if (commandContext) {
- DAWN_TRY(ClearInternal(commandContext, 0, clearOffset, clearSize));
-
- } else {
- auto tmpCommandContext =
- ToBackend(GetDevice()->GetQueue())
- ->GetScopedPendingCommandContext(QueueBase::SubmitMode::Normal);
- DAWN_TRY(ClearInternal(&tmpCommandContext, 0, clearOffset, clearSize));
- }
- }
- }
- }
-
return {};
}
@@ -295,7 +336,7 @@
}
MaybeError Buffer::MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) {
- DAWN_ASSERT(mD3d11NonConstantBuffer);
+ DAWN_ASSERT(mD3d11NonConstantBuffer || GetUploadData());
mMapReadySerial = mLastUsageSerial;
const ExecutionSerial completedSerial = GetDevice()->GetQueue()->GetCompletedCommandSerial();
@@ -327,7 +368,7 @@
}
void Buffer::UnmapImpl() {
- DAWN_ASSERT(mD3d11NonConstantBuffer);
+ DAWN_ASSERT(mD3d11NonConstantBuffer || GetUploadData());
mMapReadySerial = kMaxExecutionSerial;
if (mMappedData) {
auto commandContext = ToBackend(GetDevice()->GetQueue())
@@ -662,6 +703,15 @@
size_t size,
Buffer* destination,
uint64_t destinationOffset) {
+ // Upload buffers shouldn't be copied to.
+ DAWN_ASSERT(!destination->GetUploadData());
+ // Use UpdateSubresource1() if the source is an upload buffer.
+ if (source->GetUploadData()) {
+ DAWN_TRY(destination->WriteInternal(commandContext, destinationOffset,
+ source->GetUploadData() + sourceOffset, size));
+ return {};
+ }
+
D3D11_BOX srcBox;
srcBox.left = static_cast<UINT>(sourceOffset);
srcBox.top = 0;
@@ -699,6 +749,10 @@
return {};
}
+uint8_t* Buffer::GetUploadData() {
+ return nullptr;
+}
+
ResultOrError<Buffer::ScopedMap> Buffer::ScopedMap::Create(
const ScopedCommandRecordingContext* commandContext,
Buffer* buffer) {
@@ -752,4 +806,40 @@
return mBuffer ? mBuffer->mMappedData.get() : nullptr;
}
+UploadBuffer::~UploadBuffer() = default;
+
+MaybeError UploadBuffer::InitializeInternal() {
+ mUploadData = std::unique_ptr<uint8_t[]>(AllocNoThrow<uint8_t>(GetAllocatedSize()));
+ if (mUploadData == nullptr) {
+ return DAWN_OUT_OF_MEMORY_ERROR("Failed to allocate memory for buffer uploading.");
+ }
+ return {};
+}
+
+uint8_t* UploadBuffer::GetUploadData() {
+ return mUploadData.get();
+}
+
+MaybeError UploadBuffer::MapInternal(const ScopedCommandRecordingContext* commandContext) {
+ mMappedData = mUploadData.get();
+ return {};
+}
+
+void UploadBuffer::UnmapInternal(const ScopedCommandRecordingContext* commandContext) {
+ mMappedData = nullptr;
+}
+
+MaybeError UploadBuffer::ClearInternal(const ScopedCommandRecordingContext* commandContext,
+ uint8_t clearValue,
+ uint64_t offset,
+ uint64_t size) {
+ if (size == 0) {
+ DAWN_ASSERT(offset == 0);
+ size = GetAllocatedSize();
+ }
+
+ memset(mUploadData.get() + offset, clearValue, size);
+ return {};
+}
+
} // namespace dawn::native::d3d11
diff --git a/src/dawn/native/d3d11/BufferD3D11.h b/src/dawn/native/d3d11/BufferD3D11.h
index 7c185fb..288de0a 100644
--- a/src/dawn/native/d3d11/BufferD3D11.h
+++ b/src/dawn/native/d3d11/BufferD3D11.h
@@ -40,11 +40,12 @@
class Device;
class ScopedCommandRecordingContext;
-class Buffer final : public BufferBase {
+class Buffer : public BufferBase {
public:
static ResultOrError<Ref<Buffer>> Create(Device* device,
const UnpackedPtr<BufferDescriptor>& descriptor,
- const ScopedCommandRecordingContext* commandContext);
+ const ScopedCommandRecordingContext* commandContext,
+ bool allowUploadBufferEmulation = true);
MaybeError EnsureDataInitialized(const ScopedCommandRecordingContext* commandContext);
MaybeError EnsureDataInitializedAsDestination(
@@ -119,11 +120,27 @@
bool mNeedsUnmap = false;
};
- private:
+ protected:
using BufferBase::BufferBase;
~Buffer() override;
+ virtual MaybeError InitializeInternal();
+
+ virtual MaybeError MapInternal(const ScopedCommandRecordingContext* commandContext);
+ virtual void UnmapInternal(const ScopedCommandRecordingContext* commandContext);
+
+ // Clear the buffer without checking if the buffer is initialized.
+ virtual MaybeError ClearInternal(const ScopedCommandRecordingContext* commandContext,
+ uint8_t clearValue,
+ uint64_t offset = 0,
+ uint64_t size = 0);
+
+ virtual uint8_t* GetUploadData();
+
+ raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr;
+
+ private:
MaybeError Initialize(bool mappedAtCreation,
const ScopedCommandRecordingContext* commandContext);
MaybeError MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) override;
@@ -133,15 +150,7 @@
MaybeError MapAtCreationImpl() override;
void* GetMappedPointer() override;
- MaybeError MapInternal(const ScopedCommandRecordingContext* commandContext);
- void UnmapInternal(const ScopedCommandRecordingContext* commandContext);
-
MaybeError InitializeToZero(const ScopedCommandRecordingContext* commandContext);
- // Clear the buffer without checking if the buffer is initialized.
- MaybeError ClearInternal(const ScopedCommandRecordingContext* commandContext,
- uint8_t clearValue,
- uint64_t offset = 0,
- uint64_t size = 0);
// Write the buffer without checking if the buffer is initialized.
MaybeError WriteInternal(const ScopedCommandRecordingContext* commandContext,
uint64_t bufferOffset,
@@ -159,7 +168,6 @@
// The buffer object for non-constant buffer usages(e.g. storage buffer, vertex buffer, etc.)
ComPtr<ID3D11Buffer> mD3d11NonConstantBuffer;
bool mConstantBufferIsUpdated = true;
- raw_ptr<uint8_t, AllowPtrArithmetic> mMappedData = nullptr;
ExecutionSerial mMapReadySerial = kMaxExecutionSerial;
};
diff --git a/src/dawn/native/d3d11/DeviceD3D11.cpp b/src/dawn/native/d3d11/DeviceD3D11.cpp
index e194452..dc5b66d 100644
--- a/src/dawn/native/d3d11/DeviceD3D11.cpp
+++ b/src/dawn/native/d3d11/DeviceD3D11.cpp
@@ -560,7 +560,8 @@
Ref<BufferBase> buffer;
// We don't cache the buffer if it's too large.
if (bufferSize > kMaxStagingBufferSize) {
- DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext));
+ DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext,
+ /*allowUploadBufferEmulation=*/false));
return buffer;
}
@@ -580,7 +581,8 @@
}
// Create a new staging buffer as no existing one can be re-used.
- DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext));
+ DAWN_TRY_ASSIGN(buffer, Buffer::Create(this, Unpack(&descriptor), commandContext,
+ /*allowUploadBufferEmulation=*/false));
mTotalStagingBufferSize += bufferSize;
// Purge the old staging buffers if the total size is too large.
diff --git a/src/dawn/tests/end2end/BufferTests.cpp b/src/dawn/tests/end2end/BufferTests.cpp
index f7da0de..6a5a7ac 100644
--- a/src/dawn/tests/end2end/BufferTests.cpp
+++ b/src/dawn/tests/end2end/BufferTests.cpp
@@ -120,6 +120,13 @@
descriptor.usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc;
return device.CreateBuffer(&descriptor);
}
+
+ wgpu::Buffer CreateUniformBuffer(uint64_t size) {
+ wgpu::BufferDescriptor descriptor;
+ descriptor.size = size;
+ descriptor.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
+ return device.CreateBuffer(&descriptor);
+ }
};
void CheckMapping(const void* actual, const void* expected, size_t size) {
@@ -716,9 +723,10 @@
constexpr int kRepeatCount = 50;
constexpr int kBufferSize = 1024 * 1024 * 10;
wgpu::Buffer tempWriteBuffer = CreateMapWriteBuffer(kBufferSize);
- wgpu::Buffer tempReadBuffer = CreateMapReadBuffer(kBufferSize);
+ wgpu::Buffer tempReadBuffer = CreateUniformBuffer(kBufferSize);
for (int i = 0; i < kRepeatCount; ++i) {
- encoder.CopyBufferToBuffer(tempWriteBuffer, 0, tempReadBuffer, 0, kBufferSize);
+ encoder.CopyBufferToBuffer(tempWriteBuffer, 0, tempReadBuffer, 0,
+ kBufferSize - 1024);
}
}