| // Copyright 2021 The Dawn Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "dawn_native/IndirectDrawValidationEncoder.h" |
| |
| #include "common/Constants.h" |
| #include "common/Math.h" |
| #include "dawn_native/BindGroup.h" |
| #include "dawn_native/BindGroupLayout.h" |
| #include "dawn_native/CommandEncoder.h" |
| #include "dawn_native/ComputePassEncoder.h" |
| #include "dawn_native/ComputePipeline.h" |
| #include "dawn_native/Device.h" |
| #include "dawn_native/InternalPipelineStore.h" |
| #include "dawn_native/Queue.h" |
| |
| #include <cstdlib> |
| #include <limits> |
| |
| namespace dawn_native { |
| |
| namespace { |
| // NOTE: This must match the workgroup_size attribute on the compute entry point below. |
| constexpr uint64_t kWorkgroupSize = 64; |
| |
| // Equivalent to the BatchInfo struct defined in the shader below. |
| struct BatchInfo { |
| uint64_t numIndexBufferElements; |
| uint32_t numDraws; |
| uint32_t padding; |
| }; |
| |
| // TODO(https://crbug.com/dawn/1108): Propagate validation feedback from this shader in |
| // various failure modes. |
| static const char sRenderValidationShaderSource[] = R"( |
| let kNumIndirectParamsPerDrawCall = 5u; |
| |
| let kIndexCountEntry = 0u; |
| let kInstanceCountEntry = 1u; |
| let kFirstIndexEntry = 2u; |
| let kBaseVertexEntry = 3u; |
| let kFirstInstanceEntry = 4u; |
| |
| [[block]] struct BatchInfo { |
| numIndexBufferElementsLow: u32; |
| numIndexBufferElementsHigh: u32; |
| numDraws: u32; |
| padding: u32; |
| indirectOffsets: array<u32>; |
| }; |
| |
| [[block]] struct IndirectParams { |
| data: array<u32>; |
| }; |
| |
| [[group(0), binding(0)]] var<storage, read> batch: BatchInfo; |
| [[group(0), binding(1)]] var<storage, read_write> clientParams: IndirectParams; |
| [[group(0), binding(2)]] var<storage, write> validatedParams: IndirectParams; |
| |
| fn fail(drawIndex: u32) { |
| let index = drawIndex * kNumIndirectParamsPerDrawCall; |
| validatedParams.data[index + kIndexCountEntry] = 0u; |
| validatedParams.data[index + kInstanceCountEntry] = 0u; |
| validatedParams.data[index + kFirstIndexEntry] = 0u; |
| validatedParams.data[index + kBaseVertexEntry] = 0u; |
| validatedParams.data[index + kFirstInstanceEntry] = 0u; |
| } |
| |
| fn pass(drawIndex: u32) { |
| let vIndex = drawIndex * kNumIndirectParamsPerDrawCall; |
| let cIndex = batch.indirectOffsets[drawIndex]; |
| validatedParams.data[vIndex + kIndexCountEntry] = |
| clientParams.data[cIndex + kIndexCountEntry]; |
| validatedParams.data[vIndex + kInstanceCountEntry] = |
| clientParams.data[cIndex + kInstanceCountEntry]; |
| validatedParams.data[vIndex + kFirstIndexEntry] = |
| clientParams.data[cIndex + kFirstIndexEntry]; |
| validatedParams.data[vIndex + kBaseVertexEntry] = |
| clientParams.data[cIndex + kBaseVertexEntry]; |
| validatedParams.data[vIndex + kFirstInstanceEntry] = |
| clientParams.data[cIndex + kFirstInstanceEntry]; |
| } |
| |
| [[stage(compute), workgroup_size(64, 1, 1)]] |
| fn main([[builtin(global_invocation_id)]] id : vec3<u32>) { |
| if (id.x >= batch.numDraws) { |
| return; |
| } |
| |
| let clientIndex = batch.indirectOffsets[id.x]; |
| let firstInstance = clientParams.data[clientIndex + kFirstInstanceEntry]; |
| if (firstInstance != 0u) { |
| fail(id.x); |
| return; |
| } |
| |
| if (batch.numIndexBufferElementsHigh >= 2u) { |
| // firstIndex and indexCount are both u32. The maximum possible sum of these |
| // values is 0x1fffffffe, which is less than 0x200000000. Nothing to validate. |
| pass(id.x); |
| return; |
| } |
| |
| let firstIndex = clientParams.data[clientIndex + kFirstIndexEntry]; |
| if (batch.numIndexBufferElementsHigh == 0u && |
| batch.numIndexBufferElementsLow < firstIndex) { |
| fail(id.x); |
| return; |
| } |
| |
| // Note that this subtraction may underflow, but only when |
| // numIndexBufferElementsHigh is 1u. The result is still correct in that case. |
| let maxIndexCount = batch.numIndexBufferElementsLow - firstIndex; |
| let indexCount = clientParams.data[clientIndex + kIndexCountEntry]; |
| if (indexCount > maxIndexCount) { |
| fail(id.x); |
| return; |
| } |
| pass(id.x); |
| } |
| )"; |
| |
| ResultOrError<ComputePipelineBase*> GetOrCreateRenderValidationPipeline( |
| DeviceBase* device) { |
| InternalPipelineStore* store = device->GetInternalPipelineStore(); |
| |
| if (store->renderValidationPipeline == nullptr) { |
| // Create compute shader module if not cached before. |
| if (store->renderValidationShader == nullptr) { |
| ShaderModuleDescriptor descriptor; |
| ShaderModuleWGSLDescriptor wgslDesc; |
| wgslDesc.source = sRenderValidationShaderSource; |
| descriptor.nextInChain = reinterpret_cast<ChainedStruct*>(&wgslDesc); |
| DAWN_TRY_ASSIGN(store->renderValidationShader, |
| device->CreateShaderModule(&descriptor)); |
| } |
| |
| BindGroupLayoutEntry entries[3]; |
| entries[0].binding = 0; |
| entries[0].visibility = wgpu::ShaderStage::Compute; |
| entries[0].buffer.type = wgpu::BufferBindingType::ReadOnlyStorage; |
| entries[1].binding = 1; |
| entries[1].visibility = wgpu::ShaderStage::Compute; |
| entries[1].buffer.type = kInternalStorageBufferBinding; |
| entries[2].binding = 2; |
| entries[2].visibility = wgpu::ShaderStage::Compute; |
| entries[2].buffer.type = wgpu::BufferBindingType::Storage; |
| |
| BindGroupLayoutDescriptor bindGroupLayoutDescriptor; |
| bindGroupLayoutDescriptor.entryCount = 3; |
| bindGroupLayoutDescriptor.entries = entries; |
| Ref<BindGroupLayoutBase> bindGroupLayout; |
| DAWN_TRY_ASSIGN(bindGroupLayout, |
| device->CreateBindGroupLayout(&bindGroupLayoutDescriptor, true)); |
| |
| PipelineLayoutDescriptor pipelineDescriptor; |
| pipelineDescriptor.bindGroupLayoutCount = 1; |
| pipelineDescriptor.bindGroupLayouts = &bindGroupLayout.Get(); |
| Ref<PipelineLayoutBase> pipelineLayout; |
| DAWN_TRY_ASSIGN(pipelineLayout, device->CreatePipelineLayout(&pipelineDescriptor)); |
| |
| ComputePipelineDescriptor computePipelineDescriptor = {}; |
| computePipelineDescriptor.layout = pipelineLayout.Get(); |
| computePipelineDescriptor.compute.module = store->renderValidationShader.Get(); |
| computePipelineDescriptor.compute.entryPoint = "main"; |
| |
| DAWN_TRY_ASSIGN(store->renderValidationPipeline, |
| device->CreateComputePipeline(&computePipelineDescriptor)); |
| } |
| |
| return store->renderValidationPipeline.Get(); |
| } |
| |
| size_t GetBatchDataSize(uint32_t numDraws) { |
| return sizeof(BatchInfo) + numDraws * sizeof(uint32_t); |
| } |
| |
| } // namespace |
| |
| const uint32_t kBatchDrawCallLimitByDispatchSize = |
| kMaxComputePerDimensionDispatchSize * kWorkgroupSize; |
| const uint32_t kBatchDrawCallLimitByStorageBindingSize = |
| (kMaxStorageBufferBindingSize - sizeof(BatchInfo)) / sizeof(uint32_t); |
| const uint32_t kMaxDrawCallsPerIndirectValidationBatch = |
| std::min(kBatchDrawCallLimitByDispatchSize, kBatchDrawCallLimitByStorageBindingSize); |
| |
| MaybeError EncodeIndirectDrawValidationCommands(DeviceBase* device, |
| CommandEncoder* commandEncoder, |
| RenderPassResourceUsageTracker* usageTracker, |
| IndirectDrawMetadata* indirectDrawMetadata) { |
| struct Batch { |
| const IndirectDrawMetadata::IndexedIndirectValidationBatch* metadata; |
| uint64_t numIndexBufferElements; |
| uint64_t dataBufferOffset; |
| uint64_t dataSize; |
| uint64_t clientIndirectOffset; |
| uint64_t clientIndirectSize; |
| uint64_t validatedParamsOffset; |
| uint64_t validatedParamsSize; |
| BatchInfo* batchInfo; |
| }; |
| |
| struct Pass { |
| BufferBase* clientIndirectBuffer; |
| uint64_t validatedParamsSize = 0; |
| uint64_t batchDataSize = 0; |
| std::unique_ptr<void, void (*)(void*)> batchData{nullptr, std::free}; |
| std::vector<Batch> batches; |
| }; |
| |
| // First stage is grouping all batches into passes. We try to pack as many batches into a |
| // single pass as possible. Batches can be grouped together as long as they're validating |
| // data from the same indirect buffer, but they may still be split into multiple passes if |
| // the number of draw calls in a pass would exceed some (very high) upper bound. |
| uint64_t numTotalDrawCalls = 0; |
| size_t validatedParamsSize = 0; |
| std::vector<Pass> passes; |
| IndirectDrawMetadata::IndexedIndirectBufferValidationInfoMap& bufferInfoMap = |
| *indirectDrawMetadata->GetIndexedIndirectBufferValidationInfo(); |
| if (bufferInfoMap.empty()) { |
| return {}; |
| } |
| |
| for (auto& entry : bufferInfoMap) { |
| const IndirectDrawMetadata::IndexedIndirectConfig& config = entry.first; |
| BufferBase* clientIndirectBuffer = config.first; |
| for (const IndirectDrawMetadata::IndexedIndirectValidationBatch& batch : |
| entry.second.GetBatches()) { |
| const uint64_t minOffsetFromAlignedBoundary = |
| batch.minOffset % kMinStorageBufferOffsetAlignment; |
| const uint64_t minOffsetAlignedDown = |
| batch.minOffset - minOffsetFromAlignedBoundary; |
| |
| Batch newBatch; |
| newBatch.metadata = &batch; |
| newBatch.numIndexBufferElements = config.second; |
| newBatch.dataSize = GetBatchDataSize(batch.draws.size()); |
| newBatch.clientIndirectOffset = minOffsetAlignedDown; |
| newBatch.clientIndirectSize = |
| batch.maxOffset + kDrawIndexedIndirectSize - minOffsetAlignedDown; |
| numTotalDrawCalls += batch.draws.size(); |
| |
| newBatch.validatedParamsSize = batch.draws.size() * kDrawIndexedIndirectSize; |
| newBatch.validatedParamsOffset = |
| Align(validatedParamsSize, kMinStorageBufferOffsetAlignment); |
| validatedParamsSize = newBatch.validatedParamsOffset + newBatch.validatedParamsSize; |
| if (validatedParamsSize > kMaxStorageBufferBindingSize) { |
| return DAWN_INTERNAL_ERROR("Too many drawIndexedIndirect calls to validate"); |
| } |
| |
| Pass* currentPass = passes.empty() ? nullptr : &passes.back(); |
| if (currentPass && currentPass->clientIndirectBuffer == clientIndirectBuffer) { |
| uint64_t nextBatchDataOffset = |
| Align(currentPass->batchDataSize, kMinStorageBufferOffsetAlignment); |
| uint64_t newPassBatchDataSize = nextBatchDataOffset + newBatch.dataSize; |
| if (newPassBatchDataSize <= kMaxStorageBufferBindingSize) { |
| // We can fit this batch in the current pass. |
| newBatch.dataBufferOffset = nextBatchDataOffset; |
| currentPass->batchDataSize = newPassBatchDataSize; |
| currentPass->batches.push_back(newBatch); |
| continue; |
| } |
| } |
| |
| // We need to start a new pass for this batch. |
| newBatch.dataBufferOffset = 0; |
| |
| Pass newPass; |
| newPass.clientIndirectBuffer = clientIndirectBuffer; |
| newPass.batchDataSize = newBatch.dataSize; |
| newPass.batches.push_back(newBatch); |
| passes.push_back(std::move(newPass)); |
| } |
| } |
| |
| auto* const store = device->GetInternalPipelineStore(); |
| ScratchBuffer& validatedParamsBuffer = store->scratchIndirectStorage; |
| ScratchBuffer& batchDataBuffer = store->scratchStorage; |
| |
| uint64_t requiredBatchDataBufferSize = 0; |
| for (const Pass& pass : passes) { |
| requiredBatchDataBufferSize = std::max(requiredBatchDataBufferSize, pass.batchDataSize); |
| } |
| DAWN_TRY(batchDataBuffer.EnsureCapacity(requiredBatchDataBufferSize)); |
| usageTracker->BufferUsedAs(batchDataBuffer.GetBuffer(), wgpu::BufferUsage::Storage); |
| |
| DAWN_TRY(validatedParamsBuffer.EnsureCapacity(validatedParamsSize)); |
| usageTracker->BufferUsedAs(validatedParamsBuffer.GetBuffer(), wgpu::BufferUsage::Indirect); |
| |
| // Now we allocate and populate host-side batch data to be copied to the GPU, and prepare to |
| // update all DrawIndexedIndirectCmd buffer references. |
| std::vector<DeferredBufferLocationUpdate> deferredBufferLocationUpdates; |
| deferredBufferLocationUpdates.reserve(numTotalDrawCalls); |
| for (Pass& pass : passes) { |
| // We use std::malloc here because it guarantees maximal scalar alignment. |
| pass.batchData = {std::malloc(pass.batchDataSize), std::free}; |
| memset(pass.batchData.get(), 0, pass.batchDataSize); |
| uint8_t* batchData = static_cast<uint8_t*>(pass.batchData.get()); |
| for (Batch& batch : pass.batches) { |
| batch.batchInfo = new (&batchData[batch.dataBufferOffset]) BatchInfo(); |
| batch.batchInfo->numIndexBufferElements = batch.numIndexBufferElements; |
| batch.batchInfo->numDraws = static_cast<uint32_t>(batch.metadata->draws.size()); |
| |
| uint32_t* indirectOffsets = reinterpret_cast<uint32_t*>(batch.batchInfo + 1); |
| uint64_t validatedParamsOffset = batch.validatedParamsOffset; |
| for (const auto& draw : batch.metadata->draws) { |
| // The shader uses this to index an array of u32, hence the division by 4 bytes. |
| *indirectOffsets++ = static_cast<uint32_t>( |
| (draw.clientBufferOffset - batch.clientIndirectOffset) / 4); |
| |
| DeferredBufferLocationUpdate deferredUpdate; |
| deferredUpdate.location = draw.bufferLocation; |
| deferredUpdate.buffer = validatedParamsBuffer.GetBuffer(); |
| deferredUpdate.offset = validatedParamsOffset; |
| deferredBufferLocationUpdates.push_back(std::move(deferredUpdate)); |
| |
| validatedParamsOffset += kDrawIndexedIndirectSize; |
| } |
| } |
| } |
| |
| ComputePipelineBase* pipeline; |
| DAWN_TRY_ASSIGN(pipeline, GetOrCreateRenderValidationPipeline(device)); |
| |
| Ref<BindGroupLayoutBase> layout; |
| DAWN_TRY_ASSIGN(layout, pipeline->GetBindGroupLayout(0)); |
| |
| BindGroupEntry bindings[3]; |
| BindGroupEntry& bufferDataBinding = bindings[0]; |
| bufferDataBinding.binding = 0; |
| bufferDataBinding.buffer = batchDataBuffer.GetBuffer(); |
| |
| BindGroupEntry& clientIndirectBinding = bindings[1]; |
| clientIndirectBinding.binding = 1; |
| |
| BindGroupEntry& validatedParamsBinding = bindings[2]; |
| validatedParamsBinding.binding = 2; |
| validatedParamsBinding.buffer = validatedParamsBuffer.GetBuffer(); |
| |
| BindGroupDescriptor bindGroupDescriptor = {}; |
| bindGroupDescriptor.layout = layout.Get(); |
| bindGroupDescriptor.entryCount = 3; |
| bindGroupDescriptor.entries = bindings; |
| |
| // Finally, we can now encode our validation passes. Each pass first does a single |
| // WriteBuffer to get batch data over to the GPU, followed by a single compute pass. The |
| // compute pass encodes a separate SetBindGroup and Dispatch command for each batch. |
| commandEncoder->EncodeSetValidatedBufferLocationsInternal( |
| std::move(deferredBufferLocationUpdates)); |
| for (const Pass& pass : passes) { |
| commandEncoder->APIWriteBuffer(batchDataBuffer.GetBuffer(), 0, |
| static_cast<const uint8_t*>(pass.batchData.get()), |
| pass.batchDataSize); |
| |
| // TODO(dawn:723): change to not use AcquireRef for reentrant object creation. |
| ComputePassDescriptor descriptor = {}; |
| Ref<ComputePassEncoder> passEncoder = |
| AcquireRef(commandEncoder->APIBeginComputePass(&descriptor)); |
| passEncoder->APISetPipeline(pipeline); |
| |
| clientIndirectBinding.buffer = pass.clientIndirectBuffer; |
| |
| for (const Batch& batch : pass.batches) { |
| bufferDataBinding.offset = batch.dataBufferOffset; |
| bufferDataBinding.size = batch.dataSize; |
| clientIndirectBinding.offset = batch.clientIndirectOffset; |
| clientIndirectBinding.size = batch.clientIndirectSize; |
| validatedParamsBinding.offset = batch.validatedParamsOffset; |
| validatedParamsBinding.size = batch.validatedParamsSize; |
| |
| Ref<BindGroupBase> bindGroup; |
| DAWN_TRY_ASSIGN(bindGroup, device->CreateBindGroup(&bindGroupDescriptor)); |
| |
| const uint32_t numDrawsRoundedUp = |
| (batch.batchInfo->numDraws + kWorkgroupSize - 1) / kWorkgroupSize; |
| passEncoder->APISetBindGroup(0, bindGroup.Get()); |
| passEncoder->APIDispatch(numDrawsRoundedUp); |
| } |
| |
| passEncoder->APIEndPass(); |
| } |
| |
| return {}; |
| } |
| |
| } // namespace dawn_native |