blob: 44dbb16dcc2f9ec4a886e462296518965e3aa3a2 [file] [log] [blame] [edit]
// Copyright 2023 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dawn/native/BlitTextureToBuffer.h"
#include <string_view>
#include <utility>
#include "dawn/common/Assert.h"
#include "dawn/native/BindGroup.h"
#include "dawn/native/CommandBuffer.h"
#include "dawn/native/CommandEncoder.h"
#include "dawn/native/ComputePassEncoder.h"
#include "dawn/native/ComputePipeline.h"
#include "dawn/native/Device.h"
#include "dawn/native/InternalPipelineStore.h"
#include "dawn/native/Queue.h"
#include "dawn/native/utils/WGPUHelpers.h"
namespace dawn::native {
namespace {
constexpr uint32_t kWorkgroupSizeX = 8;
constexpr uint32_t kWorkgroupSizeY = 8;
// Helper to join constexpr std::string_view
template <std::string_view const&... Strs>
struct ConcatStringViewsImpl {
// Join all strings into a single std::array of chars
static constexpr auto impl() noexcept {
constexpr std::size_t len = (Strs.size() + ... + 0);
std::array<char, len + 1> a{};
auto append = [i = 0, &a](auto const& s) mutable {
for (auto c : s) {
a[i++] = c;
}
};
(append(Strs), ...);
a[len] = 0;
return a;
}
// Give the joined string static storage
static constexpr auto arr = impl();
// View as a std::string_view
static constexpr std::string_view value{arr.data(), arr.size() - 1};
};
// Helper to get the value out
template <std::string_view const&... Strs>
static constexpr auto ConcatStringViews = ConcatStringViewsImpl<Strs...>::value;
constexpr std::string_view kSnormTexture = R"(
fn textureLoadGeneral(src_tex: texture_2d_array<f32>, coords: vec3u, level: u32) -> vec4<f32> {
return textureLoad(src_tex, coords.xy, coords.z, level);
}
@group(0) @binding(0) var src_tex : texture_2d_array<f32>;
@group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>;
)";
constexpr std::string_view kSnormTexture3D = R"(
fn textureLoadGeneral(src_tex: texture_3d<f32>, coords: vec3u, level: u32) -> vec4<f32> {
return textureLoad(src_tex, coords, level);
}
@group(0) @binding(0) var src_tex : texture_3d<f32>;
@group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>;
)";
constexpr std::string_view kStencilTexture = R"(
@group(0) @binding(0) var src_tex : texture_2d_array<u32>;
@group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>;
)";
constexpr std::string_view kDepthTexture = R"(
@group(0) @binding(0) var src_tex : texture_depth_2d_array;
@group(0) @binding(1) var<storage, read_write> dst_buf : array<u32>;
)";
constexpr std::string_view kDepth32FloatTexture = R"(
@group(0) @binding(0) var src_tex : texture_depth_2d_array;
// Can directly use f32 for the buffer array data type
@group(0) @binding(1) var<storage, read_write> dst_buf : array<f32>;
)";
constexpr std::string_view kCommon = R"(
struct Params {
// copyExtent
srcOrigin: vec3u,
// How many texel values one thread needs to pack (1, 2, or 4)
packTexelCount: u32,
srcExtent: vec3u,
pad1: u32,
// GPUImageDataLayout
indicesPerRow: u32,
rowsPerImage: u32,
indicesOffset: u32,
};
@group(0) @binding(2) var<uniform> params : Params;
override workgroupSizeX: u32;
override workgroupSizeY: u32;
// Load the texel value and write to storage buffer.
// Each thread is responsible for reading (packTexelCount) byte and packing them into a 4-byte u32.
@compute @workgroup_size(workgroupSizeX, workgroupSizeY, 1) fn main
(@builtin(global_invocation_id) id : vec3u) {
let srcBoundary = params.srcOrigin + params.srcExtent;
let coord0 = vec3u(id.x * params.packTexelCount, id.y, id.z) + params.srcOrigin;
if (any(coord0 >= srcBoundary)) {
return;
}
let dstOffset = params.indicesOffset + id.x + id.y * params.indicesPerRow + id.z * params.indicesPerRow * params.rowsPerImage;
)";
constexpr std::string_view kCommonEnd = R"(
dst_buf[dstOffset] = result;
}
)";
constexpr std::string_view kPackStencil8ToU32 = R"(
// Storing stencil8 texel values
var result: u32 = 0xff & textureLoad(src_tex, coord0.xy, coord0.z, 0).r;
if (coord0.x + 4u <= srcBoundary.x) {
// All 4 texels for this thread are within texture bounds.
for (var i = 1u; i < 4u; i += 1u) {
let coordi = coord0 + vec3u(i, 0, 0);
let ri = 0xff & textureLoad(src_tex, coordi.xy, coordi.z, 0).r;
result |= ri << (i * 8u);
}
} else {
// Otherwise, srcExtent.x is not a multiple of 4 and this thread is at right edge of the texture
// To preserve the original buffer content, we need to read from the buffer and pack it together with other values.
let original: u32 = dst_buf[dstOffset];
result |= original & 0xffffff00;
for (var i = 1u; i < 4u; i += 1u) {
let coordi = coord0 + vec3u(i, 0, 0);
if (coordi.x >= srcBoundary.x) {
break;
}
let ri = 0xff & textureLoad(src_tex, coordi.xy, coordi.z, 0).r;
result |= ri << (i * 8u);
}
}
)";
constexpr std::string_view kPackR8SnormToU32 = R"(
// Result bits to store into dst_buf
var result: u32 = 0u;
// Storing snorm8 texel values
// later called by pack4x8snorm to convert to u32.
var v: vec4<f32>;
v[0] = textureLoadGeneral(src_tex, coord0, 0).r;
if (coord0.x + 4u <= srcBoundary.x) {
// All 4 texels for this thread are within texture bounds.
for (var i = 1u; i < 4u; i += 1u) {
let coordi = coord0 + vec3u(i, 0, 0);
v[i] = textureLoadGeneral(src_tex, coordi, 0).r;
}
result = pack4x8snorm(v);
} else {
// Otherwise, srcExtent.x is not a multiple of 4 and this thread is at right edge of the texture
// To preserve the original buffer content, we need to read from the buffer and pack it together with other values.
let original: u32 = dst_buf[dstOffset];
var i = 1u;
for (; i < 4u; i += 1u) {
let coordi = coord0 + vec3u(i, 0, 0);
if (coordi.x >= srcBoundary.x) {
break;
}
v[i] = textureLoadGeneral(src_tex, coordi, 0).r;
}
let mask: u32 = 0xffffffffu << (i * 8u);
result = (original & mask) | (pack4x8snorm(v) & ~mask);
}
)";
constexpr std::string_view kPackRG8SnormToU32 = R"(
// Result bits to store into dst_buf
var result: u32 = 0u;
// Storing snorm8 texel values
// later called by pack4x8snorm to convert to u32.
var v: vec4<f32>;
let texel0 = textureLoadGeneral(src_tex, coord0, 0).rg;
v[0] = texel0.r;
v[1] = texel0.g;
let coord1 = coord0 + vec3u(1, 0, 0);
if (coord1.x < srcBoundary.x) {
// Make sure coord1 is still within the copy boundary.
let texel1 = textureLoadGeneral(src_tex, coord1, 0).rg;
v[2] = texel1.r;
v[3] = texel1.g;
result = pack4x8snorm(v);
} else {
// Otherwise, srcExtent.x is not a multiple of 2 and this thread is at right edge of the texture
// To preserve the original buffer content, we need to read from the buffer and pack it together with other values.
let original: u32 = dst_buf[dstOffset];
let mask = 0xffff0000u;
result = (original & mask) | (pack4x8snorm(v) & ~mask);
}
)";
// ShaderF16 extension is only enabled by GL_AMD_gpu_shader_half_float for GL
// so we should not use it generally for the emulation.
// As a result we are using f32 and array<u32> to do all the math and byte manipulation.
// If we have 2-byte scalar type (f16, u16) it can be a bit easier when writing to the storage
// buffer.
constexpr std::string_view kPackDepth16UnormToU32 = R"(
// Result bits to store into dst_buf
var result: u32 = 0u;
// Storing depth16unorm texel values
// later called by pack2x16unorm to convert to u32.
var v: vec2<f32>;
v[0] = textureLoad(src_tex, coord0.xy, coord0.z, 0);
let coord1 = coord0 + vec3u(1, 0, 0);
if (coord1.x < srcBoundary.x) {
// Make sure coord1 is still within the copy boundary.
v[1] = textureLoad(src_tex, coord1.xy, coord1.z, 0);
result = pack2x16unorm(v);
} else {
// Otherwise, srcExtent.x is not a multiple of 2 and this thread is at right edge of the texture
// To preserve the original buffer content, we need to read from the buffer and pack it together with other values.
// TODO(dawn:1782): profiling against making a separate pass for this edge case
// as it requires reading from dst_buf.
let original: u32 = dst_buf[dstOffset];
let mask = 0xffff0000u;
result = (original & mask) | (pack2x16unorm(v) & ~mask);
}
)";
constexpr std::string_view kPackRGBA8SnormToU32 = R"(
// Storing snorm8 texel values
// later called by pack4x8snorm to convert to u32.
var v: vec4<f32>;
let texel0 = textureLoadGeneral(src_tex, coord0, 0);
v[0] = texel0.r;
v[1] = texel0.g;
v[2] = texel0.b;
v[3] = texel0.a;
let result: u32 = pack4x8snorm(v);
)";
constexpr std::string_view kLoadDepth32Float = R"(
dst_buf[dstOffset] = textureLoad(src_tex, coord0.xy, coord0.z, 0);
}
)";
constexpr std::string_view kBlitR8Snorm =
ConcatStringViews<kSnormTexture, kCommon, kPackR8SnormToU32, kCommonEnd>;
constexpr std::string_view kBlitRG8Snorm =
ConcatStringViews<kSnormTexture, kCommon, kPackRG8SnormToU32, kCommonEnd>;
constexpr std::string_view kBlitRGBA8Snorm =
ConcatStringViews<kSnormTexture, kCommon, kPackRGBA8SnormToU32, kCommonEnd>;
constexpr std::string_view kBlitR8Snorm3D =
ConcatStringViews<kSnormTexture3D, kCommon, kPackR8SnormToU32, kCommonEnd>;
constexpr std::string_view kBlitRG8Snorm3D =
ConcatStringViews<kSnormTexture3D, kCommon, kPackRG8SnormToU32, kCommonEnd>;
constexpr std::string_view kBlitRGBA8Snorm3D =
ConcatStringViews<kSnormTexture3D, kCommon, kPackRGBA8SnormToU32, kCommonEnd>;
constexpr std::string_view kBlitStencil8 =
ConcatStringViews<kStencilTexture, kCommon, kPackStencil8ToU32, kCommonEnd>;
constexpr std::string_view kBlitDepth16Unorm =
ConcatStringViews<kDepthTexture, kCommon, kPackDepth16UnormToU32, kCommonEnd>;
constexpr std::string_view kBlitDepth32Float =
ConcatStringViews<kDepth32FloatTexture, kCommon, kLoadDepth32Float>;
ResultOrError<Ref<ComputePipelineBase>> GetOrCreateTextureToBufferPipeline(DeviceBase* device,
const TextureCopy& src) {
InternalPipelineStore* store = device->GetInternalPipelineStore();
const Format& format = src.texture->GetFormat();
wgpu::TextureDimension dimension = src.texture->GetDimension();
bool is3D = dimension == wgpu::TextureDimension::e3D;
auto iter = store->blitTextureToBufferComputePipelines.find({format.format, dimension});
if (iter != store->blitTextureToBufferComputePipelines.end()) {
return iter->second;
}
ShaderModuleWGSLDescriptor wgslDesc = {};
ShaderModuleDescriptor shaderModuleDesc = {};
shaderModuleDesc.nextInChain = &wgslDesc;
wgpu::TextureSampleType textureSampleType;
switch (format.format) {
case wgpu::TextureFormat::R8Snorm:
wgslDesc.code = is3D ? kBlitR8Snorm3D.data() : kBlitR8Snorm.data();
textureSampleType = wgpu::TextureSampleType::Float;
break;
case wgpu::TextureFormat::RG8Snorm:
wgslDesc.code = is3D ? kBlitRG8Snorm3D.data() : kBlitRG8Snorm.data();
textureSampleType = wgpu::TextureSampleType::Float;
break;
case wgpu::TextureFormat::RGBA8Snorm:
wgslDesc.code = is3D ? kBlitRGBA8Snorm3D.data() : kBlitRGBA8Snorm.data();
textureSampleType = wgpu::TextureSampleType::Float;
break;
case wgpu::TextureFormat::Depth16Unorm:
DAWN_ASSERT(!is3D);
wgslDesc.code = kBlitDepth16Unorm.data();
textureSampleType = wgpu::TextureSampleType::Depth;
break;
case wgpu::TextureFormat::Depth32Float:
DAWN_ASSERT(!is3D);
wgslDesc.code = kBlitDepth32Float.data();
textureSampleType = wgpu::TextureSampleType::Depth;
break;
case wgpu::TextureFormat::Stencil8:
case wgpu::TextureFormat::Depth24PlusStencil8:
// Depth24PlusStencil8 can only copy with stencil aspect and is gated by validation.
DAWN_ASSERT(!is3D);
wgslDesc.code = kBlitStencil8.data();
textureSampleType = wgpu::TextureSampleType::Uint;
break;
case wgpu::TextureFormat::Depth32FloatStencil8:
// Depth32FloatStencil8 is not supported on OpenGL/OpenGLES where the blit path is
// enabled by default. But could be hit if the blit path toggle is manually set on other
// backends.
DAWN_ASSERT(!is3D);
switch (src.aspect) {
case Aspect::Depth:
wgslDesc.code = kBlitDepth32Float.data();
textureSampleType = wgpu::TextureSampleType::Depth;
break;
case Aspect::Stencil:
wgslDesc.code = kBlitStencil8.data();
textureSampleType = wgpu::TextureSampleType::Uint;
break;
default:
UNREACHABLE();
}
break;
default:
UNREACHABLE();
}
Ref<ShaderModuleBase> shaderModule;
DAWN_TRY_ASSIGN(shaderModule, device->CreateShaderModule(&shaderModuleDesc));
Ref<BindGroupLayoutBase> bindGroupLayout;
DAWN_TRY_ASSIGN(
bindGroupLayout,
utils::MakeBindGroupLayout(
device,
{
{0, wgpu::ShaderStage::Compute, textureSampleType,
is3D ? wgpu::TextureViewDimension::e3D : wgpu::TextureViewDimension::e2DArray},
{1, wgpu::ShaderStage::Compute, kInternalStorageBufferBinding},
{2, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Uniform},
},
/* allowInternalBinding */ true));
Ref<PipelineLayoutBase> pipelineLayout;
DAWN_TRY_ASSIGN(pipelineLayout, utils::MakeBasicPipelineLayout(device, bindGroupLayout));
ComputePipelineDescriptor computePipelineDescriptor = {};
computePipelineDescriptor.layout = pipelineLayout.Get();
computePipelineDescriptor.compute.module = shaderModule.Get();
computePipelineDescriptor.compute.entryPoint = "main";
constexpr std::array<ConstantEntry, 2> constants = {{
{nullptr, "workgroupSizeX", kWorkgroupSizeX},
{nullptr, "workgroupSizeY", kWorkgroupSizeY},
}};
computePipelineDescriptor.compute.constantCount = constants.size();
computePipelineDescriptor.compute.constants = constants.data();
Ref<ComputePipelineBase> pipeline;
DAWN_TRY_ASSIGN(pipeline, device->CreateComputePipeline(&computePipelineDescriptor));
store->blitTextureToBufferComputePipelines.insert(
{std::make_pair(format.format, dimension), pipeline});
return pipeline;
}
} // anonymous namespace
MaybeError BlitTextureToBuffer(DeviceBase* device,
CommandEncoder* commandEncoder,
const TextureCopy& src,
const BufferCopy& dst,
const Extent3D& copyExtent) {
Ref<ComputePipelineBase> pipeline;
DAWN_TRY_ASSIGN(pipeline, GetOrCreateTextureToBufferPipeline(device, src));
const Format& format = src.texture->GetFormat();
wgpu::TextureDimension dimension = src.texture->GetDimension();
// TODO(dawn:1781): Implement Snorm copy for 1D texture.
DAWN_INVALID_IF(dimension == wgpu::TextureDimension::e1D, "Unsupported texture dimension %s.",
dimension);
bool is3D = dimension == wgpu::TextureDimension::e3D;
uint32_t texelFormatByteSize = format.GetAspectInfo(src.aspect).block.byteSize;
uint32_t workgroupCountX = 1;
uint32_t workgroupCountY = (copyExtent.height + kWorkgroupSizeY - 1) / kWorkgroupSizeY;
uint32_t workgroupCountZ = copyExtent.depthOrArrayLayers;
switch (texelFormatByteSize) {
case 1:
// One thread is responsible for writing four texel values (x, y) ~ (x+3, y).
workgroupCountX = (copyExtent.width + 4 * kWorkgroupSizeX - 1) / (4 * kWorkgroupSizeX);
break;
case 2:
// One thread is responsible for writing two texel values (x, y) and (x+1, y).
workgroupCountX = (copyExtent.width + 2 * kWorkgroupSizeX - 1) / (2 * kWorkgroupSizeX);
break;
case 4:
workgroupCountX = (copyExtent.width + kWorkgroupSizeX - 1) / kWorkgroupSizeX;
break;
default:
UNREACHABLE();
}
Ref<BufferBase> destinationBuffer = dst.buffer;
bool useIntermediateCopyBuffer = false;
if (texelFormatByteSize < 4 && dst.buffer->GetSize() % 4 != 0 &&
copyExtent.width % (4 / texelFormatByteSize) != 0) {
// This path is made for OpenGL/GLES bliting a texture with an width % (4 / texelByteSize)
// != 0, to a compact buffer. When we copy the last texel, we inevitably need to access an
// out of bounds location given by dst.buffer.size as we use array<u32> in the shader for
// the storage buffer. Although the allocated size of dst.buffer is aligned to 4 bytes for
// OpenGL/GLES backend, the size of the storage buffer binding for the shader is not. Thus
// we make an intermediate buffer aligned to 4 bytes for the compute shader to safely
// access, and perform an additional buffer to buffer copy at the end. This path should be
// hit rarely.
useIntermediateCopyBuffer = true;
BufferDescriptor descriptor = {};
descriptor.size = Align(dst.buffer->GetSize(), 4);
// TODO(dawn:1485): adding CopyDst usage to add kInternalStorageBuffer usage internally.
descriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
DAWN_TRY_ASSIGN(destinationBuffer, device->CreateBuffer(&descriptor));
}
// Allow internal usages since we need to use the source as a texture binding
// and buffer as a storage binding.
auto scope = commandEncoder->MakeInternalUsageScope();
Ref<BindGroupLayoutBase> bindGroupLayout;
DAWN_TRY_ASSIGN(bindGroupLayout, pipeline->GetBindGroupLayout(0));
Ref<BufferBase> uniformBuffer;
{
BufferDescriptor bufferDesc = {};
// Uniform buffer size needs to be multiple of 16 bytes
bufferDesc.size = sizeof(uint32_t) * 12;
bufferDesc.usage = wgpu::BufferUsage::Uniform;
bufferDesc.mappedAtCreation = true;
DAWN_TRY_ASSIGN(uniformBuffer, device->CreateBuffer(&bufferDesc));
uint32_t* params =
static_cast<uint32_t*>(uniformBuffer->GetMappedRange(0, bufferDesc.size));
// srcOrigin: vec3u
params[0] = src.origin.x;
params[1] = src.origin.y;
if (is3D) {
params[2] = src.origin.z;
} else {
// src.origin.z is set at textureView.baseArrayLayer
params[2] = 0;
}
// packTexelCount: number of texel values (1, 2, or 4) one thread packs into the dst buffer
params[3] = 4 / texelFormatByteSize;
// srcExtent: vec3u
params[4] = copyExtent.width;
params[5] = copyExtent.height;
params[6] = copyExtent.depthOrArrayLayers;
params[7] = src.mipLevel;
// Turn bytesPerRow, (bytes)offset to use array index as unit
// We pack values into array<u32> or array<f32>
params[8] = dst.bytesPerRow / 4;
params[9] = dst.rowsPerImage;
params[10] = dst.offset / 4;
DAWN_TRY(uniformBuffer->Unmap());
}
TextureViewDescriptor viewDesc = {};
switch (src.aspect) {
case Aspect::Color:
viewDesc.aspect = wgpu::TextureAspect::All;
break;
case Aspect::Depth:
viewDesc.aspect = wgpu::TextureAspect::DepthOnly;
break;
case Aspect::Stencil:
viewDesc.aspect = wgpu::TextureAspect::StencilOnly;
break;
default:
UNREACHABLE();
}
viewDesc.dimension =
is3D ? wgpu::TextureViewDimension::e3D : wgpu::TextureViewDimension::e2DArray;
viewDesc.baseMipLevel = src.mipLevel;
viewDesc.mipLevelCount = 1;
if (is3D) {
viewDesc.baseArrayLayer = 0;
viewDesc.arrayLayerCount = 1;
} else {
viewDesc.baseArrayLayer = src.origin.z;
viewDesc.arrayLayerCount = copyExtent.depthOrArrayLayers;
}
Ref<TextureViewBase> srcView;
DAWN_TRY_ASSIGN(srcView, src.texture->CreateView(&viewDesc));
Ref<BindGroupBase> bindGroup;
DAWN_TRY_ASSIGN(bindGroup, utils::MakeBindGroup(device, bindGroupLayout,
{
{0, srcView},
{1, destinationBuffer},
{2, uniformBuffer},
},
UsageValidationMode::Internal));
Ref<ComputePassEncoder> pass = commandEncoder->BeginComputePass();
pass->APISetPipeline(pipeline.Get());
pass->APISetBindGroup(0, bindGroup.Get());
pass->APIDispatchWorkgroups(workgroupCountX, workgroupCountY, workgroupCountZ);
pass->APIEnd();
if (useIntermediateCopyBuffer) {
ASSERT(destinationBuffer->GetSize() <= dst.buffer->GetAllocatedSize());
commandEncoder->InternalCopyBufferToBufferWithAllocatedSize(
destinationBuffer.Get(), 0, dst.buffer.Get(), 0, destinationBuffer->GetSize());
}
return {};
}
} // namespace dawn::native