Files
test2/intern/opensubdiv/internal/evaluator/gpu_compute_evaluator.cc
Jeroen Bakker 15d88e544a GPU: Storage buffer allocation alignment
Since the introduction of storage buffers in Blender, the calling
code has been responsible for ensuring the buffer meets allocation
requirements. All backends require the allocation size to be divisible
by 16 bytes. Until now, this was sufficient, but with GPU subdivision
changes, an external library must also adhere to these requirements.

For OpenSubdiv (OSD), some buffers are not 16-byte aligned, leading
to potential misallocation. Currently, this is mitigated by allocating
a few extra bytes, but this approach has the drawback of potentially
reading unintended bytes beyond the source buffer.

This PR adopts a similar approach to vertex buffers: the backend handles
extra byte allocation while ensuring data uploads and downloads function
correctly without requiring those additional bytes.

No changes were needed for Metal, as its allocation size is already
aligned to 256 bytes.

**Alternative solutions considered**:

- Copying the CPU buffer to a larger buffer when needed (performance impact).
- Modifying OSD buffers to allocate extra space (requires changes to an external library).
- Implementing GPU_storagebuf_update_sub.

Ref #135873

Pull Request: https://projects.blender.org/blender/blender/pulls/135716
2025-03-13 15:05:16 +01:00

518 lines
20 KiB
C++

/* SPDX-FileCopyrightText: 2025 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include <epoxy/gl.h>
#include "gpu_compute_evaluator.h"
#include <opensubdiv/far/error.h>
#include <opensubdiv/far/patchDescriptor.h>
#include <opensubdiv/far/stencilTable.h>
#include <opensubdiv/osd/glslPatchShaderSource.h>
#include <cassert>
#include <cmath>
#include <cstring>
#include <sstream>
#include <string>
#include <vector>
#include "GPU_capabilities.hh"
#include "GPU_compute.hh"
#include "GPU_context.hh"
#include "GPU_debug.hh"
#include "GPU_state.hh"
#include "GPU_vertex_buffer.hh"
#include "gpu_shader_create_info.hh"
using OpenSubdiv::Far::LimitStencilTable;
using OpenSubdiv::Far::StencilTable;
using OpenSubdiv::Osd::BufferDescriptor;
using OpenSubdiv::Osd::PatchArray;
using OpenSubdiv::Osd::PatchArrayVector;
/* Storage buffer binding slots used by the compute shaders declared in this file. */

/* Slots 0-3 are bound by both the stencil and the patch evaluation kernels. */
#define SHADER_SRC_VERTEX_BUFFER_BUF_SLOT 0
#define SHADER_DST_VERTEX_BUFFER_BUF_SLOT 1
#define SHADER_DU_BUFFER_BUF_SLOT 2
#define SHADER_DV_BUFFER_BUF_SLOT 3
/* Slots 4-9 as used by the stencil evaluation kernel. */
#define SHADER_SIZES_BUF_SLOT 4
#define SHADER_OFFSETS_BUF_SLOT 5
#define SHADER_INDICES_BUF_SLOT 6
#define SHADER_WEIGHTS_BUF_SLOT 7
#define SHADER_DU_WEIGHTS_BUF_SLOT 8
#define SHADER_DV_WEIGHTS_BUF_SLOT 9
/* Slots 4-7 as used by the patch evaluation kernel. The numbers overlap with the stencil-only
 * slots above; the two kernels are compiled as separate shaders. */
#define SHADER_PATCH_ARRAY_BUFFER_BUF_SLOT 4
#define SHADER_PATCH_COORDS_BUF_SLOT 5
#define SHADER_PATCH_INDEX_BUFFER_BUF_SLOT 6
#define SHADER_PATCH_PARAM_BUFFER_BUF_SLOT 7
namespace blender::opensubdiv {
/**
 * Create a static storage buffer initialized with the elements of `src`.
 *
 * \param src: Elements handed to the GPU module as initial data; the buffer is created with a
 * size of `src.size() * sizeof(T)` bytes.
 * \param name: Name forwarded to `GPU_storagebuf_create_ex`.
 * \return The newly created buffer, or `nullptr` when `src` is empty.
 */
template<class T> GPUStorageBuf *create_buffer(std::vector<T> const &src, const char *name)
{
  if (!src.empty()) {
    const size_t size_in_bytes = sizeof(T) * src.size();
    return GPU_storagebuf_create_ex(size_in_bytes, src.data(), GPU_USAGE_STATIC, name);
  }
  /* No elements: no buffer is created. */
  return nullptr;
}
/**
 * Create storage buffers for the sizes, offsets, control indices and weights of `stencilTable`.
 *
 * When the table reports no stencils, none of the buffer members are assigned here.
 */
GPUStencilTableSSBO::GPUStencilTableSSBO(StencilTable const *stencilTable)
{
  _numStencils = stencilTable->GetNumStencils();
  if (_numStencils <= 0) {
    return;
  }

  sizes_buf = create_buffer(stencilTable->GetSizes(), "osd_sized");
  offsets_buf = create_buffer(stencilTable->GetOffsets(), "osd_offsets");
  indices_buf = create_buffer(stencilTable->GetControlIndices(), "osd_control_indices");
  weights_buf = create_buffer(stencilTable->GetWeights(), "osd_weights");
}
/**
 * Create storage buffers for a limit stencil table: sizes, offsets, control indices and weights,
 * plus the first (du, dv) and second (duu, duv, dvv) derivative weights.
 *
 * When the table reports no stencils, none of the buffer members are assigned here.
 */
GPUStencilTableSSBO::GPUStencilTableSSBO(LimitStencilTable const *limitStencilTable)
{
  _numStencils = limitStencilTable->GetNumStencils();
  if (_numStencils <= 0) {
    return;
  }

  /* Topology and value weights. */
  sizes_buf = create_buffer(limitStencilTable->GetSizes(), "osd_sized");
  offsets_buf = create_buffer(limitStencilTable->GetOffsets(), "osd_offsets");
  indices_buf = create_buffer(limitStencilTable->GetControlIndices(), "osd_control_indices");
  weights_buf = create_buffer(limitStencilTable->GetWeights(), "osd_weights");

  /* First derivative weights. */
  du_weights_buf = create_buffer(limitStencilTable->GetDuWeights(), "osd_du_weights");
  dv_weights_buf = create_buffer(limitStencilTable->GetDvWeights(), "osd_dv_weights");

  /* Second derivative weights. */
  duu_weights_buf = create_buffer(limitStencilTable->GetDuuWeights(), "osd_duu_weights");
  duv_weights_buf = create_buffer(limitStencilTable->GetDuvWeights(), "osd_duv_weights");
  dvv_weights_buf = create_buffer(limitStencilTable->GetDvvWeights(), "osd_dvv_weights");
}
/**
 * Free the storage buffer referenced by `buffer` (if any) and reset the caller's pointer to
 * `nullptr`. Does nothing when `*buffer` is already null.
 */
static void storage_buffer_free(GPUStorageBuf **buffer)
{
  if (*buffer == nullptr) {
    return;
  }
  GPU_storagebuf_free(*buffer);
  *buffer = nullptr;
}
/** Release every storage buffer held by this table; members that are null are skipped. */
GPUStencilTableSSBO::~GPUStencilTableSSBO()
{
  /* Same release order as the member-by-member version. */
  GPUStorageBuf **const owned_buffers[] = {
      &sizes_buf,
      &offsets_buf,
      &indices_buf,
      &weights_buf,
      &du_weights_buf,
      &dv_weights_buf,
      &duu_weights_buf,
      &duv_weights_buf,
      &dvv_weights_buf,
  };
  for (GPUStorageBuf **buffer : owned_buffers) {
    storage_buffer_free(buffer);
  }
}
// ---------------------------------------------------------------------------
/**
 * Construct an evaluator with a work group size of 64 and no patch-array buffer.
 *
 * Both kernels are zero-filled so that their `shader` pointers start out null and their cached
 * uniform locations start out at 0; `EvalStencils`/`EvalPatches` test for exactly those values
 * before using a kernel.
 */
GPUComputeEvaluator::GPUComputeEvaluator() : _workGroupSize(64), _patchArraysSSBO(nullptr)
{
  /* The cast to `void *` makes the byte-wise clear of a class type explicit. */
  std::memset(static_cast<void *>(&_stencilKernel), 0, sizeof(_stencilKernel));
  std::memset(static_cast<void *>(&_patchKernel), 0, sizeof(_patchKernel));
}
/** Free the patch-array storage buffer created by `EvalPatches`, if one exists. */
GPUComputeEvaluator::~GPUComputeEvaluator()
{
  if (_patchArraysSSBO == nullptr) {
    return;
  }
  GPU_storagebuf_free(_patchArraysSSBO);
  _patchArraysSSBO = nullptr;
}
/**
 * Compile the stencil kernel and then the patch kernel for the given buffer layouts.
 *
 * \return True when both kernels compiled. The patch kernel is not attempted when the stencil
 * kernel fails.
 */
bool GPUComputeEvaluator::Compile(BufferDescriptor const &srcDesc,
                                  BufferDescriptor const &dstDesc,
                                  BufferDescriptor const &duDesc,
                                  BufferDescriptor const &dvDesc)
{
  /* `&&` short-circuits, preserving the "stop at first failure" order. */
  return _stencilKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc, _workGroupSize) &&
         _patchKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc, _workGroupSize);
}
/**
 * Synchronization hook of the evaluator. The kernel argument is unused; the implementation
 * only calls `GPU_finish()`.
 */
/* static */
void GPUComputeEvaluator::Synchronize(void * /*kernel*/)
{
// XXX: this is currently just for the performance measuring purpose.
// need to be reimplemented by fence and sync.
GPU_finish();
}
/**
 * Number of work groups of `_workGroupSize` elements needed to cover `count` elements
 * (integer division rounded up).
 */
int GPUComputeEvaluator::GetDispatchSize(int count) const
{
  const int padded_count = count + _workGroupSize - 1;
  return padded_count / _workGroupSize;
}
/**
 * Dispatch `shader` over enough work groups to cover `totalDispatchSize` elements and issue a
 * shader-storage memory barrier afterwards.
 *
 * \param shader: Compute shader to dispatch.
 * \param totalDispatchSize: Number of elements to process; converted to a group count with
 * `GetDispatchSize`.
 */
void GPUComputeEvaluator::DispatchCompute(GPUShader *shader, int totalDispatchSize) const
{
  const int groupCount = GetDispatchSize(totalDispatchSize);
  int groupsX = groupCount;
  int groupsY = 1;

  if (groupsX > GPU_max_work_group_count(0)) {
    /* The group count exceeds what a single dimension allows (the limit could be as low as 64k
     * per call), so spread the groups over a roughly square "2d" grid. The shader is expected to
     * rebuild the linear index from both coordinates. Even with a 64k limit per dimension this
     * leaves roughly `64k * 64k = 4` billion elements, which should be enough; the 3rd dimension
     * could be used otherwise. */
    /* TODO(fclem): We could dispatch fewer groups if we compute the prime factorization and
     * get the smallest rect fitting the requirements. */
    const int side = std::ceil(std::sqrt(groupCount));
    groupsX = side;
    groupsY = side;

    /* Rounding up may leave the last row completely empty; drop it in that case. */
    if ((groupsX * (groupsY - 1)) >= groupCount) {
      groupsY -= 1;
    }
  }

  /* X and Y may have different limits, so the split above is not guaranteed to fit. With the
   * standard 64k minimum on every dimension there is plenty of room, so it is presumed to fit. */
  assert(groupsY < GPU_max_work_group_count(1));
  GPU_compute_dispatch(shader, groupsX, groupsY, 1);

  /* Next usage of the src/dst buffers will always be a shader storage. Vertices/normals/attributes
   * are copied over to the final buffers using compute shaders. */
  GPU_memory_barrier(GPU_BARRIER_SHADER_STORAGE);
}
/**
 * Run the stencil evaluation kernel for the stencils in the range `[start, end)`.
 *
 * \param srcBuffer, dstBuffer: Vertex buffers bound as the source and destination storage
 * buffers.
 * \param srcDesc, dstDesc: Their layouts; only `offset` is uploaded here.
 * \param duBuffer, dvBuffer: Optional derivative output buffers (may be null).
 * \param duDesc, dvDesc: Layouts of the derivative buffers, uploaded as (offset, length, stride).
 * \param sizesBuffer, offsetsBuffer, indicesBuffer, weightsBuffer: Stencil table buffers.
 * \param duWeightsBuffer, dvWeightsBuffer: Optional derivative weight buffers (may be null).
 * \param start, end: Stencil range, uploaded as `batchStart`/`batchEnd`.
 * \return False when the stencil kernel has no shader; true otherwise, including when the range
 * is empty and nothing is dispatched.
 */
bool GPUComputeEvaluator::EvalStencils(gpu::VertBuf *srcBuffer,
                                       BufferDescriptor const &srcDesc,
                                       gpu::VertBuf *dstBuffer,
                                       BufferDescriptor const &dstDesc,
                                       gpu::VertBuf *duBuffer,
                                       BufferDescriptor const &duDesc,
                                       gpu::VertBuf *dvBuffer,
                                       BufferDescriptor const &dvDesc,
                                       GPUStorageBuf *sizesBuffer,
                                       GPUStorageBuf *offsetsBuffer,
                                       GPUStorageBuf *indicesBuffer,
                                       GPUStorageBuf *weightsBuffer,
                                       GPUStorageBuf *duWeightsBuffer,
                                       GPUStorageBuf *dvWeightsBuffer,
                                       int start,
                                       int end) const
{
  GPUShader *const shader = _stencilKernel.shader;
  if (!shader) {
    return false;
  }

  const int count = end - start;
  if (count <= 0) {
    return true;
  }

  GPU_shader_bind(shader);

  /* Vertex data. */
  GPU_vertbuf_bind_as_ssbo(srcBuffer, SHADER_SRC_VERTEX_BUFFER_BUF_SLOT);
  GPU_vertbuf_bind_as_ssbo(dstBuffer, SHADER_DST_VERTEX_BUFFER_BUF_SLOT);
  if (duBuffer != nullptr) {
    GPU_vertbuf_bind_as_ssbo(duBuffer, SHADER_DU_BUFFER_BUF_SLOT);
  }
  if (dvBuffer != nullptr) {
    GPU_vertbuf_bind_as_ssbo(dvBuffer, SHADER_DV_BUFFER_BUF_SLOT);
  }

  /* Stencil table. */
  GPU_storagebuf_bind(sizesBuffer, SHADER_SIZES_BUF_SLOT);
  GPU_storagebuf_bind(offsetsBuffer, SHADER_OFFSETS_BUF_SLOT);
  GPU_storagebuf_bind(indicesBuffer, SHADER_INDICES_BUF_SLOT);
  GPU_storagebuf_bind(weightsBuffer, SHADER_WEIGHTS_BUF_SLOT);
  if (duWeightsBuffer != nullptr) {
    GPU_storagebuf_bind(duWeightsBuffer, SHADER_DU_WEIGHTS_BUF_SLOT);
  }
  if (dvWeightsBuffer != nullptr) {
    GPU_storagebuf_bind(dvWeightsBuffer, SHADER_DV_WEIGHTS_BUF_SLOT);
  }

  /* Scalar uniforms. */
  GPU_shader_uniform_int_ex(shader, _stencilKernel.uniformStart, 1, 1, &start);
  GPU_shader_uniform_int_ex(shader, _stencilKernel.uniformEnd, 1, 1, &end);
  GPU_shader_uniform_int_ex(shader, _stencilKernel.uniformSrcOffset, 1, 1, &srcDesc.offset);
  GPU_shader_uniform_int_ex(shader, _stencilKernel.uniformDstOffset, 1, 1, &dstDesc.offset);

  // TODO init to -1 and check >= 0 to align with GPU module. Currently we assume that the uniform
  // location is not zero as there are other uniforms defined as well.
  const auto bind_buffer_desc = [shader](const auto location, BufferDescriptor const &desc) {
    if (location > 0) {
      int value[] = {desc.offset, desc.length, desc.stride};
      GPU_shader_uniform_int_ex(shader, location, 3, 1, value);
    }
  };
  bind_buffer_desc(_stencilKernel.uniformDuDesc, duDesc);
  bind_buffer_desc(_stencilKernel.uniformDvDesc, dvDesc);

  DispatchCompute(shader, count);
  // GPU_storagebuf_unbind_all();
  GPU_shader_unbind();
  return true;
}
/**
 * Run the patch evaluation kernel for `numPatchCoords` patch coordinates.
 *
 * The patch-array storage buffer (`_patchArraysSSBO`) is freed and recreated from `patchArrays`
 * on every call.
 *
 * \param srcBuffer, dstBuffer: Vertex buffers bound as the source and destination storage
 * buffers.
 * \param srcDesc, dstDesc: Their layouts; only `offset` is uploaded here.
 * \param duBuffer, dvBuffer: Optional derivative output buffers (may be null).
 * \param duDesc, dvDesc: Layouts of the derivative buffers, uploaded as (offset, length, stride).
 * \param numPatchCoords: Number of elements to dispatch.
 * \param patchCoordsBuffer: Vertex buffer bound as the patch coordinates storage buffer.
 * \param patchArrays: Patch arrays uploaded into the patch-array storage buffer.
 * \param patchIndexBuffer, patchParamsBuffer: Patch table buffers.
 * \return False when the patch kernel has no shader, true otherwise.
 */
bool GPUComputeEvaluator::EvalPatches(gpu::VertBuf *srcBuffer,
                                      BufferDescriptor const &srcDesc,
                                      gpu::VertBuf *dstBuffer,
                                      BufferDescriptor const &dstDesc,
                                      gpu::VertBuf *duBuffer,
                                      BufferDescriptor const &duDesc,
                                      gpu::VertBuf *dvBuffer,
                                      BufferDescriptor const &dvDesc,
                                      int numPatchCoords,
                                      gpu::VertBuf *patchCoordsBuffer,
                                      const PatchArrayVector &patchArrays,
                                      GPUStorageBuf *patchIndexBuffer,
                                      GPUStorageBuf *patchParamsBuffer)
{
  if (_patchKernel.shader == nullptr) {
    return false;
  }

  GPU_shader_bind(_patchKernel.shader);

  GPU_vertbuf_bind_as_ssbo(srcBuffer, SHADER_SRC_VERTEX_BUFFER_BUF_SLOT);
  GPU_vertbuf_bind_as_ssbo(dstBuffer, SHADER_DST_VERTEX_BUFFER_BUF_SLOT);
  if (duBuffer) {
    GPU_vertbuf_bind_as_ssbo(duBuffer, SHADER_DU_BUFFER_BUF_SLOT);
  }
  if (dvBuffer) {
    GPU_vertbuf_bind_as_ssbo(dvBuffer, SHADER_DV_BUFFER_BUF_SLOT);
  }

  GPU_vertbuf_bind_as_ssbo(patchCoordsBuffer, SHADER_PATCH_COORDS_BUF_SLOT);
  GPU_storagebuf_bind(patchIndexBuffer, SHADER_PATCH_INDEX_BUFFER_BUF_SLOT);
  GPU_storagebuf_bind(patchParamsBuffer, SHADER_PATCH_PARAM_BUFFER_BUF_SLOT);

  /* Replace the patch-array buffer of the previous call with one matching `patchArrays`. */
  if (_patchArraysSSBO) {
    GPU_storagebuf_free(_patchArraysSSBO);
    _patchArraysSSBO = nullptr;
  }
  /* `data()` instead of `&patchArrays[0]`: indexing an empty vector is undefined behavior. */
  _patchArraysSSBO = GPU_storagebuf_create_ex(patchArrays.size() * sizeof(PatchArray),
                                              static_cast<const void *>(patchArrays.data()),
                                              GPU_USAGE_STATIC,
                                              "osd_patch_array");
  GPU_storagebuf_bind(_patchArraysSSBO, SHADER_PATCH_ARRAY_BUFFER_BUF_SLOT);

  GPU_shader_uniform_int_ex(
      _patchKernel.shader, _patchKernel.uniformSrcOffset, 1, 1, &srcDesc.offset);
  GPU_shader_uniform_int_ex(
      _patchKernel.shader, _patchKernel.uniformDstOffset, 1, 1, &dstDesc.offset);

  // TODO init to -1 and check >= 0 to align with GPU module.
  /* The guard must look at the location cached for the *patch* kernel: that is the shader the
   * uniform is uploaded to. Testing the stencil kernel's location could skip a valid upload or
   * attempt one with an invalid location. */
  const auto bind_buffer_desc = [this](const auto location, BufferDescriptor const &desc) {
    if (location > 0) {
      int value[] = {desc.offset, desc.length, desc.stride};
      GPU_shader_uniform_int_ex(_patchKernel.shader, location, 3, 1, value);
    }
  };
  bind_buffer_desc(_patchKernel.uniformDuDesc, duDesc);
  bind_buffer_desc(_patchKernel.uniformDvDesc, dvDesc);

  DispatchCompute(_patchKernel.shader, numPatchCoords);
  GPU_shader_unbind();
  return true;
}
// ---------------------------------------------------------------------------
/** The constructor performs no initialization of its own. */
GPUComputeEvaluator::_StencilKernel::_StencilKernel() = default;

/** Free the compiled stencil shader, if there is one. */
GPUComputeEvaluator::_StencilKernel::~_StencilKernel()
{
  if (shader == nullptr) {
    return;
  }
  GPU_shader_free(shader);
  shader = nullptr;
}
/**
 * Build the create-info for the stencil evaluation compute shader and create the shader.
 *
 * \param srcDesc: Source buffer layout; `length` and `stride` are baked into the shader as the
 * `LENGTH` and `SRC_STRIDE` defines.
 * \param dstDesc: Destination buffer layout; `stride` is baked in as `DST_STRIDE`.
 * \param duDesc, dvDesc: Derivative buffer layouts; only their `length` is inspected here, to
 * decide whether the first-derivative resources are declared.
 * \param workGroupSize: Local group size along X, also exposed as `WORK_GROUP_SIZE`.
 * \return The result of `GPU_shader_create_from_info`; the caller treats `nullptr` as a failed
 * compilation.
 */
static GPUShader *compile_eval_stencil_shader(BufferDescriptor const &srcDesc,
BufferDescriptor const &dstDesc,
BufferDescriptor const &duDesc,
BufferDescriptor const &dvDesc,
int workGroupSize)
{
using namespace blender::gpu::shader;
ShaderCreateInfo info("opensubdiv_compute_eval");
info.local_group_size(workGroupSize, 1, 1);
/* Select the patch basis implementation matching the active backend. */
if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
info.define("OSD_PATCH_BASIS_METAL");
}
else {
info.define("OSD_PATCH_BASIS_GLSL");
}
// TODO: use specialization constants for src_stride, dst_stride. Not sure we can use
// work group size as that requires extensions. This allows us to compile less shaders and
// improve overall performance. Adding length as specialization constant will not work as it is
// used to define an array length. This is not supported by Metal.
/* NOTE(review): the define values are kept in named locals that live until the shader has been
 * created; presumably `info.define` may keep referring to them - confirm before inlining. */
std::string length = std::to_string(srcDesc.length);
std::string src_stride = std::to_string(srcDesc.stride);
std::string dst_stride = std::to_string(dstDesc.stride);
std::string work_group_size = std::to_string(workGroupSize);
info.define("LENGTH", length);
info.define("SRC_STRIDE", src_stride);
info.define("DST_STRIDE", dst_stride);
info.define("WORK_GROUP_SIZE", work_group_size);
info.typedef_source("osd_patch_basis.glsl");
/* Source and destination vertex data. */
info.storage_buf(
SHADER_SRC_VERTEX_BUFFER_BUF_SLOT, Qualifier::READ, "float", "srcVertexBuffer[]");
info.storage_buf(
SHADER_DST_VERTEX_BUFFER_BUF_SLOT, Qualifier::WRITE, "float", "dstVertexBuffer[]");
info.push_constant(Type::INT, "srcOffset");
info.push_constant(Type::INT, "dstOffset");
/* First derivatives are enabled as soon as either derivative descriptor has a length. */
bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
if (deriv1) {
info.define("OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES");
info.storage_buf(SHADER_DU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duBuffer[]");
info.storage_buf(SHADER_DV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvBuffer[]");
info.push_constant(Type::IVEC3, "duDesc");
info.push_constant(Type::IVEC3, "dvDesc");
}
/* Stencil table buffers. */
info.storage_buf(SHADER_SIZES_BUF_SLOT, Qualifier::READ, "int", "sizes_buf[]");
info.storage_buf(SHADER_OFFSETS_BUF_SLOT, Qualifier::READ, "int", "offsets_buf[]");
info.storage_buf(SHADER_INDICES_BUF_SLOT, Qualifier::READ, "int", "indices_buf[]");
info.storage_buf(SHADER_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "weights_buf[]");
if (deriv1) {
info.storage_buf(
SHADER_DU_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "du_weights_buf[]");
info.storage_buf(
SHADER_DV_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "dv_weights_buf[]");
}
/* Range of stencils processed by one dispatch. */
info.push_constant(Type::INT, "batchStart");
info.push_constant(Type::INT, "batchEnd");
info.compute_source("osd_eval_stencils_comp.glsl");
GPUShader *shader = GPU_shader_create_from_info(
reinterpret_cast<const GPUShaderCreateInfo *>(&info));
return shader;
}
/**
 * (Re)compile the stencil evaluation shader and cache its uniform locations.
 *
 * A previously compiled shader is freed first.
 *
 * \return False when shader creation failed (in which case `shader` is null), true otherwise.
 */
bool GPUComputeEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDesc,
                                                  BufferDescriptor const &dstDesc,
                                                  BufferDescriptor const &duDesc,
                                                  BufferDescriptor const &dvDesc,
                                                  int workGroupSize)
{
  if (shader != nullptr) {
    GPU_shader_free(shader);
    shader = nullptr;
  }

  shader = compile_eval_stencil_shader(srcDesc, dstDesc, duDesc, dvDesc, workGroupSize);
  if (!shader) {
    return false;
  }

  // cache uniform locations (TODO: use uniform block)
  const auto location_of = [this](const char *uniform_name) {
    return GPU_shader_get_uniform(shader, uniform_name);
  };
  uniformStart = location_of("batchStart");
  uniformEnd = location_of("batchEnd");
  uniformSrcOffset = location_of("srcOffset");
  uniformDstOffset = location_of("dstOffset");
  uniformDuDesc = location_of("duDesc");
  uniformDvDesc = location_of("dvDesc");
  return true;
}
// ---------------------------------------------------------------------------
/** The constructor performs no initialization of its own. */
GPUComputeEvaluator::_PatchKernel::_PatchKernel() = default;

/** Free the compiled patch shader, if there is one. */
GPUComputeEvaluator::_PatchKernel::~_PatchKernel()
{
  if (shader == nullptr) {
    return;
  }
  GPU_shader_free(shader);
  shader = nullptr;
}
/**
 * Build the create-info for the patch evaluation compute shader and create the shader.
 *
 * \param srcDesc: Source buffer layout; `length` and `stride` are baked into the shader as the
 * `LENGTH` and `SRC_STRIDE` defines.
 * \param dstDesc: Destination buffer layout; `stride` is baked in as `DST_STRIDE`.
 * \param duDesc, dvDesc: Derivative buffer layouts; only their `length` is inspected here, to
 * decide whether the first-derivative resources are declared.
 * \param workGroupSize: Local group size along X, also exposed as `WORK_GROUP_SIZE`.
 * \return The result of `GPU_shader_create_from_info`; the caller treats `nullptr` as a failed
 * compilation.
 */
static GPUShader *compile_eval_patches_shader(BufferDescriptor const &srcDesc,
BufferDescriptor const &dstDesc,
BufferDescriptor const &duDesc,
BufferDescriptor const &dvDesc,
int workGroupSize)
{
using namespace blender::gpu::shader;
ShaderCreateInfo info("opensubdiv_compute_eval");
info.local_group_size(workGroupSize, 1, 1);
/* Select the patch basis implementation matching the active backend. */
if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
info.define("OSD_PATCH_BASIS_METAL");
}
else {
info.define("OSD_PATCH_BASIS_GLSL");
}
// TODO: use specialization constants for src_stride, dst_stride. Not sure we can use
// work group size as that requires extensions. This allows us to compile less shaders and
// improve overall performance. Adding length as specialization constant will not work as it is
// used to define an array length. This is not supported by Metal.
/* NOTE(review): the define values are kept in named locals that live until the shader has been
 * created; presumably `info.define` may keep referring to them - confirm before inlining. */
std::string length = std::to_string(srcDesc.length);
std::string src_stride = std::to_string(srcDesc.stride);
std::string dst_stride = std::to_string(dstDesc.stride);
std::string work_group_size = std::to_string(workGroupSize);
info.define("LENGTH", length);
info.define("SRC_STRIDE", src_stride);
info.define("DST_STRIDE", dst_stride);
info.define("WORK_GROUP_SIZE", work_group_size);
info.typedef_source("osd_patch_basis.glsl");
/* Source and destination vertex data. */
info.storage_buf(
SHADER_SRC_VERTEX_BUFFER_BUF_SLOT, Qualifier::READ, "float", "srcVertexBuffer[]");
info.storage_buf(
SHADER_DST_VERTEX_BUFFER_BUF_SLOT, Qualifier::WRITE, "float", "dstVertexBuffer[]");
info.push_constant(Type::INT, "srcOffset");
info.push_constant(Type::INT, "dstOffset");
/* First derivatives are enabled as soon as either derivative descriptor has a length. */
bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
if (deriv1) {
info.define("OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES");
info.storage_buf(SHADER_DU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duBuffer[]");
info.storage_buf(SHADER_DV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvBuffer[]");
info.push_constant(Type::IVEC3, "duDesc");
info.push_constant(Type::IVEC3, "dvDesc");
}
/* Patch table buffers. */
info.storage_buf(
SHADER_PATCH_ARRAY_BUFFER_BUF_SLOT, Qualifier::READ, "OsdPatchArray", "patchArrayBuffer[]");
info.storage_buf(
SHADER_PATCH_COORDS_BUF_SLOT, Qualifier::READ, "OsdPatchCoord", "patchCoords[]");
info.storage_buf(
SHADER_PATCH_INDEX_BUFFER_BUF_SLOT, Qualifier::READ, "int", "patchIndexBuffer[]");
info.storage_buf(
SHADER_PATCH_PARAM_BUFFER_BUF_SLOT, Qualifier::READ, "OsdPatchParam", "patchParamBuffer[]");
info.compute_source("osd_eval_patches_comp.glsl");
GPUShader *shader = GPU_shader_create_from_info(
reinterpret_cast<const GPUShaderCreateInfo *>(&info));
return shader;
}
/**
 * (Re)compile the patch evaluation shader and cache its uniform locations.
 *
 * A previously compiled shader is freed first.
 *
 * \return False when shader creation failed (in which case `shader` is null), true otherwise.
 */
bool GPUComputeEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
                                                BufferDescriptor const &dstDesc,
                                                BufferDescriptor const &duDesc,
                                                BufferDescriptor const &dvDesc,
                                                int workGroupSize)
{
  if (shader != nullptr) {
    GPU_shader_free(shader);
    shader = nullptr;
  }

  shader = compile_eval_patches_shader(srcDesc, dstDesc, duDesc, dvDesc, workGroupSize);
  if (!shader) {
    return false;
  }

  // cache uniform locations
  const auto location_of = [this](const char *uniform_name) {
    return GPU_shader_get_uniform(shader, uniform_name);
  };
  uniformSrcOffset = location_of("srcOffset");
  uniformDstOffset = location_of("dstOffset");
  uniformDuDesc = location_of("duDesc");
  uniformDvDesc = location_of("dvDesc");
  return true;
}
} // namespace blender::opensubdiv