Files
test2/source/blender/gpu/intern/gpu_index_buffer.cc
Hans Goudey 0f46e02310 Mesh: Draw triangle index buffer creation improvements
This PR is another step in the refactor described by #116901.
This change applies to the triangle index buffer. The main improvement
is the ability to recognize when the mesh corner triangles index array
can be uploaded directly (when there is a single material and no hidden
faces). In that case the index data should be copied directly to the
GPU rather than to a temporary array owned by the IBO first. Though
that isn't implemented yet since it will be handled by the GPU module
later, the code is now structured to make that change simple from the
data extraction perspective.

Other than that, the main change is to not use the extractor iterator
framework anymore, and to set index data directly instead of using GPU
API functions. Though we're mainly bottlenecked by memory-bandwidth
anyway, it's nice to avoid function call overhead.

We also now avoid creating the array of sorted triangle indices when
there is a single material and no hidden faces. And we don't use
restart indices for the single-material case anymore. For Metal that's
nice because we can avoid `strip_restart_indices`.

I didn't notice significant performance improvements in my test files
beyond a few percent here and there. With a hacked implementation of
the copy-directly-to-the-gpu optimization, I did see more consistent
improvements though.

Pull Request: https://projects.blender.org/blender/blender/pulls/119130
2024-04-11 04:49:27 +02:00

582 lines
18 KiB
C++

/* SPDX-FileCopyrightText: 2016 by Mike Erwin. All rights reserved.
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup gpu
*
* GPU element list (AKA index buffer)
*/
#include "MEM_guardedalloc.h"
#include "BLI_array_utils.hh"
#include "BLI_math_base.h"
#include "BLI_utildefines.h"
#include "gpu_backend.hh"
#include "GPU_index_buffer.hh"
#include "GPU_capabilities.hh"
#include "GPU_compute.hh"
#include "GPU_platform.hh"
#include <algorithm> /* For `min/max`. */
#include <cstring>
#define KEEP_SINGLE_COPY 1
#define RESTART_INDEX 0xFFFFFFFF
/* -------------------------------------------------------------------- */
/** \name IndexBufBuilder
* \{ */
using namespace blender;
using namespace blender::gpu;
void GPU_indexbuf_init_ex(GPUIndexBufBuilder *builder,
GPUPrimType prim_type,
uint index_len,
uint vertex_len)
{
builder->max_allowed_index = vertex_len - 1;
builder->max_index_len = index_len;
builder->index_len = 0; // start empty
builder->index_min = UINT32_MAX;
builder->index_max = 0;
builder->prim_type = prim_type;
#ifdef __APPLE__
/* Only encode restart indices for restart-compatible primitive types.
* Resolves out-of-bounds read error on macOS. Using 0-index will ensure
* degenerative primitives when skipping primitives is required and will
* incur no additional performance cost for rendering. */
if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL)) {
/* We will still use restart-indices for point primitives and then
* patch these during IndexBuf::init, as we cannot benefit from degenerative
* primitives to eliminate these. */
builder->restart_index_value = (is_restart_compatible(prim_type) ||
prim_type == GPU_PRIM_POINTS) ?
RESTART_INDEX :
0;
}
else {
builder->restart_index_value = RESTART_INDEX;
}
#else
builder->restart_index_value = RESTART_INDEX;
#endif
builder->uses_restart_indices = false;
builder->data = (uint *)MEM_callocN(builder->max_index_len * sizeof(uint), "IndexBuf data");
}
void GPU_indexbuf_init(GPUIndexBufBuilder *builder,
GPUPrimType prim_type,
uint prim_len,
uint vertex_len)
{
int verts_per_prim = GPU_indexbuf_primitive_len(prim_type);
#if TRUST_NO_ONE
assert(verts_per_prim != -1);
#endif
GPU_indexbuf_init_ex(builder, prim_type, prim_len * uint(verts_per_prim), vertex_len);
}
IndexBuf *GPU_indexbuf_build_on_device(uint index_len)
{
IndexBuf *elem_ = GPU_indexbuf_calloc();
GPU_indexbuf_init_build_on_device(elem_, index_len);
return elem_;
}
void GPU_indexbuf_init_build_on_device(IndexBuf *elem, uint index_len)
{
IndexBuf *elem_ = elem;
elem_->init_build_on_device(index_len);
}
blender::MutableSpan<uint32_t> GPU_indexbuf_get_data(GPUIndexBufBuilder *builder)
{
return {builder->data, builder->max_index_len};
}
void GPU_indexbuf_join(GPUIndexBufBuilder *builder_to, const GPUIndexBufBuilder *builder_from)
{
BLI_assert(builder_to->data == builder_from->data);
builder_to->index_len = max_uu(builder_to->index_len, builder_from->index_len);
builder_to->index_min = min_uu(builder_to->index_min, builder_from->index_min);
builder_to->index_max = max_uu(builder_to->index_max, builder_from->index_max);
}
void GPU_indexbuf_add_generic_vert(GPUIndexBufBuilder *builder, uint v)
{
#if TRUST_NO_ONE
assert(builder->data != nullptr);
assert(builder->index_len < builder->max_index_len);
assert(v <= builder->max_allowed_index);
#endif
builder->data[builder->index_len++] = v;
builder->index_min = std::min(builder->index_min, v);
builder->index_max = std::max(builder->index_max, v);
}
void GPU_indexbuf_add_primitive_restart(GPUIndexBufBuilder *builder)
{
#if TRUST_NO_ONE
assert(builder->data != nullptr);
assert(builder->index_len < builder->max_index_len);
#endif
builder->data[builder->index_len++] = builder->restart_index_value;
builder->uses_restart_indices = true;
}
void GPU_indexbuf_add_point_vert(GPUIndexBufBuilder *builder, uint v)
{
#if TRUST_NO_ONE
assert(builder->prim_type == GPU_PRIM_POINTS);
#endif
GPU_indexbuf_add_generic_vert(builder, v);
}
void GPU_indexbuf_add_line_verts(GPUIndexBufBuilder *builder, uint v1, uint v2)
{
#if TRUST_NO_ONE
assert(builder->prim_type == GPU_PRIM_LINES);
assert(v1 != v2);
#endif
GPU_indexbuf_add_generic_vert(builder, v1);
GPU_indexbuf_add_generic_vert(builder, v2);
}
void GPU_indexbuf_add_tri_verts(GPUIndexBufBuilder *builder, uint v1, uint v2, uint v3)
{
#if TRUST_NO_ONE
assert(builder->prim_type == GPU_PRIM_TRIS);
assert(v1 != v2 && v2 != v3 && v3 != v1);
#endif
GPU_indexbuf_add_generic_vert(builder, v1);
GPU_indexbuf_add_generic_vert(builder, v2);
GPU_indexbuf_add_generic_vert(builder, v3);
}
void GPU_indexbuf_add_line_adj_verts(
GPUIndexBufBuilder *builder, uint v1, uint v2, uint v3, uint v4)
{
#if TRUST_NO_ONE
assert(builder->prim_type == GPU_PRIM_LINES_ADJ);
assert(v2 != v3); /* only the line need diff indices */
#endif
GPU_indexbuf_add_generic_vert(builder, v1);
GPU_indexbuf_add_generic_vert(builder, v2);
GPU_indexbuf_add_generic_vert(builder, v3);
GPU_indexbuf_add_generic_vert(builder, v4);
}
void GPU_indexbuf_set_point_vert(GPUIndexBufBuilder *builder, uint elem, uint v1)
{
BLI_assert(builder->prim_type == GPU_PRIM_POINTS);
BLI_assert(elem < builder->max_index_len);
builder->data[elem++] = v1;
builder->index_min = std::min(builder->index_min, v1);
builder->index_max = std::max(builder->index_max, v1);
builder->index_len = std::max(builder->index_len, elem);
}
void GPU_indexbuf_set_line_verts(GPUIndexBufBuilder *builder, uint elem, uint v1, uint v2)
{
BLI_assert(builder->prim_type == GPU_PRIM_LINES);
BLI_assert(v1 != v2);
BLI_assert(v1 <= builder->max_allowed_index);
BLI_assert(v2 <= builder->max_allowed_index);
BLI_assert((elem + 1) * 2 <= builder->max_index_len);
uint idx = elem * 2;
builder->data[idx++] = v1;
builder->data[idx++] = v2;
builder->index_min = std::min({builder->index_min, v1, v2});
builder->index_max = std::max({builder->index_max, v1, v2});
builder->index_len = std::max(builder->index_len, idx);
}
void GPU_indexbuf_set_tri_verts(GPUIndexBufBuilder *builder, uint elem, uint v1, uint v2, uint v3)
{
BLI_assert(builder->prim_type == GPU_PRIM_TRIS);
BLI_assert(v1 != v2 && v2 != v3 && v3 != v1);
BLI_assert(v1 <= builder->max_allowed_index);
BLI_assert(v2 <= builder->max_allowed_index);
BLI_assert(v3 <= builder->max_allowed_index);
BLI_assert((elem + 1) * 3 <= builder->max_index_len);
uint idx = elem * 3;
builder->data[idx++] = v1;
builder->data[idx++] = v2;
builder->data[idx++] = v3;
builder->index_min = std::min({builder->index_min, v1, v2, v3});
builder->index_max = std::max({builder->index_max, v1, v2, v3});
builder->index_len = std::max(builder->index_len, idx);
}
void GPU_indexbuf_set_point_restart(GPUIndexBufBuilder *builder, uint elem)
{
BLI_assert(builder->prim_type == GPU_PRIM_POINTS);
BLI_assert(elem < builder->max_index_len);
builder->data[elem++] = builder->restart_index_value;
builder->index_len = std::max(builder->index_len, elem);
builder->uses_restart_indices = true;
}
void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem)
{
BLI_assert(builder->prim_type == GPU_PRIM_LINES);
BLI_assert((elem + 1) * 2 <= builder->max_index_len);
uint idx = elem * 2;
builder->data[idx++] = builder->restart_index_value;
builder->data[idx++] = builder->restart_index_value;
builder->index_len = std::max(builder->index_len, idx);
builder->uses_restart_indices = true;
}
void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem)
{
BLI_assert(builder->prim_type == GPU_PRIM_TRIS);
BLI_assert((elem + 1) * 3 <= builder->max_index_len);
uint idx = elem * 3;
builder->data[idx++] = builder->restart_index_value;
builder->data[idx++] = builder->restart_index_value;
builder->data[idx++] = builder->restart_index_value;
builder->index_len = std::max(builder->index_len, idx);
builder->uses_restart_indices = true;
}
IndexBuf *GPU_indexbuf_build_curves_on_device(GPUPrimType prim_type,
uint curves_num,
uint verts_per_curve)
{
uint64_t dispatch_x_dim = verts_per_curve;
if (ELEM(prim_type, GPU_PRIM_LINE_STRIP, GPU_PRIM_TRI_STRIP)) {
dispatch_x_dim += 1;
}
uint64_t grid_x, grid_y, grid_z;
uint64_t max_grid_x = GPU_max_work_group_count(0), max_grid_y = GPU_max_work_group_count(1),
max_grid_z = GPU_max_work_group_count(2);
grid_x = min_uu(max_grid_x, (dispatch_x_dim + 15) / 16);
grid_y = (curves_num + 15) / 16;
if (grid_y <= max_grid_y) {
grid_z = 1;
}
else {
grid_y = grid_z = uint64_t(ceil(sqrt(double(grid_y))));
grid_y = min_uu(grid_y, max_grid_y);
grid_z = min_uu(grid_z, max_grid_z);
}
bool tris = (prim_type == GPU_PRIM_TRIS);
bool lines = (prim_type == GPU_PRIM_LINES);
GPUShader *shader = GPU_shader_get_builtin_shader(
tris ? GPU_SHADER_INDEXBUF_TRIS :
(lines ? GPU_SHADER_INDEXBUF_LINES : GPU_SHADER_INDEXBUF_POINTS));
GPU_shader_bind(shader);
IndexBuf *ibo = GPU_indexbuf_build_on_device(curves_num * dispatch_x_dim);
int resolution;
if (tris) {
resolution = 6;
}
else if (lines) {
resolution = 2;
}
else {
resolution = 1;
}
GPU_shader_uniform_1i(shader, "elements_per_curve", dispatch_x_dim / resolution);
GPU_shader_uniform_1i(shader, "ncurves", curves_num);
GPU_indexbuf_bind_as_ssbo(ibo, GPU_shader_get_ssbo_binding(shader, "out_indices"));
GPU_compute_dispatch(shader, grid_x, grid_y, grid_z);
GPU_memory_barrier(GPU_BARRIER_ELEMENT_ARRAY);
GPU_shader_unbind();
return ibo;
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name Creation & Deletion
* \{ */
namespace blender::gpu {
IndexBuf::~IndexBuf()
{
if (!is_subrange_) {
MEM_SAFE_FREE(data_);
}
}
void IndexBuf::init(uint indices_len,
uint32_t *indices,
uint min_index,
uint max_index,
GPUPrimType prim_type,
bool uses_restart_indices)
{
is_init_ = true;
data_ = indices;
index_start_ = 0;
index_len_ = indices_len;
is_empty_ = min_index > max_index;
/* Patch index buffer to remove restart indices from
* non-restart-compatible primitive types. Restart indices
* are situationally added to selectively hide vertices.
* Metal does not support restart-indices for non-restart-compatible
* types, as such we should remove these indices.
*
* We only need to perform this for point primitives, as
* line primitives/triangle primitives can use index 0 for all
* vertices to create a degenerative primitive, where all
* vertices share the same index and skip rendering via HW
* culling. */
if (prim_type == GPU_PRIM_POINTS && uses_restart_indices) {
this->strip_restart_indices();
}
#if GPU_TRACK_INDEX_RANGE
/* Everything remains 32 bit while building to keep things simple.
* Find min/max after, then convert to smallest index type possible. */
uint range = min_index < max_index ? max_index - min_index : 0;
/* count the primitive restart index. */
range += 1;
if (range <= 0xFFFF) {
index_type_ = GPU_INDEX_U16;
bool do_clamp_indices = false;
# ifdef __APPLE__
/* NOTE: For the Metal Backend, we use degenerative primitives to hide vertices
* which are not restart compatible. When this is done, we need to ensure
* that compressed index ranges clamp all index values within the valid
* range, rather than maximally clamping against the USHORT restart index
* value of 0xFFFFu, as this will cause an out-of-bounds read during
* vertex assembly. */
do_clamp_indices = GPU_type_matches_ex(
GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL);
# endif
this->squeeze_indices_short(min_index, max_index, prim_type, do_clamp_indices);
}
#endif
}
void IndexBuf::init_build_on_device(uint index_len)
{
is_init_ = true;
index_start_ = 0;
index_len_ = index_len;
index_type_ = GPU_INDEX_U32;
data_ = nullptr;
}
void IndexBuf::init_subrange(IndexBuf *elem_src, uint start, uint length)
{
/* We don't support nested sub-ranges. */
BLI_assert(elem_src && elem_src->is_subrange_ == false);
BLI_assert((length == 0) || (start + length <= elem_src->index_len_));
is_init_ = true;
is_subrange_ = true;
src_ = elem_src;
index_start_ = start;
index_len_ = length;
index_base_ = elem_src->index_base_;
index_type_ = elem_src->index_type_;
}
uint IndexBuf::index_range(uint *r_min, uint *r_max)
{
if (index_len_ == 0) {
*r_min = *r_max = 0;
return 0;
}
const uint32_t *uint_idx = (uint32_t *)data_;
uint min_value = RESTART_INDEX;
uint max_value = 0;
for (uint i = 0; i < index_len_; i++) {
const uint value = uint_idx[i];
if (value == RESTART_INDEX) {
continue;
}
if (value < min_value) {
min_value = value;
}
else if (value > max_value) {
max_value = value;
}
}
if (min_value == RESTART_INDEX) {
*r_min = *r_max = 0;
return 0;
}
*r_min = min_value;
*r_max = max_value;
return max_value - min_value;
}
void IndexBuf::squeeze_indices_short(uint min_idx,
uint max_idx,
GPUPrimType prim_type,
bool clamp_indices_in_range)
{
/* data will never be *larger* than builder->data...
* converting in place to avoid extra allocation */
uint16_t *ushort_idx = (uint16_t *)data_;
const uint32_t *uint_idx = (uint32_t *)data_;
if (max_idx >= 0xFFFF) {
index_base_ = min_idx;
/* NOTE: When using restart_index=0 for degenerative primitives indices,
* the compressed index will go below zero and wrap around when min_idx > 0.
* In order to ensure the resulting index is still within range, we instead
* clamp index to the maximum within the index range.
*
* `clamp_max_idx` represents the maximum possible index to clamp against. If primitive is
* restart-compatible, we can just clamp against the primitive-restart value, otherwise, we
* must assign to a valid index within the range.
*
* NOTE: For OpenGL we skip this by disabling clamping, as we still need to use
* restart index values for point primitives to disable rendering. */
uint16_t clamp_max_idx = (is_restart_compatible(prim_type) || !clamp_indices_in_range) ?
0xFFFFu :
(max_idx - min_idx);
for (uint i = 0; i < index_len_; i++) {
ushort_idx[i] = std::min<uint16_t>(clamp_max_idx, uint_idx[i] - min_idx);
}
}
else {
index_base_ = 0;
for (uint i = 0; i < index_len_; i++) {
ushort_idx[i] = uint16_t(uint_idx[i]);
}
}
}
} // namespace blender::gpu
/** \} */
/* -------------------------------------------------------------------- */
/** \name C-API
* \{ */
IndexBuf *GPU_indexbuf_calloc()
{
return GPUBackend::get()->indexbuf_alloc();
}
IndexBuf *GPU_indexbuf_build(GPUIndexBufBuilder *builder)
{
IndexBuf *elem = GPU_indexbuf_calloc();
GPU_indexbuf_build_in_place(builder, elem);
return elem;
}
IndexBuf *GPU_indexbuf_create_subrange(IndexBuf *elem_src, uint start, uint length)
{
IndexBuf *elem = GPU_indexbuf_calloc();
GPU_indexbuf_create_subrange_in_place(elem, elem_src, start, length);
return elem;
}
void GPU_indexbuf_build_in_place(GPUIndexBufBuilder *builder, IndexBuf *elem)
{
BLI_assert(builder->data != nullptr);
/* Transfer data ownership to IndexBuf.
* It will be uploaded upon first use. */
elem->init(builder->index_len,
builder->data,
builder->index_min,
builder->index_max,
builder->prim_type,
builder->uses_restart_indices);
builder->data = nullptr;
}
void GPU_indexbuf_build_in_place_ex(GPUIndexBufBuilder *builder,
const uint index_min,
const uint index_max,
const bool uses_restart_indices,
IndexBuf *elem)
{
BLI_assert(builder->data != nullptr);
/* Transfer data ownership to IndexBuf.
* It will be uploaded upon first use. */
elem->init(builder->max_index_len,
builder->data,
index_min,
index_max,
builder->prim_type,
uses_restart_indices);
builder->data = nullptr;
}
void GPU_indexbuf_build_in_place_from_memory(IndexBuf *ibo,
const GPUPrimType prim_type,
const uint32_t *data,
const int32_t data_len,
const int32_t index_min,
const int32_t index_max,
const bool uses_restart_indices)
{
const uint32_t indices_num = data_len * indices_per_primitive(prim_type);
/* TODO: The need for this copy is meant to be temporary. The data should be uploaded directly to
* the GPU here rather than copied to an array owned by the IBO first. */
uint32_t *copy = static_cast<uint32_t *>(
MEM_malloc_arrayN(indices_num, sizeof(uint32_t), __func__));
threading::memory_bandwidth_bound_task(sizeof(uint32_t) * indices_num * 2, [&]() {
array_utils::copy(Span(data, indices_num), MutableSpan(copy, indices_num));
});
ibo->init(indices_num, copy, index_min, index_max, prim_type, uses_restart_indices);
}
void GPU_indexbuf_create_subrange_in_place(IndexBuf *elem,
IndexBuf *elem_src,
uint start,
uint length)
{
elem->init_subrange(elem_src, start, length);
}
void GPU_indexbuf_read(IndexBuf *elem, uint32_t *data)
{
return elem->read(data);
}
void GPU_indexbuf_discard(IndexBuf *elem)
{
delete elem;
}
bool GPU_indexbuf_is_init(IndexBuf *elem)
{
return elem->is_init();
}
int GPU_indexbuf_primitive_len(GPUPrimType prim_type)
{
return indices_per_primitive(prim_type);
}
void GPU_indexbuf_use(IndexBuf *elem)
{
elem->upload_data();
}
void GPU_indexbuf_bind_as_ssbo(IndexBuf *elem, int binding)
{
elem->bind_as_ssbo(binding);
}
void GPU_indexbuf_update_sub(IndexBuf *elem, uint start, uint len, const void *data)
{
elem->update_sub(start, len, data);
}
/** \} */