Files
test2/source/blender/gpu/metal/mtl_index_buffer.mm
Campbell Barton e955c94ed3 License Headers: Set copyright to "Blender Authors", add AUTHORS
Listing the "Blender Foundation" as copyright holder implied the Blender
Foundation holds copyright to files which may include work from many
developers.

While keeping copyright on headers makes sense for isolated libraries,
Blender's own code may be refactored or moved between files in a way
that makes the per file copyright holders less meaningful.

Copyright references to the "Blender Foundation" have been replaced with
"Blender Authors", with the exception of `./extern/`, since this
contains libraries which are more isolated; any changes to license
headers there can be handled on a case-by-case basis.

Some directories in `./intern/` have also been excluded:

- `./intern/cycles/`: its own `AUTHORS` file is planned.
- `./intern/opensubdiv/`.

An "AUTHORS" file has been added, using the chromium projects authors
file as a template.

Design task: #110784

Ref !110783.
2023-08-16 00:20:26 +10:00

565 lines
19 KiB
Plaintext

/* SPDX-FileCopyrightText: 2022-2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup gpu
*/
#include "mtl_index_buffer.hh"
#include "mtl_context.hh"
#include "mtl_debug.hh"
#include "mtl_storage_buffer.hh"
#include "BLI_span.hh"
namespace blender::gpu {
/* -------------------------------------------------------------------- */
/** \name Core MTLIndexBuf implementation.
* \{ */
MTLIndexBuf::~MTLIndexBuf()
{
  /* A sub-range index buffer takes its `ibo_` pointer from the source index buffer
   * (see `upload_data`), so the primary allocation is only released when this
   * instance is not a sub-range. */
  const bool is_primary_owner = !this->is_subrange_;
  if (is_primary_owner && ibo_ != nullptr) {
    ibo_->free();
  }

  /* Release the optimized/emulated variant, if one was generated. */
  this->free_optimized_buffer();

  /* Destroy the storage-buffer wrapper created by `bind_as_ssbo`.
   * `delete` on a null pointer is a no-op, so no explicit check is needed. */
  delete ssbo_wrapper_;
  ssbo_wrapper_ = nullptr;
}
/* Release the optimized/emulated index buffer (if any) and clear the reference to it. */
void MTLIndexBuf::free_optimized_buffer()
{
  if (optimized_ibo_ == nullptr) {
    /* Nothing was generated, or it has already been released. */
    return;
  }
  optimized_ibo_->free();
  optimized_ibo_ = nullptr;
}
/* Bind this index buffer as a storage buffer at the given binding slot. */
void MTLIndexBuf::bind_as_ssbo(uint32_t binding)
{
  /* Once bound as an SSBO the GPU may partially modify the contents, so the buffer can no
   * longer be used with optimized/patched variants. The flag is cleared before the
   * optimized buffer is released, so the debug check in `flag_can_optimize` still runs
   * against the current state. */
  this->flag_can_optimize(false);
  this->free_optimized_buffer();

  /* Make sure the resource has been created and populated. */
  this->upload_data();

  /* A valid IBO is required from here on. */
  BLI_assert(this->ibo_);

  /* Lazily create the MTLStorageBuf wrapper and bind through the conventional path. */
  if (!ssbo_wrapper_) {
    ssbo_wrapper_ = new MTLStorageBuf(this, alloc_size_);
  }
  ssbo_wrapper_->bind(binding);
}
/* Copy the contents of the index buffer into `data`.
 * `size_get()` bytes are written, so `data` must be at least that large. */
void MTLIndexBuf::read(uint32_t *data) const
{
  if (ibo_ == nullptr) {
    BLI_assert(false && "Index buffer not ready to be read.");
    return;
  }

  /* Fetch active context. */
  MTLContext *ctx = MTLContext::get();
  BLI_assert(ctx);

  /* Managed buffers need an explicit synchronize so the host copy reflects GPU writes. */
  id<MTLBuffer> source_buffer = ibo_->get_metal_buffer();
  const bool needs_host_sync = (source_buffer.storageMode == MTLStorageModeManaged);
  if (needs_host_sync) {
    id<MTLBlitCommandEncoder> enc = ctx->main_command_buffer.ensure_begin_blit_encoder();
    [enc synchronizeResource:source_buffer];
  }

  /* Wait until the GPU has finished any commands which may modify the data. */
  GPU_finish();

  /* Read back through the host-visible pointer. */
  memcpy(data, ibo_->get_host_ptr(), size_get());
}
void MTLIndexBuf::upload_data()
{
/* Handle sub-range upload. */
if (is_subrange_) {
MTLIndexBuf *mtlsrc = static_cast<MTLIndexBuf *>(src_);
mtlsrc->upload_data();
#ifndef NDEBUG
BLI_assert_msg(!mtlsrc->point_restarts_stripped_,
"Cannot use sub-range on stripped point buffer.");
#endif
/* If parent sub-range allocation has changed,
* update our index buffer. */
if (alloc_size_ != mtlsrc->alloc_size_ || ibo_ != mtlsrc->ibo_) {
/* Update index buffer and allocation from source. */
alloc_size_ = mtlsrc->alloc_size_;
ibo_ = mtlsrc->ibo_;
/* Reset any allocated patched or optimized index buffers. */
this->free_optimized_buffer();
}
return;
}
/* If new data ready, and index buffer already exists, release current. */
if ((ibo_ != nullptr) && (this->data_ != nullptr)) {
MTL_LOG_INFO("Re-creating index buffer with new data. IndexBuf %p", this);
ibo_->free();
ibo_ = nullptr;
}
/* Prepare Buffer and Upload Data. */
if (ibo_ == nullptr) {
alloc_size_ = this->size_get();
if (alloc_size_ == 0) {
MTL_LOG_WARNING("Warning! Trying to allocate index buffer with size=0 bytes");
}
else {
if (data_) {
ibo_ = MTLContext::get_global_memory_manager()->allocate_with_data(
alloc_size_, true, data_);
}
else {
ibo_ = MTLContext::get_global_memory_manager()->allocate(alloc_size_, true);
}
BLI_assert(ibo_);
ibo_->set_label(@"Index Buffer");
}
/* No need to keep copy of data_ in system memory. */
if (data_) {
MEM_SAFE_FREE(data_);
}
}
}
/**
 * Overwrite `len` bytes of the index buffer, starting at byte offset `start`, with `data`.
 *
 * If the host-side copy (`data_`) still exists it is patched in place and any existing GPU
 * buffer is released. Otherwise the update is staged in a scratch buffer and copied into
 * the GPU buffer with a blit, so that it is ordered with other GPU commands.
 *
 * Not valid for sub-range index buffers.
 */
void MTLIndexBuf::update_sub(uint32_t start, uint32_t len, const void *data)
{
  BLI_assert(!is_subrange_);

  /* If host-side data still exists, modify and upload as normal. */
  if (data_ != nullptr) {
    /* Free index buffer if one exists. */
    if (ibo_ != nullptr && !this->is_subrange_) {
      ibo_->free();
      ibo_ = nullptr;
    }

    /* The written range is [start, start + len), so it may end exactly at the end of the
     * buffer. The sum is computed in 64 bits so it cannot wrap around. */
    BLI_assert(uint64_t(start) + uint64_t(len) <= uint64_t(this->size_get()));

    /* Apply start byte offset to data pointer. */
    void *host_base = data_;
    uint8_t *host_dst = static_cast<uint8_t *>(host_base) + start;

    /* Modify host-side data. */
    memcpy(host_dst, data, len);
    return;
  }

  /* Verify buffer. */
  BLI_assert(ibo_ != nullptr);

  /* Otherwise, we will inject a data update, using staged data, into the command stream.
   * Stage update contents in temporary buffer. */
  MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
  BLI_assert(ctx);
  MTLTemporaryBuffer range = ctx->get_scratchbuffer_manager().scratch_buffer_allocate_range(len);
  memcpy(range.data, data, len);

  /* Copy updated contents into primary buffer.
   * These changes need to be uploaded via blit to ensure the data copies happen in-order. */
  id<MTLBuffer> dest_buffer = ibo_->get_metal_buffer();
  BLI_assert(dest_buffer != nil);

  id<MTLBlitCommandEncoder> enc = ctx->main_command_buffer.ensure_begin_blit_encoder();
  [enc copyFromBuffer:range.metal_buffer
         sourceOffset:(uint32_t)range.buffer_offset
             toBuffer:dest_buffer
    destinationOffset:start
                 size:len];

  /* Synchronize changes back to host to ensure CPU-side data is up-to-date for non
   * Shared buffers. */
  if (dest_buffer.storageMode == MTLStorageModeManaged) {
    [enc synchronizeResource:dest_buffer];
  }

  /* Invalidate patched/optimized buffers. */
  this->free_optimized_buffer();

  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * have partial modifications. */
  this->flag_can_optimize(false);

  /* NOTE(review): this unconditional assert fires in debug builds whenever the GPU-side
   * update path above is taken. It presumably marks the path as unverified; confirm with
   * the backend owners before removing it. */
  BLI_assert(false);
}
/* Record whether this index buffer may be replaced by an optimized/emulated variant. */
void MTLIndexBuf::flag_can_optimize(bool can_optimize)
{
  can_optimize_ = can_optimize;

  /* NOTE: Indirect draws require optimization to be disabled, because the index count is
   * not known at submission time. If a separate draw pass has already produced an optimized
   * buffer, errors will occur; such cases need to be resolved at the high level by using
   * primitive types without primitive restart, which also perform far better on hardware. */
  BLI_assert_msg((optimized_ibo_ == nullptr) || can_optimize_,
                 "Index buffer optimization disabled, but optimal buffer already generated.");
}
/** \} */
/** \name Index buffer optimization and topology emulation
*
* Index buffer optimization and emulation. Optimize index buffers by
* eliminating restart-indices.
* Emulate unsupported index types e.g. Triangle Fan and Line Loop.
* \{ */
/**
 * Expand a triangle-strip index list into a triangle-list index list.
 *
 * An index equal to `T(-1)` is treated as a primitive restart: it ends the current strip
 * and is not written to the output. Strips with fewer than three indices emit nothing.
 *
 * Every index from the third one of a strip onwards emits exactly one triangle, built from
 * the two preceding strip indices and the new index. The first two indices are swapped for
 * every odd triangle of a strip so that all emitted triangles keep the same winding.
 *
 * \param original_data: Source strip indices; the first `input_index_len` entries are read.
 * \param output_data: Destination; must hold at least `(input_index_len - 2) * 3` entries.
 * \param input_index_len: Number of source indices to process.
 * \return Total number of indices written to `output_data`.
 */
template<typename T>
static uint32_t populate_optimized_tri_strip_buf(Span<T> original_data,
                                                 MutableSpan<T> output_data,
                                                 uint32_t input_index_len)
{
  const T restart_index = T(-1);

  /* Number of indices consumed so far in the current strip. */
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;

  /* The two most recent indices of the current strip: [0] is the older, [1] the newer.
   * These are tracked explicitly rather than read back from `output_data`, because the
   * output order of odd triangles is swapped and cannot be used to recover them. */
  T prev_indices[2] = {T(0), T(0)};

  for (uint32_t c_index = 0; c_index < input_index_len; c_index++) {
    const T current_index = original_data[c_index];

    if (current_index == restart_index) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
      continue;
    }

    if (current_vert_len < 2) {
      /* Not enough vertices for a triangle yet: cache the index. */
      prev_indices[current_vert_len] = current_index;
    }
    else {
      /* The third and every following index of a strip completes one triangle. */
      const uint32_t tri_id = current_vert_len - 2;
      if ((tri_id % 2) == 0) {
        output_data[current_output_ind + 0] = prev_indices[0];
        output_data[current_output_ind + 1] = prev_indices[1];
      }
      else {
        /* Odd triangles alternate orientation within a strip: swap to keep the winding. */
        output_data[current_output_ind + 0] = prev_indices[1];
        output_data[current_output_ind + 1] = prev_indices[0];
      }
      output_data[current_output_ind + 2] = current_index;
      current_output_ind += 3;

      /* Slide the two-index window forward. */
      prev_indices[0] = prev_indices[1];
      prev_indices[1] = current_index;
    }

    /* Increment relative vertex index. */
    current_vert_len++;
  }
  return current_output_ind;
}
/**
 * Expand a triangle-fan index list into a triangle-list index list.
 *
 * An index equal to `T(-1)` is treated as a primitive restart: it ends the current fan and
 * is not written to the output. Fans with fewer than three indices emit nothing.
 *
 * Every index from the third one of a fan onwards emits exactly one triangle:
 * (first index of the fan, previous index, new index).
 *
 * \param original_data: Source fan indices; the first `input_index_len` entries are read.
 * \param output_data: Destination; must hold at least `(input_index_len - 2) * 3` entries.
 * \param input_index_len: Number of source indices to process.
 * \return Total number of indices written to `output_data`.
 */
template<typename T>
static uint32_t populate_emulated_tri_fan_buf(Span<T> original_data,
                                              MutableSpan<T> output_data,
                                              uint32_t input_index_len)
{
  const T restart_index = T(-1);

  /* First index of the current fan, shared by all of its triangles. */
  T base_prim_ind_val = T(0);
  /* Most recent index of the current fan. */
  T prev_index = T(0);

  /* Number of indices consumed so far in the current fan. */
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;

  for (uint32_t c_index = 0; c_index < input_index_len; c_index++) {
    const T current_index = original_data[c_index];

    if (current_index == restart_index) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
      continue;
    }

    if (current_vert_len == 0) {
      /* First index of the fan is the shared hub vertex. */
      base_prim_ind_val = current_index;
    }
    else if (current_vert_len == 1) {
      /* Second index: not enough for a triangle yet, remember it. */
      prev_index = current_index;
    }
    else {
      /* The third and every following index of a fan completes one triangle. */
      output_data[current_output_ind++] = base_prim_ind_val;
      output_data[current_output_ind++] = prev_index;
      output_data[current_output_ind++] = current_index;
      prev_index = current_index;
    }

    /* Increment relative vertex index. */
    current_vert_len++;
  }
  return current_output_ind;
}
/**
 * Return the Metal buffer to bind for an indexed draw with the given primitive type.
 *
 * For #GPU_PRIM_TRI_FAN and #GPU_PRIM_TRI_STRIP (when the buffer is not a sub-range and
 * optimization is allowed), a triangle-list index buffer is generated once, the original
 * buffer is released, and the in/out parameters are rewritten to describe the generated
 * buffer. For every other case the regular index buffer is returned unchanged.
 *
 * \param in_out_primitive_type: Requested primitive type; set to #GPU_PRIM_TRIS when the
 * optimized buffer is returned.
 * \param in_out_v_count: Set to the index count of the optimized buffer when it is returned.
 * \return The buffer to bind, or nil if no buffer could be provided.
 */
id<MTLBuffer> MTLIndexBuf::get_index_buffer(GPUPrimType &in_out_primitive_type,
                                            uint32_t &in_out_v_count)
{
  /* Determine whether to return the original index buffer, or whether we
   * should emulate an unsupported primitive type, or optimize a restart-
   * compatible type for faster performance. */
  bool should_optimize_or_emulate = (in_out_primitive_type == GPU_PRIM_TRI_FAN) ||
                                    (in_out_primitive_type == GPU_PRIM_TRI_STRIP);
  if (!should_optimize_or_emulate || is_subrange_ || !can_optimize_) {
    /* Ensure we are not optimized. */
    BLI_assert(this->optimized_ibo_ == nullptr);

    /* Return regular index buffer. */
    BLI_assert(this->ibo_ && this->ibo_->get_metal_buffer());
    return this->ibo_->get_metal_buffer();
  }

  /* Perform optimization on type. */
  GPUPrimType input_prim_type = in_out_primitive_type;
  this->upload_data();
  if (!ibo_ && optimized_ibo_ == nullptr) {
    /* Cannot optimize buffer if no source IBO exists. */
    return nil;
  }

  /* Verify whether existing index buffer is valid. */
  if (optimized_ibo_ != nullptr && optimized_primitive_type_ != input_prim_type) {
    BLI_assert_msg(false,
                   "Cannot change the optimized primitive format after generation, as source "
                   "index buffer data is discarded.");
    return nil;
  }

  /* Generate optimized index buffer.
   * NOTE: `emulated_v_count` is not declared in this function; it is expected to be state
   * that persists between calls, as later calls return it without regenerating. */
  if (optimized_ibo_ == nullptr) {
    /* Generate unwrapped index buffer. */
    switch (input_prim_type) {
      case GPU_PRIM_TRI_FAN: {
        /* Calculate maximum size.
         * `index_len_` is unsigned, so fewer than 3 indices would underflow or yield zero. */
        BLI_assert(this->index_len_ > 2);
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager()->allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_emulated_tri_fan_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_emulated_tri_fan_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }

        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
        /* Must not fall through: the strip case would allocate a second buffer without
         * freeing this one and re-populate it using the strip algorithm. */
      } break;

      case GPU_PRIM_TRI_STRIP: {
        /* Calculate maximum size.
         * `index_len_` is unsigned, so fewer than 3 indices would underflow or yield zero. */
        BLI_assert(this->index_len_ > 2);
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager()->allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_optimized_tri_strip_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_optimized_tri_strip_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }

        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_LINE_STRIP: {
        /* TODO(Metal): Line strip topology types would benefit from optimization to remove
         * primitive restarts, however, these do not occur frequently, nor with
         * significant geometry counts. */
        MTL_LOG_INFO("TODO: Primitive topology: Optimize line strip topology types");
      } break;

      case GPU_PRIM_LINE_LOOP: {
        /* TODO(Metal): Line Loop primitive type requires use of optimized index buffer for
         * emulation, if used with indexed rendering. This path is currently not hit as #LineLoop
         * does not currently appear to be used alongside an index buffer. */
        MTL_LOG_WARNING(
            "TODO: Primitive topology: Line Loop Index buffer optimization required for "
            "emulation.");
      } break;

      case GPU_PRIM_TRIS:
      case GPU_PRIM_LINES:
      case GPU_PRIM_POINTS: {
        /* Should not get here - TRIS/LINES/POINTS do not require emulation or optimization. */
        BLI_assert_unreachable();
        return nil;
      }

      default:
        /* Should not get here - Invalid primitive type. */
        BLI_assert_unreachable();
        break;
    }
  }

  /* Return optimized buffer. */
  if (optimized_ibo_ != nullptr) {
    /* Delete original buffer if one still exists, as we do not need it. */
    if (ibo_ != nullptr) {
      ibo_->free();
      ibo_ = nullptr;
    }

    /* Output params. */
    in_out_v_count = emulated_v_count;
    in_out_primitive_type = GPU_PRIM_TRIS;
    return optimized_ibo_->get_metal_buffer();
  }
  return nil;
}
void MTLIndexBuf::strip_restart_indices()
{
/* We remove point buffer primitive restart indices by swapping restart indices
* with the first valid index at the end of the index buffer and reducing the
* length. Primitive restarts are invalid in Metal for non-restart-compatible
* primitive types. We also cannot just use zero unlike for Lines and Triangles,
* as we cannot create de-generative point primitives to hide geometry, as each
* point is independent.
* Instead, we must remove these hidden indices from the index buffer.
* NOTE: This happens prior to index squeezing so operate on 32-bit indices. */
MutableSpan<uint32_t> uint_idx(static_cast<uint32_t *>(data_), index_len_);
for (uint i = 0; i < index_len_; i++) {
if (uint_idx[i] == 0xFFFFFFFFu) {
/* Find swap index at end of index buffer. */
int swap_index = -1;
for (uint j = index_len_ - 1; j >= i && index_len_ > 0; j--) {
/* If end index is restart, just reduce length. */
if (uint_idx[j] == 0xFFFFFFFFu) {
index_len_--;
continue;
}
/* Otherwise assign swap index. */
swap_index = j;
break;
}
/* If index_len_ == 0, this means all indices were flagged as hidden, with restart index
* values. Hence we will entirely skip the draw. */
if (index_len_ > 0) {
/* If swap index is not valid, then there were no valid non-restart indices
* to swap with. However, the above loop will have removed these indices by
* reducing the length of indices. Debug assertions verify that the restart
* index is no longer included. */
if (swap_index == -1) {
BLI_assert(index_len_ <= i);
}
else {
/* If we have found an index we can swap with, flip the values.
* We also reduce the length. As per above loop, swap_index should
* now be outside the index length range. */
uint32_t swap_index_value = uint_idx[swap_index];
uint_idx[i] = swap_index_value;
uint_idx[swap_index] = 0xFFFFFFFFu;
index_len_--;
BLI_assert(index_len_ <= swap_index);
}
}
}
}
#ifndef NDEBUG
/* Flag as having been stripped to ensure invalid usage is tracked. */
point_restarts_stripped_ = true;
#endif
}
/** \} */
} // namespace blender::gpu