/* SPDX-FileCopyrightText: 2023 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

/** \file
 * \ingroup bli
 *
 * Task parallel range functions.
 */

#include <cstdlib>

#include "MEM_guardedalloc.h"

#include "BLI_array.hh"
#include "BLI_lazy_threading.hh"
#include "BLI_offset_indices.hh"
#include "BLI_task.h"
#include "BLI_task.hh"
#include "BLI_threads.h"
#include "BLI_vector.hh"

#include "atomic_ops.h"

#ifdef WITH_TBB
#  include <tbb/blocked_range.h>
#  include <tbb/enumerable_thread_specific.h>
#  include <tbb/parallel_for.h>
#  include <tbb/parallel_reduce.h>
#endif

#ifdef WITH_TBB

/* Functor for running TBB parallel_for and parallel_reduce. */
struct RangeTask {
  TaskParallelRangeFunc func;
  void *userdata;
  const TaskParallelSettings *settings;

  void *userdata_chunk;

  /* Root constructor. */
  RangeTask(TaskParallelRangeFunc func, void *userdata, const TaskParallelSettings *settings)
      : func(func), userdata(userdata), settings(settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Copy constructor. */
  RangeTask(const RangeTask &other)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Splitting constructor for parallel reduce. */
  RangeTask(RangeTask &other, tbb::split /*unused*/)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  ~RangeTask()
  {
    if (settings->func_free != nullptr && userdata_chunk != nullptr) {
      settings->func_free(userdata, userdata_chunk);
    }
    MEM_SAFE_FREE(userdata_chunk);
  }

  void init_chunk(void *from_chunk)
  {
    if (from_chunk) {
      userdata_chunk = MEM_mallocN(settings->userdata_chunk_size, "RangeTask");
      memcpy(userdata_chunk, from_chunk, settings->userdata_chunk_size);
    }
    else {
      userdata_chunk = nullptr;
    }
  }

  void operator()(const tbb::blocked_range<int> &r) const
  {
    TaskParallelTLS tls;
    tls.userdata_chunk = userdata_chunk;
    for (int i = r.begin(); i != r.end(); ++i) {
      func(userdata, i, &tls);
    }
  }

  void join(const RangeTask &other)
  {
    settings->func_reduce(userdata, userdata_chunk, other.userdata_chunk);
  }
};

#endif

void BLI_task_parallel_range(const int start,
                             const int stop,
                             void *userdata,
                             TaskParallelRangeFunc func,
                             const TaskParallelSettings *settings)
{
#ifdef WITH_TBB
  /* Multithreading. */
  if (settings->use_threading && BLI_task_scheduler_num_threads() > 1) {
    RangeTask task(func, userdata, settings);
    const size_t grainsize = std::max(settings->min_iter_per_thread, 1);
    const tbb::blocked_range<int> range(start, stop, grainsize);

    blender::lazy_threading::send_hint();

    if (settings->func_reduce) {
      parallel_reduce(range, task);
      if (settings->userdata_chunk) {
        memcpy(settings->userdata_chunk, task.userdata_chunk, settings->userdata_chunk_size);
      }
    }
    else {
      parallel_for(range, task);
    }
    return;
  }
#endif

  /* Single threaded. Nothing to reduce as everything is accumulated into the
   * main userdata chunk directly. */
  TaskParallelTLS tls;
  tls.userdata_chunk = settings->userdata_chunk;
  for (int i = start; i < stop; i++) {
    func(userdata, i, &tls);
  }
  if (settings->func_free != nullptr && settings->userdata_chunk != nullptr) {
    settings->func_free(userdata, settings->userdata_chunk);
  }
}
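
/* Illustrative usage sketch (not part of this file): summing an array with per-thread TLS
 * chunks and a reduce callback. `SumTLS`, `sum_func`, `sum_reduce`, and `sum_values` are
 * hypothetical names; the BLI task API calls and `BLI_parallel_range_settings_defaults()` from
 * BLI_task.h are the real entry points this sketch assumes.
 *
 * \code{.cc}
 * struct SumTLS {
 *   int sum;
 * };
 *
 * static void sum_func(void *userdata, const int i, const TaskParallelTLS *tls)
 * {
 *   const int *values = static_cast<const int *>(userdata);
 *   SumTLS *state = static_cast<SumTLS *>(tls->userdata_chunk);
 *   state->sum += values[i];
 * }
 *
 * static void sum_reduce(const void * / *userdata* /, void *chunk_join, void *chunk)
 * {
 *   static_cast<SumTLS *>(chunk_join)->sum += static_cast<SumTLS *>(chunk)->sum;
 * }
 *
 * int sum_values(int *values, const int num)
 * {
 *   SumTLS state = {0};
 *   TaskParallelSettings settings;
 *   BLI_parallel_range_settings_defaults(&settings);
 *   settings.userdata_chunk = &state;
 *   settings.userdata_chunk_size = sizeof(SumTLS);
 *   settings.func_reduce = sum_reduce;
 *   BLI_task_parallel_range(0, num, values, sum_func, &settings);
 *   return state.sum; // Reduced across all TLS copies (or accumulated directly when single threaded).
 * }
 * \endcode
 */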

int BLI_task_parallel_thread_id(const TaskParallelTLS * /*tls*/)
{
#ifdef WITH_TBB
  /* Get a unique thread ID for texture nodes. In the future we should get rid
   * of the thread ID and change texture evaluation to not require per-thread
   * storage that can't be efficiently allocated on the stack. */
  static tbb::enumerable_thread_specific<int> tbb_thread_id(-1);
  static int tbb_thread_id_counter = 0;

  int &thread_id = tbb_thread_id.local();
  if (thread_id == -1) {
    thread_id = atomic_fetch_and_add_int32(&tbb_thread_id_counter, 1);
    if (thread_id >= BLENDER_MAX_THREADS) {
      BLI_assert_msg(0, "Maximum number of threads exceeded for sculpting");
      thread_id = thread_id % BLENDER_MAX_THREADS;
    }
  }
  return thread_id;
#else
  return 0;
#endif
}
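
/* Illustrative sketch: `BLI_task_parallel_thread_id()` is typically used inside the range
 * callback to index a pre-allocated per-thread array of size BLENDER_MAX_THREADS, avoiding
 * locks. `ThreadScratch`, `scratch_per_thread`, and `count_positive` are hypothetical names;
 * the caller would sum the per-thread counters afterwards.
 *
 * \code{.cc}
 * struct ThreadScratch {
 *   int positive_count;
 * };
 *
 * static ThreadScratch scratch_per_thread[BLENDER_MAX_THREADS] = {};
 *
 * static void count_positive(void *userdata, const int i, const TaskParallelTLS *tls)
 * {
 *   const int *values = static_cast<const int *>(userdata);
 *   // Each thread gets a stable index below BLENDER_MAX_THREADS, so no locking is needed.
 *   ThreadScratch &scratch = scratch_per_thread[BLI_task_parallel_thread_id(tls)];
 *   if (values[i] > 0) {
 *     scratch.positive_count++;
 *   }
 * }
 * \endcode
 */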

namespace blender::threading::detail {

#ifdef WITH_TBB
static void parallel_for_impl_static_size(const IndexRange range,
                                          const int64_t grain_size,
                                          const FunctionRef<void(IndexRange)> function)
{
  tbb::parallel_for(tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
                    [function](const tbb::blocked_range<int64_t> &subrange) {
                      function(IndexRange(subrange.begin(), subrange.size()));
                    });
}
#endif /* WITH_TBB */

#ifdef WITH_TBB
static void parallel_for_impl_individual_size_lookup(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const TaskSizeHints_IndividualLookup &size_hints)
{
  /* Shouldn't be too small, because then there is more overhead when the individual tasks are
   * small. Also shouldn't be too large, because then the serial code that splits up tasks causes
   * extra overhead. */
  const int64_t outer_grain_size = std::min<int64_t>(grain_size, 512);
  threading::parallel_for(range, outer_grain_size, [&](const IndexRange sub_range) {
    /* Compute the size of every task in the current range. */
    Array<int64_t, 1024> task_sizes(sub_range.size());
    size_hints.lookup_individual_sizes(sub_range, task_sizes);

    /* Split the range into segments whose accumulated size approximates the grain size. */
    Vector<int64_t, 256> offsets_vec;
    offsets_vec.append(0);
    int64_t counter = 0;
    for (const int64_t i : sub_range.index_range()) {
      counter += task_sizes[i];
      if (counter >= grain_size) {
        offsets_vec.append(i + 1);
        counter = 0;
      }
    }
    if (offsets_vec.last() < sub_range.size()) {
      offsets_vec.append(sub_range.size());
    }
    const OffsetIndices<int64_t> offsets = offsets_vec.as_span();

    /* Run the dynamically split tasks in parallel. */
    threading::parallel_for(offsets.index_range(), 1, [&](const IndexRange offsets_range) {
      for (const int64_t i : offsets_range) {
        const IndexRange actual_range = offsets[i].shift(sub_range.start());
        function(actual_range);
      }
    });
  });
}
#endif /* WITH_TBB */
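
/* Worked example for the splitting above (illustrative numbers): with `grain_size = 8` and
 * individual task sizes {4, 1, 1, 10, 2, 3, 5}, the running counter reaches the grain size
 * after indices 3 and 6, so `offsets_vec` becomes {0, 4, 7} and `function` is called with the
 * sub-ranges [0, 4) and [4, 7), each shifted by `sub_range.start()`. */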

static void parallel_for_impl_accumulated_size_lookup(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const TaskSizeHints_AccumulatedLookup &size_hints)
{
  BLI_assert(!range.is_empty());
  if (range.size() == 1) {
    /* Can't subdivide further. */
    function(range);
    return;
  }
  const int64_t total_size = size_hints.lookup_accumulated_size(range);
  if (total_size <= grain_size) {
    function(range);
    return;
  }
  const int64_t middle = range.size() / 2;
  const IndexRange left_range = range.take_front(middle);
  const IndexRange right_range = range.drop_front(middle);
  threading::parallel_invoke(
      [&]() {
        parallel_for_impl_accumulated_size_lookup(left_range, grain_size, function, size_hints);
      },
      [&]() {
        parallel_for_impl_accumulated_size_lookup(right_range, grain_size, function, size_hints);
      });
}

void parallel_for_impl(const IndexRange range,
                       const int64_t grain_size,
                       const FunctionRef<void(IndexRange)> function,
                       const TaskSizeHints &size_hints)
{
#ifdef WITH_TBB
  lazy_threading::send_hint();
  switch (size_hints.type) {
    case TaskSizeHints::Type::Static: {
      const int64_t task_size = static_cast<const detail::TaskSizeHints_Static &>(size_hints).size;
      const int64_t final_grain_size = task_size == 1 ?
                                           grain_size :
                                           std::max<int64_t>(1, grain_size / task_size);
      parallel_for_impl_static_size(range, final_grain_size, function);
      break;
    }
    case TaskSizeHints::Type::IndividualLookup: {
      parallel_for_impl_individual_size_lookup(
          range,
          grain_size,
          function,
          static_cast<const detail::TaskSizeHints_IndividualLookup &>(size_hints));
      break;
    }
    case TaskSizeHints::Type::AccumulatedLookup: {
      parallel_for_impl_accumulated_size_lookup(
          range,
          grain_size,
          function,
          static_cast<const detail::TaskSizeHints_AccumulatedLookup &>(size_hints));
      break;
    }
  }
#else
  UNUSED_VARS(grain_size, size_hints);
  function(range);
#endif
}
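
/* Illustrative sketch of calling the public `threading::parallel_for()` wrapper from
 * BLI_task.hh, which (for large enough ranges) ends up in `parallel_for_impl()` above.
 * `fill_values` and the fill loop are hypothetical.
 *
 * \code{.cc}
 * void fill_values(blender::MutableSpan<float> values)
 * {
 *   blender::threading::parallel_for(values.index_range(), 1024, [&](const blender::IndexRange range) {
 *     for (const int64_t i : range) {
 *       values[i] = float(i) * 0.5f;
 *     }
 *   });
 * }
 * \endcode
 */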

void memory_bandwidth_bound_task_impl(const FunctionRef<void()> function)
{
#ifdef WITH_TBB
  /* This is the maximum number of threads that may perform these memory bandwidth bound tasks at
   * the same time. Often fewer threads are already enough to use up the full bandwidth capacity.
   * Additional threads usually have a negligible benefit and can even make performance worse.
   *
   * It's better to use fewer threads here so that the CPU cores can do other tasks at the same
   * time which may be more compute intensive. */
  const int num_threads = 8;
  if (num_threads >= BLI_task_scheduler_num_threads()) {
    /* Avoid the overhead of using a task arena when it would not have any effect anyway. */
    function();
    return;
  }
  static tbb::task_arena arena{num_threads};

  /* Make sure the lazy threading hints are sent now, because they shouldn't be sent out of an
   * isolated region. */
  lazy_threading::send_hint();
  lazy_threading::ReceiverIsolation isolation;

  arena.execute(function);
#else
  function();
#endif
}
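
/* Illustrative sketch: callers typically reach `memory_bandwidth_bound_task_impl()` through a
 * `threading::memory_bandwidth_bound_task(...)` wrapper in BLI_task.hh; the byte-count parameter
 * used below is an assumption, so the exact wrapper signature should be checked in the header.
 * `copy_positions` is a hypothetical memory-bound workload.
 *
 * \code{.cc}
 * void copy_positions(const blender::Span<blender::float3> src, blender::MutableSpan<blender::float3> dst)
 * {
 *   blender::threading::memory_bandwidth_bound_task(src.size_in_bytes() + dst.size_in_bytes(), [&]() {
 *     blender::threading::parallel_for(src.index_range(), 4096, [&](const blender::IndexRange range) {
 *       dst.slice(range).copy_from(src.slice(range));
 *     });
 *   });
 * }
 * \endcode
 */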
} // namespace blender::threading::detail