This improves performance by **reducing** the number of threads used for tasks that require high memory bandwidth. This works because the underlying hardware has a fixed maximum memory bandwidth. If a few threads already saturate it, any additional threads that also access a lot of memory only add contention, which actually slows things down. Reducing the number of threads that may perform such tasks also keeps the remaining threads from being locked up doing work they can't do efficiently. Ideally there is enough other scheduled work so that those threads can run more compute-intensive tasks instead.

To use this new functionality, one has to put the parallel code in question into a `threading::memory_bandwidth_bound_task(...)` block. Additionally, one also has to provide a (very) rough approximation of how many bytes are accessed (see the sketch below). If that number is low, the number of threads shouldn't be reduced, because it's likely that all touched memory fits into the L3 cache, which generally has a much higher bandwidth than main memory.

The exact number of threads that should be allowed to run bandwidth-bound tasks at the same time is highly context and hardware dependent. It's also not really possible to measure reliably, because it depends on so many static and dynamic factors. For now the thread count is hardcoded to 8; that many threads seem to be easily capable of maxing out the bandwidth capacity.

With this technique I can measure surprisingly good performance improvements:

* Generating a 3000x3000 grid: 133ms -> 103ms.
* Generating a mesh line with 100'000'000 vertices: 212ms -> 189ms.
* Realizing mesh instances resulting in ~27'000'000 vertices: 460ms -> 305ms.

In all of these cases, only 8 instead of 24 threads are used. The remaining threads are idle in these cases, but they could do other work if available.

Pull Request: https://projects.blender.org/blender/blender/pulls/118939
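As an illustration of the usage described above, here is a minimal sketch of how a caller might wrap a bandwidth-bound loop. The buffer, the surrounding function, and the byte estimate are hypothetical and only serve as an example; the wrapper and `threading::parallel_for` are the API described here, and the exact parameter order should be checked against `BLI_task.hh`.

```cpp
#include "BLI_span.hh"
#include "BLI_task.hh"

/* Hypothetical caller: filling a large buffer is limited by memory bandwidth rather than by
 * compute, so the parallel loop is wrapped in a memory_bandwidth_bound_task block. */
static void fill_buffer_example(blender::MutableSpan<float> buffer)
{
  using namespace blender;
  /* Very rough approximation of how many bytes this task touches. */
  const int64_t approximate_bytes_touched = buffer.size() * int64_t(sizeof(float));
  threading::memory_bandwidth_bound_task(approximate_bytes_touched, [&]() {
    threading::parallel_for(buffer.index_range(), 4096, [&](const IndexRange range) {
      for (const int64_t i : range) {
        buffer[i] = float(i);
      }
    });
  });
}
```

Per the description above, a low estimate keeps the full thread count because the touched data likely stays in L3 cache, so the wrapper only restricts threading for large workloads.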
/* SPDX-FileCopyrightText: 2023 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

/** \file
 * \ingroup bli
 *
 * Task parallel range functions.
 */

#include <cstdlib>

#include "MEM_guardedalloc.h"

#include "BLI_array.hh"
#include "BLI_lazy_threading.hh"
#include "BLI_offset_indices.hh"
#include "BLI_task.h"
#include "BLI_task.hh"
#include "BLI_threads.h"
#include "BLI_vector.hh"

#include "atomic_ops.h"

#ifdef WITH_TBB
#  include <tbb/blocked_range.h>
#  include <tbb/enumerable_thread_specific.h>
#  include <tbb/parallel_for.h>
#  include <tbb/parallel_reduce.h>
#endif

#ifdef WITH_TBB

/* Functor for running TBB parallel_for and parallel_reduce. */
struct RangeTask {
  TaskParallelRangeFunc func;
  void *userdata;
  const TaskParallelSettings *settings;

  void *userdata_chunk;

  /* Root constructor. */
  RangeTask(TaskParallelRangeFunc func, void *userdata, const TaskParallelSettings *settings)
      : func(func), userdata(userdata), settings(settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Copy constructor. */
  RangeTask(const RangeTask &other)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  /* Splitting constructor for parallel reduce. */
  RangeTask(RangeTask &other, tbb::split /*unused*/)
      : func(other.func), userdata(other.userdata), settings(other.settings)
  {
    init_chunk(settings->userdata_chunk);
  }

  ~RangeTask()
  {
    if (settings->func_free != nullptr) {
      settings->func_free(userdata, userdata_chunk);
    }
    MEM_SAFE_FREE(userdata_chunk);
  }

  void init_chunk(void *from_chunk)
  {
    if (from_chunk) {
      userdata_chunk = MEM_mallocN(settings->userdata_chunk_size, "RangeTask");
      memcpy(userdata_chunk, from_chunk, settings->userdata_chunk_size);
    }
    else {
      userdata_chunk = nullptr;
    }
  }

  void operator()(const tbb::blocked_range<int> &r) const
  {
    TaskParallelTLS tls;
    tls.userdata_chunk = userdata_chunk;
    for (int i = r.begin(); i != r.end(); ++i) {
      func(userdata, i, &tls);
    }
  }

  void join(const RangeTask &other)
  {
    settings->func_reduce(userdata, userdata_chunk, other.userdata_chunk);
  }
};

#endif

void BLI_task_parallel_range(const int start,
                             const int stop,
                             void *userdata,
                             TaskParallelRangeFunc func,
                             const TaskParallelSettings *settings)
{
#ifdef WITH_TBB
  /* Multithreading. */
  if (settings->use_threading && BLI_task_scheduler_num_threads() > 1) {
    RangeTask task(func, userdata, settings);
    const size_t grainsize = std::max(settings->min_iter_per_thread, 1);
    const tbb::blocked_range<int> range(start, stop, grainsize);

    blender::lazy_threading::send_hint();

    if (settings->func_reduce) {
      parallel_reduce(range, task);
      if (settings->userdata_chunk) {
        memcpy(settings->userdata_chunk, task.userdata_chunk, settings->userdata_chunk_size);
      }
    }
    else {
      parallel_for(range, task);
    }
    return;
  }
#endif

  /* Single threaded. Nothing to reduce as everything is accumulated into the
   * main userdata chunk directly. */
  TaskParallelTLS tls;
  tls.userdata_chunk = settings->userdata_chunk;
  for (int i = start; i < stop; i++) {
    func(userdata, i, &tls);
  }
  if (settings->func_free != nullptr) {
    settings->func_free(userdata, settings->userdata_chunk);
  }
}

int BLI_task_parallel_thread_id(const TaskParallelTLS * /*tls*/)
{
#ifdef WITH_TBB
  /* Get a unique thread ID for texture nodes. In the future we should get rid
   * of the thread ID and change texture evaluation to not require per-thread
   * storage that can't be efficiently allocated on the stack. */
  static tbb::enumerable_thread_specific<int> tbb_thread_id(-1);
  static int tbb_thread_id_counter = 0;

  int &thread_id = tbb_thread_id.local();
  if (thread_id == -1) {
    thread_id = atomic_fetch_and_add_int32(&tbb_thread_id_counter, 1);
    if (thread_id >= BLENDER_MAX_THREADS) {
      BLI_assert_msg(0, "Maximum number of threads exceeded for sculpting");
      thread_id = thread_id % BLENDER_MAX_THREADS;
    }
  }
  return thread_id;
#else
  return 0;
#endif
}

namespace blender::threading::detail {

void parallel_for_impl(const IndexRange range,
                       const int64_t grain_size,
                       const FunctionRef<void(IndexRange)> function)
{
#ifdef WITH_TBB
  /* Invoking tbb for small workloads has a large overhead. */
  if (range.size() >= grain_size) {
    lazy_threading::send_hint();
    tbb::parallel_for(
        tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
        [function](const tbb::blocked_range<int64_t> &subrange) {
          function(IndexRange(subrange.begin(), subrange.size()));
        });
    return;
  }
#else
  UNUSED_VARS(grain_size);
#endif
  function(range);
}

void parallel_for_weighted_impl(
    const IndexRange range,
    const int64_t grain_size,
    const FunctionRef<void(IndexRange)> function,
    const FunctionRef<void(IndexRange, MutableSpan<int64_t>)> task_sizes_fn)
{
  /* Shouldn't be too small, because then there is more overhead when the individual tasks are
   * small. Also shouldn't be too large, because then the serial code to split up tasks causes
   * extra overhead. */
  const int64_t outer_grain_size = std::min<int64_t>(grain_size, 512);
  threading::parallel_for(range, outer_grain_size, [&](const IndexRange sub_range) {
    /* Compute the size of every task in the current range. */
    Array<int64_t, 1024> task_sizes(sub_range.size());
    task_sizes_fn(sub_range, task_sizes);

    /* Split range into multiple segments that have a size that approximates the grain size. */
    Vector<int64_t, 256> offsets_vec;
    offsets_vec.append(0);
    int64_t counter = 0;
    for (const int64_t i : sub_range.index_range()) {
      counter += task_sizes[i];
      if (counter >= grain_size) {
        offsets_vec.append(i + 1);
        counter = 0;
      }
    }
    if (offsets_vec.last() < sub_range.size()) {
      offsets_vec.append(sub_range.size());
    }
    const OffsetIndices<int64_t> offsets = offsets_vec.as_span();

    /* Run the dynamically split tasks in parallel. */
    threading::parallel_for(offsets.index_range(), 1, [&](const IndexRange offsets_range) {
      for (const int64_t i : offsets_range) {
        const IndexRange actual_range = offsets[i].shift(sub_range.start());
        function(actual_range);
      }
    });
  });
}

void memory_bandwidth_bound_task_impl(const FunctionRef<void()> function)
{
#ifdef WITH_TBB
  /* This is the maximum number of threads that may perform these memory bandwidth bound tasks at
   * the same time. Often fewer threads are already enough to use up the full bandwidth capacity.
   * Additional threads usually have a negligible benefit and can even make performance worse.
   *
   * It's better to use fewer threads here so that the CPU cores can do other tasks at the same
   * time which may be more compute intensive. */
  const int num_threads = 8;
  if (num_threads >= BLI_task_scheduler_num_threads()) {
    /* Avoid overhead of using a task arena when it would not have any effect anyway. */
    function();
    return;
  }
  static tbb::task_arena arena{num_threads};

  /* Make sure the lazy threading hints are sent now, because they shouldn't be sent out of an
   * isolated region. */
  lazy_threading::send_hint();
  lazy_threading::ReceiverIsolation isolation;

  arena.execute(function);
#else
  function();
#endif
}

}  // namespace blender::threading::detail
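For context, here is a minimal sketch of how the C-style `BLI_task_parallel_range()` API implemented above is typically driven. The data struct, callback, and grain size are hypothetical; the settings fields used here are the ones referenced in the file above, and `BLI_parallel_range_settings_defaults()` is the usual initializer from `BLI_task.h`.

```cpp
#include "BLI_task.h"

/* Hypothetical userdata shared by all iterations. */
struct DoubleValuesData {
  const float *src;
  float *dst;
};

/* Called once per index; matches TaskParallelRangeFunc. */
static void double_values_fn(void *__restrict userdata,
                             const int i,
                             const TaskParallelTLS *__restrict /*tls*/)
{
  DoubleValuesData *data = static_cast<DoubleValuesData *>(userdata);
  data->dst[i] = data->src[i] * 2.0f;
}

static void double_values(const float *src, float *dst, const int num)
{
  DoubleValuesData data = {src, dst};

  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);
  /* Avoid spawning tasks for tiny ranges; maps to the TBB grain size above. */
  settings.min_iter_per_thread = 1024;

  BLI_task_parallel_range(0, num, &data, double_values_fn, &settings);
}
```

The per-thread `userdata_chunk`/`func_reduce` path exercised by `RangeTask` works the same way, except that each TBB task gets its own copy of the chunk and the copies are merged by the reduce callback, as seen in `RangeTask::join()` above.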