BLI: move tbb part of parallel_for to implementation file

Previously, `tbb::parallel_for` was instantiated every time `threading::parallel_for`
is used. However, when actual parallelism is used, the overhead of a function
call is negligible. Therefore it is possible to move that part out of the header
without causing noticeable performance regressions.

This reduces the size of the Blender binary from 308.2 to 303.5 MB, which is
a reduction of about 1.5%.
This commit is contained in:
Jacques Lucke
2023-05-21 13:31:21 +02:00
parent 57f593b7ca
commit f6d824bca6
2 changed files with 38 additions and 15 deletions

View File

@@ -30,6 +30,7 @@
# endif
#endif
#include "BLI_function_ref.hh"
#include "BLI_index_range.hh"
#include "BLI_lazy_threading.hh"
#include "BLI_utildefines.h"
@@ -48,27 +49,23 @@ void parallel_for_each(Range &&range, const Function &function)
#endif
}
namespace detail {
/**
 * Non-template back end of #parallel_for, defined in the implementation file.
 *
 * The callback is passed as a type-erased #FunctionRef instead of a template parameter, so this
 * function is compiled once rather than once per callable type.
 *
 * \param range: Index range that `function` has to cover.
 * \param grain_size: Forwarded as the grain size of the `tbb::blocked_range` when TBB is used.
 * \param function: Callback invoked with the sub-range it should process.
 */
void parallel_for_impl(IndexRange range,
int64_t grain_size,
FunctionRef<void(IndexRange)> function);
} // namespace detail
/**
 * Invoke `function` on `range`, possibly split into sub-ranges that are processed in parallel.
 *
 * NOTE(review): this hunk is rendered without +/- markers, so the removed and the added version
 * of several statements appear back to back (the two signatures, the two emptiness checks, and
 * the inline `tbb::parallel_for` call next to the new forwarding call). Reading only the lines
 * that look added: an empty range returns immediately, a range with at most `grain_size`
 * elements is handled by one direct call to `function`, and anything larger is forwarded to
 * `detail::parallel_for_impl`. Confirm against the actual header before relying on this.
 */
template<typename Function>
void parallel_for(IndexRange range, int64_t grain_size, const Function &function)
inline void parallel_for(IndexRange range, int64_t grain_size, const Function &function)
{
if (range.size() == 0) {
if (range.is_empty()) {
return;
}
#ifdef WITH_TBB
/* Invoking tbb for small workloads has a large overhead. */
if (range.size() >= grain_size) {
lazy_threading::send_hint();
tbb::parallel_for(
tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
[&](const tbb::blocked_range<int64_t> &subrange) {
function(IndexRange(subrange.begin(), subrange.size()));
});
/* Small ranges are processed by a single direct call, without going through the
 * type-erased implementation. */
if (range.size() <= grain_size) {
function(range);
return;
}
#else
UNUSED_VARS(grain_size);
#endif
function(range);
detail::parallel_for_impl(range, grain_size, function);
}
/**

View File

@@ -14,6 +14,7 @@
#include "BLI_lazy_threading.hh"
#include "BLI_task.h"
#include "BLI_task.hh"
#include "BLI_threads.h"
#include "atomic_ops.h"
@@ -154,3 +155,28 @@ int BLI_task_parallel_thread_id(const TaskParallelTLS * /*tls*/)
return 0;
#endif
}
namespace blender::threading::detail {

/**
 * Out-of-line part of `threading::parallel_for`.
 *
 * With TBB enabled, a range of at least `grain_size` elements is handed to `tbb::parallel_for`,
 * which calls `function` once per sub-range. Smaller ranges, and every range when TBB is
 * disabled, are processed by a single call to `function` on the calling thread.
 *
 * \param range: Indices that have to be covered by `function`.
 * \param grain_size: Grain size passed on to the `tbb::blocked_range`.
 * \param function: Callback that processes one contiguous sub-range.
 */
void parallel_for_impl(const IndexRange range,
                       const int64_t grain_size,
                       const FunctionRef<void(IndexRange)> function)
{
#ifdef WITH_TBB
  /* Invoking tbb for small workloads has a large overhead. */
  if (range.size() < grain_size) {
    function(range);
    return;
  }

  lazy_threading::send_hint();

  const tbb::blocked_range<int64_t> whole_range(
      range.first(), range.one_after_last(), grain_size);
  tbb::parallel_for(whole_range, [function](const tbb::blocked_range<int64_t> &chunk) {
    /* Convert the half-open TBB range back into an #IndexRange (start + size). */
    function(IndexRange(chunk.begin(), chunk.size()));
  });
#else
  UNUSED_VARS(grain_size);
  function(range);
#endif
}

}  // namespace blender::threading::detail