This improves performance by **reducing** the number of threads used for tasks which require a high memory bandwidth. This works because the underlying hardware has a certain maximum memory bandwidth. If that is used up by a few threads already, any additional threads wanting to use a lot of memory will just cause more contention which actually slows things down. By reducing the number of threads that can perform certain tasks, the remaining threads are also not locked up doing work that they can't do efficiently. It's best if there is enough scheduled work so that these threads can do more compute-intensive tasks instead. To use this new functionality, one has to put the parallel code in question into a `threading::memory_bandwidth_bound_task(...)` block. Additionally, one also has to provide a (very) rough approximation of how many bytes are accessed. If the number is low, the number of threads shouldn't be reduced, because it's likely that all touched memory can be in L3 cache, which generally has a much higher bandwidth than main memory. The exact number of threads that are allowed to do bandwidth-bound tasks at the same time is generally highly context- and hardware-dependent. It's also not really possible to measure reliably because it depends on so many static and dynamic factors. The thread count is now hardcoded to 8. It seems that this many threads are easily capable of maxing out the bandwidth capacity. With this technique I can measure surprisingly good performance improvements: * Generating a 3000x3000 grid: 133ms -> 103ms. * Generating a mesh line with 100'000'000 vertices: 212ms -> 189ms. * Realize mesh instances resulting in ~27'000'000 vertices: 460ms -> 305ms. In all of these cases, only 8 instead of 24 threads are used. The remaining threads are idle in these cases, but they could do other work if available. Pull Request: https://projects.blender.org/blender/blender/pulls/118939
101 lines
3.3 KiB
C++
101 lines
3.3 KiB
C++
/* SPDX-FileCopyrightText: 2023 Blender Authors
|
|
*
|
|
* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
|
|
#include "BLI_array_utils.hh"
|
|
#include "BLI_offset_indices.hh"
|
|
#include "BLI_task.hh"
|
|
|
|
namespace blender::offset_indices {
|
|
|
|
OffsetIndices<int> accumulate_counts_to_offsets(MutableSpan<int> counts_to_offsets,
                                                const int start_offset)
{
  /* Convert per-group counts into a running-sum offsets array in place. The extra trailing
   * element receives the total so the span can be viewed as #OffsetIndices afterwards. */
  int accumulated = start_offset;
  for (const int i : counts_to_offsets.index_range().drop_back(1)) {
    const int group_size = counts_to_offsets[i];
    /* Negative counts would corrupt the offsets invariant. */
    BLI_assert(group_size >= 0);
    counts_to_offsets[i] = accumulated;
    accumulated += group_size;
  }
  counts_to_offsets.last() = accumulated;
  return OffsetIndices<int>(counts_to_offsets);
}
|
|
|
|
void fill_constant_group_size(const int size, const int start_offset, MutableSpan<int> offsets)
|
|
{
|
|
threading::memory_bandwidth_bound_task(offsets.size_in_bytes(), [&]() {
|
|
threading::parallel_for(offsets.index_range(), 1024, [&](const IndexRange range) {
|
|
for (const int64_t i : range) {
|
|
offsets[i] = size * i + start_offset;
|
|
}
|
|
});
|
|
});
|
|
}
|
|
|
|
void copy_group_sizes(const OffsetIndices<int> offsets,
|
|
const IndexMask &mask,
|
|
MutableSpan<int> sizes)
|
|
{
|
|
mask.foreach_index_optimized<int64_t>(GrainSize(4096),
|
|
[&](const int64_t i) { sizes[i] = offsets[i].size(); });
|
|
}
|
|
|
|
void gather_group_sizes(const OffsetIndices<int> offsets,
|
|
const IndexMask &mask,
|
|
MutableSpan<int> sizes)
|
|
{
|
|
mask.foreach_index_optimized<int64_t>(GrainSize(4096), [&](const int64_t i, const int64_t pos) {
|
|
sizes[pos] = offsets[i].size();
|
|
});
|
|
}
|
|
|
|
void gather_group_sizes(const OffsetIndices<int> offsets,
|
|
const Span<int> indices,
|
|
MutableSpan<int> sizes)
|
|
{
|
|
threading::memory_bandwidth_bound_task(
|
|
sizes.size_in_bytes() + offsets.data().size_in_bytes() + indices.size_in_bytes(), [&]() {
|
|
threading::parallel_for(indices.index_range(), 4096, [&](const IndexRange range) {
|
|
for (const int i : range) {
|
|
sizes[i] = offsets[indices[i]].size();
|
|
}
|
|
});
|
|
});
|
|
}
|
|
|
|
OffsetIndices<int> gather_selected_offsets(const OffsetIndices<int> src_offsets,
                                           const IndexMask &selection,
                                           const int start_offset,
                                           MutableSpan<int> dst_offsets)
{
  /* Build a new offsets array containing only the selected groups, preserving their sizes.
   * Returns empty offsets when nothing is selected. */
  if (selection.is_empty()) {
    return {};
  }
  int accumulated = start_offset;
  selection.foreach_index_optimized<int>([&](const int index, const int pos) {
    dst_offsets[pos] = accumulated;
    accumulated += src_offsets[index].size();
  });
  /* The trailing element stores the total size of all selected groups. */
  dst_offsets.last() = accumulated;
  return OffsetIndices<int>(dst_offsets);
}
|
|
|
|
void build_reverse_map(OffsetIndices<int> offsets, MutableSpan<int> r_map)
{
  /* For every group, write the group index into each of the map slots it covers, so that
   * `r_map[element]` afterwards yields the group containing that element. */
  threading::parallel_for(offsets.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t group : range) {
      r_map.slice(offsets[group]).fill(group);
    }
  });
}
|
|
|
|
void build_reverse_offsets(const Span<int> indices, MutableSpan<int> offsets)
|
|
{
|
|
BLI_assert(std::all_of(offsets.begin(), offsets.end(), [](int value) { return value == 0; }));
|
|
array_utils::count_indices(indices, offsets);
|
|
offset_indices::accumulate_counts_to_offsets(offsets);
|
|
}
|
|
|
|
} // namespace blender::offset_indices
|