BLI: speed up memory bandwidth bound tasks by reducing threading

This improves performance by **reducing** the number of threads used for tasks
that require high memory bandwidth.

This works because the underlying hardware has a fixed maximum memory
bandwidth. If a few threads already saturate it, any additional threads that
also want to access a lot of memory only cause more contention, which actually
slows things down. Limiting the number of threads that may perform such tasks
also keeps the remaining threads from being tied up doing work that they can't
do efficiently. Ideally, there is enough other scheduled work so that those
threads can run more compute-intensive tasks instead.

To use the new functionality, wrap the parallel code in question in a
`threading::memory_bandwidth_bound_task(...)` block and pass a (very) rough
approximation of how many bytes are accessed. If that number is low, the
number of threads is not reduced, because all of the touched memory likely
fits into the L3 cache, which generally has a much higher bandwidth than main
memory.
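
For illustration, here is a minimal sketch of the intended usage pattern. The
`fill_buffer` helper and its arguments are hypothetical; only the
`threading::memory_bandwidth_bound_task` and `threading::parallel_for` calls
are the API used by this commit:

```cpp
#include "BLI_span.hh"
#include "BLI_task.hh"

namespace blender {

/* Hypothetical helper: fill a large buffer with a constant value. The inner
 * parallel loop is unchanged; the wrapper only limits how many threads may
 * participate when the amount of touched memory is large. */
static void fill_buffer(MutableSpan<float> buffer, const float value)
{
  threading::memory_bandwidth_bound_task(buffer.size_in_bytes(), [&]() {
    threading::parallel_for(buffer.index_range(), 4096, [&](const IndexRange range) {
      for (const int64_t i : range) {
        buffer[i] = value;
      }
    });
  });
}

}  // namespace blender
```

Below the byte threshold (8 MiB in this patch), the wrapper simply calls the
function directly; above it, the function runs with a reduced thread count.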

The exact number of threads that should be allowed to run bandwidth bound
tasks at the same time is highly context- and hardware-dependent. It also
can't really be measured reliably, because it depends on many static and
dynamic factors. For now, the thread count is hardcoded to 8; that many
threads appear to be easily capable of saturating the available bandwidth.
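
Conceptually, the cap works by executing the parallel code inside a TBB task
arena with limited concurrency (see `memory_bandwidth_bound_task_impl` in the
diff below). As a rough standalone sketch of that mechanism, not Blender code
and with an arbitrary fill workload:

```cpp
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

#include <vector>

int main()
{
  std::vector<float> data(100'000'000);

  /* At most 8 worker threads may join the parallel loop below, even on a
   * machine with more cores; the remaining cores stay free for other work. */
  tbb::task_arena arena(8);
  arena.execute([&]() {
    tbb::parallel_for(size_t(0), data.size(), [&](const size_t i) { data[i] = 1.0f; });
  });
}
```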

With this technique I measured surprisingly good performance improvements:
* Generating a 3000x3000 grid: 133ms -> 103ms.
* Generating a mesh line with 100'000'000 vertices: 212ms -> 189ms.
* Realizing mesh instances resulting in ~27'000'000 vertices: 460ms -> 305ms.

In all of these cases, only 8 of the 24 threads are used. The remaining
threads are idle here, but they could pick up other work if any were
available.

Pull Request: https://projects.blender.org/blender/blender/pulls/118939
Author: Jacques Lucke
Date: 2024-03-19 18:23:56 +01:00
Commit: b99c1abc3a (parent: c12ac94520)
6 changed files with 196 additions and 100 deletions


@@ -73,6 +73,7 @@ void parallel_for_weighted_impl(IndexRange range,
int64_t grain_size,
FunctionRef<void(IndexRange)> function,
FunctionRef<void(IndexRange, MutableSpan<int64_t>)> task_sizes_fn);
void memory_bandwidth_bound_task_impl(FunctionRef<void()> function);
} // namespace detail
template<typename Function>
@@ -247,4 +248,26 @@ template<typename Function> inline void isolate_task(const Function &function)
#endif
}
/**
* Should surround parallel code that is highly bandwidth intensive, e.g. code that just fills a
* buffer with no or only a few additional operations. If the buffers are large, it's beneficial to
* limit the number of threads doing the work, because beyond a certain point additional threads
* just create more contention on the hardware level without a notable performance benefit.
*/
template<typename Function>
inline void memory_bandwidth_bound_task(const int64_t approximate_bytes_touched,
const Function &function)
{
/* Don't limit threading when all touched memory can stay in the CPU cache, because a much higher
* memory bandwidth is available there compared to accessing RAM. The threshold is supposed to be
* on the order of the L3 cache size. Querying the actual cache size is not quite straightforward,
* and even if it were, it's not clear that using the exact size would be beneficial because there
* is often other work going on on the CPU at the same time. */
if (approximate_bytes_touched <= 8 * 1024 * 1024) {
function();
return;
}
detail::memory_bandwidth_bound_task_impl(function);
}
} // namespace blender::threading


@@ -24,10 +24,12 @@ OffsetIndices<int> accumulate_counts_to_offsets(MutableSpan<int> counts_to_offse
void fill_constant_group_size(const int size, const int start_offset, MutableSpan<int> offsets)
{
threading::parallel_for(offsets.index_range(), 1024, [&](const IndexRange range) {
for (const int64_t i : range) {
offsets[i] = size * i + start_offset;
}
threading::memory_bandwidth_bound_task(offsets.size_in_bytes(), [&]() {
threading::parallel_for(offsets.index_range(), 1024, [&](const IndexRange range) {
for (const int64_t i : range) {
offsets[i] = size * i + start_offset;
}
});
});
}
@@ -52,11 +54,14 @@ void gather_group_sizes(const OffsetIndices<int> offsets,
const Span<int> indices,
MutableSpan<int> sizes)
{
threading::parallel_for(indices.index_range(), 4096, [&](const IndexRange range) {
for (const int i : range) {
sizes[i] = offsets[indices[i]].size();
}
});
threading::memory_bandwidth_bound_task(
sizes.size_in_bytes() + offsets.data().size_in_bytes() + indices.size_in_bytes(), [&]() {
threading::parallel_for(indices.index_range(), 4096, [&](const IndexRange range) {
for (const int i : range) {
sizes[i] = offsets[indices[i]].size();
}
});
});
}
OffsetIndices<int> gather_selected_offsets(const OffsetIndices<int> src_offsets,


@@ -223,4 +223,32 @@ void parallel_for_weighted_impl(
});
}
void memory_bandwidth_bound_task_impl(const FunctionRef<void()> function)
{
#ifdef WITH_TBB
/* This is the maximum number of threads that may perform these memory bandwidth bound tasks at
* the same time. Often fewer threads are already enough to use up the full bandwidth capacity.
* Additional threads usually have a negligible benefit and can even make performance worse.
*
* It's better to use fewer threads here so that the CPU cores can do other tasks at the same
* time which may be more compute intensive. */
const int num_threads = 8;
if (num_threads >= BLI_task_scheduler_num_threads()) {
/* Avoid overhead of using a task arena when it would not have any effect anyway. */
function();
return;
}
static tbb::task_arena arena{num_threads};
/* Make sure the lazy threading hints are sent now, because they shouldn't be sent out of an
* isolated region. */
lazy_threading::send_hint();
lazy_threading::ReceiverIsolation isolation;
arena.execute(function);
#else
function();
#endif
}
} // namespace blender::threading::detail


@@ -22,13 +22,17 @@ static void calculate_uvs(Mesh *mesh,
const float dx = (size_x == 0.0f) ? 0.0f : 1.0f / size_x;
const float dy = (size_y == 0.0f) ? 0.0f : 1.0f / size_y;
threading::parallel_for(corner_verts.index_range(), 1024, [&](IndexRange range) {
for (const int i : range) {
const float3 &co = positions[corner_verts[i]];
uv_attribute.span[i].x = (co.x + size_x * 0.5f) * dx;
uv_attribute.span[i].y = (co.y + size_y * 0.5f) * dy;
}
});
threading::memory_bandwidth_bound_task(
uv_attribute.span.size_in_bytes() + positions.size_in_bytes() + corner_verts.size_in_bytes(),
[&]() {
threading::parallel_for(corner_verts.index_range(), 1024, [&](IndexRange range) {
for (const int i : range) {
const float3 &co = positions[corner_verts[i]];
uv_attribute.span[i].x = (co.x + size_x * 0.5f) * dx;
uv_attribute.span[i].y = (co.y + size_y * 0.5f) * dy;
}
});
});
uv_attribute.finish();
}
@@ -59,18 +63,20 @@ Mesh *create_grid_mesh(const int verts_x,
const float dy = edges_y == 0 ? 0.0f : size_y / edges_y;
const float x_shift = edges_x / 2.0f;
const float y_shift = edges_y / 2.0f;
threading::parallel_for(IndexRange(verts_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int y_offset = x * verts_y;
threading::parallel_for(IndexRange(verts_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int vert_index = y_offset + y;
positions[vert_index].x = (x - x_shift) * dx;
positions[vert_index].y = (y - y_shift) * dy;
positions[vert_index].z = 0.0f;
}
});
}
threading::memory_bandwidth_bound_task(positions.size_in_bytes(), [&]() {
threading::parallel_for(IndexRange(verts_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int y_offset = x * verts_y;
threading::parallel_for(IndexRange(verts_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int vert_index = y_offset + y;
positions[vert_index].x = (x - x_shift) * dx;
positions[vert_index].y = (y - y_shift) * dy;
positions[vert_index].z = 0.0f;
}
});
}
});
});
}
@@ -78,56 +84,63 @@ Mesh *create_grid_mesh(const int verts_x,
const int x_edges_start = verts_x * edges_y;
/* Build the horizontal edges in the X direction. */
threading::parallel_for(IndexRange(verts_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int y_vert_offset = x * verts_y;
const int y_edge_offset = y_edges_start + x * edges_y;
threading::parallel_for(IndexRange(edges_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int vert_index = y_vert_offset + y;
edges[y_edge_offset + y] = int2(vert_index, vert_index + 1);
}
});
}
threading::memory_bandwidth_bound_task(edges.size_in_bytes(), [&]() {
threading::parallel_for(IndexRange(verts_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int y_vert_offset = x * verts_y;
const int y_edge_offset = y_edges_start + x * edges_y;
threading::parallel_for(IndexRange(edges_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int vert_index = y_vert_offset + y;
edges[y_edge_offset + y] = int2(vert_index, vert_index + 1);
}
});
}
});
});
/* Build the vertical edges in the Y direction. */
threading::parallel_for(IndexRange(verts_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int x_edge_offset = x_edges_start + y * edges_x;
threading::parallel_for(IndexRange(edges_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int vert_index = x * verts_y + y;
edges[x_edge_offset + x] = int2(vert_index, vert_index + verts_y);
}
});
}
threading::memory_bandwidth_bound_task(edges.size_in_bytes(), [&]() {
threading::parallel_for(IndexRange(verts_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int x_edge_offset = x_edges_start + y * edges_x;
threading::parallel_for(IndexRange(edges_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int vert_index = x * verts_y + y;
edges[x_edge_offset + x] = int2(vert_index, vert_index + verts_y);
}
});
}
});
});
threading::parallel_for(IndexRange(edges_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int y_offset = x * edges_y;
threading::parallel_for(IndexRange(edges_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int face_index = y_offset + y;
const int loop_index = face_index * 4;
const int vert_index = x * verts_y + y;
threading::memory_bandwidth_bound_task(
corner_edges.size_in_bytes() + corner_verts.size_in_bytes(), [&]() {
threading::parallel_for(IndexRange(edges_x), 512, [&](IndexRange x_range) {
for (const int x : x_range) {
const int y_offset = x * edges_y;
threading::parallel_for(IndexRange(edges_y), 512, [&](IndexRange y_range) {
for (const int y : y_range) {
const int face_index = y_offset + y;
const int loop_index = face_index * 4;
const int vert_index = x * verts_y + y;
corner_verts[loop_index] = vert_index;
corner_edges[loop_index] = x_edges_start + edges_x * y + x;
corner_verts[loop_index] = vert_index;
corner_edges[loop_index] = x_edges_start + edges_x * y + x;
corner_verts[loop_index + 1] = vert_index + verts_y;
corner_edges[loop_index + 1] = y_edges_start + edges_y * (x + 1) + y;
corner_verts[loop_index + 1] = vert_index + verts_y;
corner_edges[loop_index + 1] = y_edges_start + edges_y * (x + 1) + y;
corner_verts[loop_index + 2] = vert_index + verts_y + 1;
corner_edges[loop_index + 2] = x_edges_start + edges_x * (y + 1) + x;
corner_verts[loop_index + 2] = vert_index + verts_y + 1;
corner_edges[loop_index + 2] = x_edges_start + edges_x * (y + 1) + x;
corner_verts[loop_index + 3] = vert_index + 1;
corner_edges[loop_index + 3] = y_edges_start + edges_y * x + y;
}
corner_verts[loop_index + 3] = vert_index + 1;
corner_edges[loop_index + 3] = y_edges_start + edges_y * x + y;
}
});
}
});
});
}
});
if (uv_map_id && mesh->faces_num != 0) {
calculate_uvs(mesh, positions, corner_verts, size_x, size_y, uv_map_id);


@@ -20,23 +20,25 @@ Mesh *create_line_mesh(const float3 start, const float3 delta, const int count)
MutableSpan<float3> positions = mesh->vert_positions_for_write();
MutableSpan<int2> edges = mesh->edges_for_write();
threading::parallel_invoke(
1024 < count,
[&]() {
threading::parallel_for(positions.index_range(), 4096, [&](IndexRange range) {
for (const int i : range) {
positions[i] = start + delta * i;
}
threading::memory_bandwidth_bound_task(positions.size_in_bytes() + edges.size_in_bytes(), [&]() {
threading::parallel_invoke(
1024 < count,
[&]() {
threading::parallel_for(positions.index_range(), 4096, [&](IndexRange range) {
for (const int i : range) {
positions[i] = start + delta * i;
}
});
},
[&]() {
threading::parallel_for(edges.index_range(), 4096, [&](IndexRange range) {
for (const int i : range) {
edges[i][0] = i;
edges[i][1] = i + 1;
}
});
});
},
[&]() {
threading::parallel_for(edges.index_range(), 4096, [&](IndexRange range) {
for (const int i : range) {
edges[i][0] = i;
edges[i][1] = i + 1;
}
});
});
});
mesh->tag_loose_verts_none();
mesh->tag_overlapping_none();


@@ -286,6 +286,24 @@ struct InstanceContext {
}
};
static int64_t get_final_points_num(const GatherTasks &tasks)
{
int64_t points_num = 0;
if (!tasks.pointcloud_tasks.is_empty()) {
const RealizePointCloudTask &task = tasks.pointcloud_tasks.last();
points_num += task.start_index + task.pointcloud_info->pointcloud->totpoint;
}
if (!tasks.mesh_tasks.is_empty()) {
const RealizeMeshTask &task = tasks.mesh_tasks.last();
points_num += task.start_indices.vertex + task.mesh_info->mesh->verts_num;
}
if (!tasks.curve_tasks.is_empty()) {
const RealizeCurveTask &task = tasks.curve_tasks.last();
points_num += task.start_indices.point + task.curve_info->curves->geometry.point_num;
}
return points_num;
}
static void copy_transformed_positions(const Span<float3> src,
const float4x4 &transform,
MutableSpan<float3> dst)
@@ -1612,22 +1630,29 @@ bke::GeometrySet realize_instances(bke::GeometrySet geometry_set,
gather_realize_tasks_recursive(gather_info, geometry_set, transform, attribute_fallbacks);
bke::GeometrySet new_geometry_set;
execute_realize_pointcloud_tasks(options,
all_pointclouds_info,
gather_info.r_tasks.pointcloud_tasks,
all_pointclouds_info.attributes,
new_geometry_set);
execute_realize_mesh_tasks(options,
all_meshes_info,
gather_info.r_tasks.mesh_tasks,
all_meshes_info.attributes,
all_meshes_info.materials,
new_geometry_set);
execute_realize_curve_tasks(options,
all_curves_info,
gather_info.r_tasks.curve_tasks,
all_curves_info.attributes,
new_geometry_set);
const int64_t total_points_num = get_final_points_num(gather_info.r_tasks);
/* This doesn't have to be exact at all, it's just a rough estimate to make decisions about
* multi-threading (overhead). */
const int64_t approximate_used_bytes_num = total_points_num * 32;
threading::memory_bandwidth_bound_task(approximate_used_bytes_num, [&]() {
execute_realize_pointcloud_tasks(options,
all_pointclouds_info,
gather_info.r_tasks.pointcloud_tasks,
all_pointclouds_info.attributes,
new_geometry_set);
execute_realize_mesh_tasks(options,
all_meshes_info,
gather_info.r_tasks.mesh_tasks,
all_meshes_info.attributes,
all_meshes_info.materials,
new_geometry_set);
execute_realize_curve_tasks(options,
all_curves_info,
gather_info.r_tasks.curve_tasks,
all_curves_info.attributes,
new_geometry_set);
});
if (gather_info.r_tasks.first_volume) {
new_geometry_set.add(*gather_info.r_tasks.first_volume);