From ca60419b3a0b5634147a1bc2e1b0523b68d172e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=B8=D0=BB=D1=8C=D1=8F=20=5F?= Date: Tue, 19 Aug 2025 03:46:03 +0200 Subject: [PATCH] Mesh: Avoid mutex lock in hot loop for custom normals Use a thread local buffer to gather info about the custom normal space for each corner group in threading to concatenate them later. This avoids the need to lock a mutex to write into a buffer shared between threads in the corner groups traversal hot loop. The performance improvement depends on the mesh size and the number of CPU threads. In some files there might be no change, in other files an improvement of over 2x was observed, mostly because we can now remove the compromise from c8a4026984df4de1050eaa2a228629f93ec4a07b. Pull Request: https://projects.blender.org/blender/blender/pulls/144660 --- source/blender/blenkernel/BKE_mesh.hh | 9 +- .../blender/blenkernel/intern/mesh_normals.cc | 107 ++++++++++++------ 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/source/blender/blenkernel/BKE_mesh.hh b/source/blender/blenkernel/BKE_mesh.hh index cd838d032ca..b55daeb7446 100644 --- a/source/blender/blenkernel/BKE_mesh.hh +++ b/source/blender/blenkernel/BKE_mesh.hh @@ -149,15 +149,10 @@ struct CornerNormalSpace { /** * Storage for corner fan coordinate spaces for an entire mesh. + * For performance reasons the distribution of #spaces and index mapping of them in + * #corner_space_indices are non-deterministic. */ struct CornerNormalSpaceArray { - /** - * Results are added from multiple threads. The lock is an easy way to parallelize adding results - * for each corner fan. This method means the order of spaces in the `spaces` vector and - * `corners_by_face` is non-deterministic. That shouldn't affect the final output for the user - * though. - */ - Mutex build_mutex; /** * The normal coordinate spaces, potentially shared between multiple face corners in a smooth fan * connected to a vertex (and not per face corner). 
Depending on the mesh (the amount of sharing diff --git a/source/blender/blenkernel/intern/mesh_normals.cc b/source/blender/blenkernel/intern/mesh_normals.cc index 6d68534a0dd..1207c465135 100644 --- a/source/blender/blenkernel/intern/mesh_normals.cc +++ b/source/blender/blenkernel/intern/mesh_normals.cc @@ -17,6 +17,7 @@ #include "BLI_array_utils.hh" #include "BLI_bit_vector.hh" +#include "BLI_enumerable_thread_specific.hh" #include "BLI_linklist.h" #include "BLI_math_base.hh" #include "BLI_math_vector.hh" @@ -1177,6 +1178,12 @@ static float3 accumulate_fan_normal(const Span corner_infos, return math::normalize(fan_normal); } +struct CornerSpaceGroup { + /* Possibly acyclic and unordered set of adjacent corners in the same smooth group around a vertex. */ + Array fan_corners; + CornerNormalSpace space; +}; + /** Don't inline this function to simplify the code path without custom normals. */ BLI_NOINLINE static void handle_fan_result_and_custom_normals( const Span custom_normals, @@ -1184,7 +1191,8 @@ BLI_NOINLINE static void handle_fan_result_and_custom_normals( const Span edge_dirs, const Span local_corners_in_fan, float3 &fan_normal, - CornerNormalSpaceArray *r_fan_spaces) + CornerNormalSpaceArray *r_fan_spaces, + Vector &r_local_space_groups) { const int local_edge_first = corner_infos[local_corners_in_fan.first()].local_edge_next; const int local_edge_last = corner_infos[local_corners_in_fan.last()].local_edge_prev; @@ -1214,23 +1222,16 @@ BLI_NOINLINE static void handle_fan_result_and_custom_normals( fan_normal = corner_space_custom_data_to_normal(fan_space, short2(average_custom_normal)); } - if (r_fan_spaces) { - std::lock_guard lock(r_fan_spaces->build_mutex); - r_fan_spaces->spaces.append(fan_space); - const int fan_space_index = r_fan_spaces->spaces.size() - 1; - for (const int local_corner : local_corners_in_fan) { - const VertCornerInfo &info = corner_infos[local_corner]; - r_fan_spaces->corner_space_indices[info.corner] = fan_space_index; - } - if 
(r_fan_spaces->create_corners_by_space) { - Array corners_in_space(local_corners_in_fan.size()); - for (const int i : local_corners_in_fan.index_range()) { - const VertCornerInfo &info = corner_infos[local_corners_in_fan[i]]; - corners_in_space[i] = info.corner; - } - r_fan_spaces->corners_by_space.append(std::move(corners_in_space)); - } + if (!r_fan_spaces) { + return; } + + Array fan_corners(local_corners_in_fan.size()); + for (const int i : local_corners_in_fan.index_range()) { + const VertCornerInfo &info = corner_infos[local_corners_in_fan[i]]; + fan_corners[i] = info.corner; + } + r_local_space_groups.append({std::move(fan_corners), fan_space}); } void normals_calc_corners(const Span vert_positions, @@ -1245,28 +1246,19 @@ void normals_calc_corners(const Span vert_positions, CornerNormalSpaceArray *r_fan_spaces, MutableSpan r_corner_normals) { - if (r_fan_spaces) { - /* These are potentially-wasteful over-allocations. */ - r_fan_spaces->spaces.reserve(corner_verts.size()); - r_fan_spaces->corner_space_indices.reinitialize(corner_verts.size()); - if (r_fan_spaces->create_corners_by_space) { - r_fan_spaces->corners_by_space.reserve(corner_verts.size()); - } - } + threading::EnumerableThreadSpecific> space_groups; - int64_t grain_size = 256; - /* Decrease parallelism in case where lock is used to avoid contention. */ - if (!custom_normals.is_empty() || r_fan_spaces) { - grain_size = std::max(int64_t(16384), vert_positions.size() / 2); - } - - threading::parallel_for(vert_positions.index_range(), grain_size, [&](const IndexRange range) { + threading::parallel_for(vert_positions.index_range(), 256, [&](const IndexRange range) { Vector corner_infos; LocalEdgeVectorSet local_edge_by_vert; Vector edge_infos; Vector edge_dirs; Vector local_corner_visited; Vector corners_in_fan; + + Vector *local_space_groups = r_fan_spaces ? 
&space_groups.local() : + nullptr; + for (const int vert : range) { const float3 vert_position = vert_positions[vert]; const Span vert_faces = vert_to_face_map[vert]; @@ -1308,8 +1300,13 @@ void normals_calc_corners(const Span vert_positions, corner_infos, edge_dirs, face_normals, corners_in_fan); if (!custom_normals.is_empty() || r_fan_spaces) { - handle_fan_result_and_custom_normals( - custom_normals, corner_infos, edge_dirs, corners_in_fan, fan_normal, r_fan_spaces); + handle_fan_result_and_custom_normals(custom_normals, + corner_infos, + edge_dirs, + corners_in_fan, + fan_normal, + r_fan_spaces, + *local_space_groups); } for (const int local_corner : corners_in_fan) { @@ -1333,6 +1330,48 @@ void normals_calc_corners(const Span vert_positions, BLI_assert(visited_count == corner_infos.size()); } }); + + if (!r_fan_spaces) { + return; + } + + Vector space_groups_count; + Vector> all_space_groups; + for (auto &groups : space_groups) { + space_groups_count.append(groups.size()); + all_space_groups.append(std::move(groups)); + } + space_groups_count.append(0); + const OffsetIndices space_offsets = offset_indices::accumulate_counts_to_offsets( + space_groups_count); + + r_fan_spaces->spaces.reinitialize(space_offsets.total_size()); + r_fan_spaces->corner_space_indices.reinitialize(corner_verts.size()); + if (r_fan_spaces->create_corners_by_space) { + r_fan_spaces->corners_by_space.reinitialize(space_offsets.total_size()); + } + + const int64_t mean_size = space_offsets.total_size() / space_offsets.size(); + const int64_t grain_size = math::clamp((1024 * 512) / mean_size, 256, 1024 * 16); + threading::parallel_for(all_space_groups.index_range(), grain_size, [&](const IndexRange range) { + for (const int thread_i : range) { + Vector &local_space_groups = all_space_groups[thread_i]; + for (const int group_i : local_space_groups.index_range()) { + const int space_index = space_offsets[thread_i][group_i]; + r_fan_spaces->spaces[space_index] = 
local_space_groups[group_i].space; + r_fan_spaces->corner_space_indices.as_mutable_span().fill_indices( + local_space_groups[group_i].fan_corners.as_span(), space_index); + } + if (!r_fan_spaces->create_corners_by_space) { + continue; + } + for (const int group_i : local_space_groups.index_range()) { + const int space_index = space_offsets[thread_i][group_i]; + r_fan_spaces->corners_by_space[space_index] = std::move( + local_space_groups[group_i].fan_corners); + } + } + }); } #undef INDEX_UNSET