From ca60419b3a0b5634147a1bc2e1b0523b68d172e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=B8=D0=BB=D1=8C=D1=8F=20=5F?= Date: Tue, 19 Aug 2025 03:46:03 +0200 Subject: [PATCH] Mesh: Avoid mutex lock in hot loop for custom normals Use a thread local buffer to gather info about the custom normal space for each corner group in threading to concatenate them later. This avoids the need to lock a mutex to write into a buffer shared between threads in the corner groups traversal hot loop. The performance improvement depends on the mesh size and the number of CPU threads. In some files there might be no change, in other files an improvement of over 2x was observed, mostly because we can now remove the compromise from c8a4026984df4de1050eaa2a228629f93ec4a07b. Pull Request: https://projects.blender.org/blender/blender/pulls/144660 --- source/blender/blenkernel/BKE_mesh.hh | 9 +- .../blender/blenkernel/intern/mesh_normals.cc | 107 ++++++++++++------ 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/source/blender/blenkernel/BKE_mesh.hh b/source/blender/blenkernel/BKE_mesh.hh index cd838d032ca..b55daeb7446 100644 --- a/source/blender/blenkernel/BKE_mesh.hh +++ b/source/blender/blenkernel/BKE_mesh.hh @@ -149,15 +149,10 @@ struct CornerNormalSpace { /** * Storage for corner fan coordinate spaces for an entire mesh. + * For performance reasons the distribution of #spaces and index mapping of them in + * #corner_space_indices are non-deterministic. */ struct CornerNormalSpaceArray { - /** - * Results are added from multiple threads. The lock is an easy way to parallelize adding results - * for each corner fan. This method means the order of spaces in the `spaces` vector and - * `corners_by_face` is non-deterministic. That shouldn't affect the final output for the user - * though. - */ - Mutex build_mutex; /** * The normal coordinate spaces, potentially shared between multiple face corners in a smooth fan * connected to a vertex (and not per face corner). 
Depending on the mesh (the amount of sharing diff --git a/source/blender/blenkernel/intern/mesh_normals.cc b/source/blender/blenkernel/intern/mesh_normals.cc index 6d68534a0dd..1207c465135 100644 --- a/source/blender/blenkernel/intern/mesh_normals.cc +++ b/source/blender/blenkernel/intern/mesh_normals.cc @@ -17,6 +17,7 @@ #include "BLI_array_utils.hh" #include "BLI_bit_vector.hh" +#include "BLI_enumerable_thread_specific.hh" #include "BLI_linklist.h" #include "BLI_math_base.hh" #include "BLI_math_vector.hh" @@ -1177,6 +1178,12 @@ static float3 accumulate_fan_normal(const Span corner_infos, return math::normalize(fan_normal); } +struct CornerSpaceGroup { + /* Possibly acyclic and unordered set of adjacent corners in the same smooth group around a vertex. */ + Array fan_corners; + CornerNormalSpace space; +}; + /** Don't inline this function to simplify the code path without custom normals. */ BLI_NOINLINE static void handle_fan_result_and_custom_normals( const Span custom_normals, @@ -1184,7 +1191,8 @@ BLI_NOINLINE static void handle_fan_result_and_custom_normals( const Span edge_dirs, const Span local_corners_in_fan, float3 &fan_normal, - CornerNormalSpaceArray *r_fan_spaces) + CornerNormalSpaceArray *r_fan_spaces, + Vector &r_local_space_groups) { const int local_edge_first = corner_infos[local_corners_in_fan.first()].local_edge_next; const int local_edge_last = corner_infos[local_corners_in_fan.last()].local_edge_prev; @@ -1214,23 +1222,16 @@ BLI_NOINLINE static void handle_fan_result_and_custom_normals( fan_normal = corner_space_custom_data_to_normal(fan_space, short2(average_custom_normal)); } - if (r_fan_spaces) { - std::lock_guard lock(r_fan_spaces->build_mutex); - r_fan_spaces->spaces.append(fan_space); - const int fan_space_index = r_fan_spaces->spaces.size() - 1; - for (const int local_corner : local_corners_in_fan) { - const VertCornerInfo &info = corner_infos[local_corner]; - r_fan_spaces->corner_space_indices[info.corner] = fan_space_index; - } - if 
(r_fan_spaces->create_corners_by_space) { - Array corners_in_space(local_corners_in_fan.size()); - for (const int i : local_corners_in_fan.index_range()) { - const VertCornerInfo &info = corner_infos[local_corners_in_fan[i]]; - corners_in_space[i] = info.corner; - } - r_fan_spaces->corners_by_space.append(std::move(corners_in_space)); - } + if (!r_fan_spaces) { + return; } + + Array fan_corners(local_corners_in_fan.size()); + for (const int i : local_corners_in_fan.index_range()) { + const VertCornerInfo &info = corner_infos[local_corners_in_fan[i]]; + fan_corners[i] = info.corner; + } + r_local_space_groups.append({std::move(fan_corners), fan_space}); } void normals_calc_corners(const Span vert_positions, @@ -1245,28 +1246,19 @@ void normals_calc_corners(const Span vert_positions, CornerNormalSpaceArray *r_fan_spaces, MutableSpan r_corner_normals) { - if (r_fan_spaces) { - /* These are potentially-wasteful over-allocations. */ - r_fan_spaces->spaces.reserve(corner_verts.size()); - r_fan_spaces->corner_space_indices.reinitialize(corner_verts.size()); - if (r_fan_spaces->create_corners_by_space) { - r_fan_spaces->corners_by_space.reserve(corner_verts.size()); - } - } + threading::EnumerableThreadSpecific> space_groups; - int64_t grain_size = 256; - /* Decrease parallelism in case where lock is used to avoid contention. */ - if (!custom_normals.is_empty() || r_fan_spaces) { - grain_size = std::max(int64_t(16384), vert_positions.size() / 2); - } - - threading::parallel_for(vert_positions.index_range(), grain_size, [&](const IndexRange range) { + threading::parallel_for(vert_positions.index_range(), 256, [&](const IndexRange range) { Vector corner_infos; LocalEdgeVectorSet local_edge_by_vert; Vector edge_infos; Vector edge_dirs; Vector local_corner_visited; Vector corners_in_fan; + + Vector *local_space_groups = r_fan_spaces ? 
&space_groups.local() : + nullptr; + for (const int vert : range) { const float3 vert_position = vert_positions[vert]; const Span vert_faces = vert_to_face_map[vert]; @@ -1308,8 +1300,13 @@ void normals_calc_corners(const Span vert_positions, corner_infos, edge_dirs, face_normals, corners_in_fan); if (!custom_normals.is_empty() || r_fan_spaces) { - handle_fan_result_and_custom_normals( - custom_normals, corner_infos, edge_dirs, corners_in_fan, fan_normal, r_fan_spaces); + handle_fan_result_and_custom_normals(custom_normals, + corner_infos, + edge_dirs, + corners_in_fan, + fan_normal, + r_fan_spaces, + *local_space_groups); } for (const int local_corner : corners_in_fan) { @@ -1333,6 +1330,48 @@ void normals_calc_corners(const Span vert_positions, BLI_assert(visited_count == corner_infos.size()); } }); + + if (!r_fan_spaces) { + return; + } + + Vector space_groups_count; + Vector> all_space_groups; + for (auto &groups : space_groups) { + space_groups_count.append(groups.size()); + all_space_groups.append(std::move(groups)); + } + space_groups_count.append(0); + const OffsetIndices space_offsets = offset_indices::accumulate_counts_to_offsets( + space_groups_count); + + r_fan_spaces->spaces.reinitialize(space_offsets.total_size()); + r_fan_spaces->corner_space_indices.reinitialize(corner_verts.size()); + if (r_fan_spaces->create_corners_by_space) { + r_fan_spaces->corners_by_space.reinitialize(space_offsets.total_size()); + } + + const int64_t mean_size = space_offsets.total_size() / space_offsets.size(); + const int64_t grain_size = math::clamp((1024 * 512) / mean_size, 256, 1024 * 16); + threading::parallel_for(all_space_groups.index_range(), grain_size, [&](const IndexRange range) { + for (const int thread_i : range) { + Vector &local_space_groups = all_space_groups[thread_i]; + for (const int group_i : local_space_groups.index_range()) { + const int space_index = space_offsets[thread_i][group_i]; + r_fan_spaces->spaces[space_index] = 
local_space_groups[group_i].space; + r_fan_spaces->corner_space_indices.as_mutable_span().fill_indices( + local_space_groups[group_i].fan_corners.as_span(), space_index); + } + if (!r_fan_spaces->create_corners_by_space) { + continue; + } + for (const int group_i : local_space_groups.index_range()) { + const int space_index = space_offsets[thread_i][group_i]; + r_fan_spaces->corners_by_space[space_index] = std::move( + local_space_groups[group_i].fan_corners); + } + } + }); } #undef INDEX_UNSET