Mesh: Parallelize vertex and edge to corner topology map creation

Change the algorithm to make better use of multiple CPU cores. First, offsets are created by counting the number of elements using each vertex. Those offsets are used during the next phase, which adds indices to each group in parallel. Atomic increments are used to add elements to each group. Since the order of the elements within each group is non-deterministic, the groups are sorted in parallel afterwards. The performance improvement depends on the number of cores, CPU caches, memory bandwidth, single-threaded performance, and mesh topology. In our tests, performance improved by 3-4.5x for large grid-like meshes. See [1] for an investigation of this algorithm and potential alternatives. [1] https://hackmd.io/@s0TMIS4lTAGwHVO20ECwpw/build_edge_to_loop_map_tests Pull Request: https://projects.blender.org/blender/blender/pulls/110707
2023-08-28 22:32:31 +02:00
parent eafe8332c7
commit 226359ec48
1 changed files with 50 additions and 20 deletions
--- a/source/blender/blenkernel/intern/mesh_mapping.cc
+++ b/source/blender/blenkernel/intern/mesh_mapping.cc
@@ -11,6 +11,8 @@

 #include "MEM_guardedalloc.h"

+#include "atomic_ops.h"
+
 #include "DNA_meshdata_types.h"
 #include "DNA_vec_types.h"

@@ -306,6 +308,52 @@ static Array<int> create_reverse_offsets(const Span<int> indices, const int item
  return offsets;
 }

+/**
+ * Sort the values of every group in \a indices into ascending order.
+ *
+ * \param groups: Offsets describing which slice of \a indices belongs to each group.
+ * \param grain_size: Forwarded to #threading::parallel_for as the grain size for the loop over
+ * the groups.
+ * \param indices: Values that are sorted in place, one group slice at a time.
+ */
+static void sort_small_groups(const OffsetIndices<int> groups,
+                              const int grain_size,
+                              MutableSpan<int> indices)
+{
+  /* Every group covers its own slice of `indices`, each slice is sorted with #std::sort. */
+  const auto sort_groups_in_range = [&](const IndexRange groups_range) {
+    for (const int64_t group_i : groups_range) {
+      MutableSpan<int> group_values = indices.slice(groups[group_i]);
+      std::sort(group_values.begin(), group_values.end());
+    }
+  };
+  threading::parallel_for(groups.index_range(), grain_size, sort_groups_in_range);
+}
+
+/**
+ * For every group, gather the positions in \a group_indices that refer to that group.
+ *
+ * \param group_indices: The group that each element belongs to. Must not be empty, and every
+ * value must be in the range `[0, offsets.size())` (both are asserted).
+ * \param offsets: The slice of the result array that is reserved for each group.
+ * \return An array with one entry per element of \a group_indices, storing the element indices
+ * group by group. The indices inside each group are sorted in ascending order.
+ */
+static Array<int> reverse_indices_in_groups(const Span<int> group_indices,
+                                            const OffsetIndices<int> offsets)
+{
+  BLI_assert(!group_indices.is_empty());
+  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
+  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);
+  /* Per group, the last slot that was handed out. Starting at -1 makes the first atomic
+   * increment of a group yield slot 0. */
+  Array<int> last_used_slots(offsets.size(), -1);
+  Array<int> reversed(group_indices.size());
+  const auto scatter_elements = [&](const IndexRange elements_range) {
+    for (const int64_t element_i : elements_range) {
+      const int group = group_indices[element_i];
+      /* The atomic increment hands out a distinct slot to each element of the same group, even
+       * when several threads add to that group at the same time. */
+      const int slot = atomic_add_and_fetch_int32(&last_used_slots[group], 1);
+      const IndexRange group_range = offsets[group];
+      reversed[group_range[slot]] = int(element_i);
+    }
+  };
+  threading::parallel_for(group_indices.index_range(), 1024, scatter_elements);
+  /* The order in which slots were handed out depends on thread scheduling, so sort every group
+   * to get a deterministic result. */
+  sort_small_groups(offsets, 1024, reversed);
+  return reversed;
+}
+
+/**
+ * Build a map from every group to the indices of the elements that reference it.
+ *
+ * \param group_indices: The group that each element belongs to.
+ * \param groups_num: The total number of groups.
+ * \param r_offsets: Filled with `groups_num + 1` offsets into \a r_indices.
+ * \param r_indices: Filled with the element indices, stored group by group.
+ * \return A grouped view on \a r_offsets and \a r_indices. The output arrays own the data, so
+ * they have to outlive the returned spans.
+ */
+static GroupedSpan<int> gather_groups(const Span<int> group_indices,
+                                      const int groups_num,
+                                      Array<int> &r_offsets,
+                                      Array<int> &r_indices)
+{
+  if (group_indices.is_empty()) {
+    /* #reverse_indices_in_groups asserts a non-empty input, so build the trivial result here:
+     * every group is empty and all offsets are zero. */
+    r_offsets.reinitialize(groups_num + 1);
+    r_offsets.as_mutable_span().fill(0);
+    /* Also reset the indices so that a reused output array does not keep stale values and both
+     * outputs stay consistent with each other. */
+    r_indices.reinitialize(0);
+    return {OffsetIndices<int>(r_offsets), {}};
+  }
+  r_offsets = create_reverse_offsets(group_indices, groups_num);
+  r_indices = reverse_indices_in_groups(group_indices, r_offsets.as_span());
+  return {OffsetIndices<int>(r_offsets), r_indices};
+}
+
 Array<int> build_loop_to_face_map(const OffsetIndices<int> faces)
 {
  Array<int> map(faces.total_size());
@@ -355,16 +403,7 @@ GroupedSpan<int> build_vert_to_loop_map(const Span<int> corner_verts,
                                        Array<int> &r_offsets,
                                        Array<int> &r_indices)
 {
-  r_offsets = create_reverse_offsets(corner_verts, verts_num);
-  r_indices.reinitialize(r_offsets.last());
-  Array<int> counts(verts_num, 0);
-
-  for (const int64_t corner : corner_verts.index_range()) {
-    const int vert = corner_verts[corner];
-    r_indices[r_offsets[vert] + counts[vert]] = int(corner);
-    counts[vert]++;
-  }
-  return {OffsetIndices<int>(r_offsets), r_indices};
+  return gather_groups(corner_verts, verts_num, r_offsets, r_indices);
 }

 GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
@@ -372,16 +411,7 @@ GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
                                        Array<int> &r_offsets,
                                        Array<int> &r_indices)
 {
-  r_offsets = create_reverse_offsets(corner_edges, edges_num);
-  r_indices.reinitialize(r_offsets.last());
-  Array<int> counts(edges_num, 0);
-
-  for (const int64_t corner : corner_edges.index_range()) {
-    const int edge = corner_edges[corner];
-    r_indices[r_offsets[edge] + counts[edge]] = int(corner);
-    counts[edge]++;
-  }
-  return {OffsetIndices<int>(r_offsets), r_indices};
+  return gather_groups(corner_edges, edges_num, r_offsets, r_indices);
 }

 GroupedSpan<int> build_edge_to_face_map(const OffsetIndices<int> faces,