From 9fcfba4aae7982327072524252113893f7aa9ea5 Mon Sep 17 00:00:00 2001 From: Hans Goudey Date: Wed, 26 Apr 2023 22:31:08 -0400 Subject: [PATCH] Mesh: Reduce memory corner normals memory usage and simplify threading Instead of storing a 24 byte struct for every face corner we must do calculations for, just gather the face corner index in the first single threaded loop. And don't fill them in chunks and use the task pool API. Instead just fill vectors and use standard "parallel_for" threading. The check that avoided threading for tiny meshes becomes redundant this way too, which simplifies the code more. Overall this removes over 100 lines of code. On a Ryzen 7950x, face corner ("split"/"loop") normal calculation in a small armature modifier setup with custom normals went from 4.1 ms to 2.8 ms. Calculation for a 12 million face mesh generated with curve to mesh with end caps went from 776 ms to 568 ms. Similar commits: - 9e9ebcdd728dfee01335f2d224d60e31140951f5 - 9338ab1d625a14c74b39adfa3c6a8fcefa85e70f --- .../blender/blenkernel/intern/mesh_normals.cc | 222 +++++------------- 1 file changed, 58 insertions(+), 164 deletions(-) diff --git a/source/blender/blenkernel/intern/mesh_normals.cc b/source/blender/blenkernel/intern/mesh_normals.cc index 17cdd57beb5..ba0dfe129eb 100644 --- a/source/blender/blenkernel/intern/mesh_normals.cc +++ b/source/blender/blenkernel/intern/mesh_normals.cc @@ -25,7 +25,6 @@ #include "BLI_memarena.h" #include "BLI_span.hh" #include "BLI_stack.h" -#include "BLI_task.h" #include "BLI_task.hh" #include "BLI_timeit.hh" #include "BLI_utildefines.h" @@ -666,23 +665,6 @@ void BKE_lnor_space_custom_normal_to_data(const MLoopNorSpace *lnor_space, namespace blender::bke::mesh { -#define LOOP_SPLIT_TASK_BLOCK_SIZE 1024 - -struct LoopSplitTaskData { - enum class Type : int8_t { - BlockEnd = 0, /* Set implicitly by calloc. */ - Fan = 1, - Single = 2, - }; - - /** We have to create those outside of tasks, since #MemArena is not thread-safe. */ - MLoopNorSpace *lnor_space; - int ml_curr_index; - int poly_index; - - Type flag; -}; - struct LoopSplitTaskDataCommon { /* Read/write. * Note we do not need to protect it, though, since two different tasks will *always* affect @@ -855,47 +837,32 @@ static void loop_manifold_fan_around_vert_next(const Span corner_verts, } } -static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *data) +static void lnor_space_for_single_fan(LoopSplitTaskDataCommon *common_data, + const int ml_curr_index, + MLoopNorSpace *lnor_space) { - MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr; - const Span clnors_data = common_data->clnors_data; - - const Span positions = common_data->positions; - const Span edges = common_data->edges; - const OffsetIndices polys = common_data->polys; - const Span corner_verts = common_data->corner_verts; - const Span corner_edges = common_data->corner_edges; + const Span loop_to_poly = common_data->loop_to_poly; const Span poly_normals = common_data->poly_normals; MutableSpan loop_normals = common_data->loop_normals; - MLoopNorSpace *lnor_space = data->lnor_space; - const int ml_curr_index = data->ml_curr_index; - const int poly_index = data->poly_index; + loop_normals[ml_curr_index] = poly_normals[loop_to_poly[ml_curr_index]]; - /* Simple case (both edges around that vertex are sharp in current polygon), - * this loop just takes its poly normal. - */ - loop_normals[ml_curr_index] = poly_normals[poly_index]; + if (MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr) { + const Span positions = common_data->positions; + const Span edges = common_data->edges; + const OffsetIndices polys = common_data->polys; + const Span corner_verts = common_data->corner_verts; + const Span corner_edges = common_data->corner_edges; + const Span clnors_data = common_data->clnors_data; -#if 0 - printf("BASIC: handling loop %d / edge %d / vert %d / poly %d\n", - ml_curr_index, - loops[ml_curr_index].e, - loops[ml_curr_index].v, - poly_index); -#endif - - /* If needed, generate this (simple!) lnor space. */ - if (lnors_spacearr) { float vec_curr[3], vec_prev[3]; + const int poly_index = loop_to_poly[ml_curr_index]; const int ml_prev_index = mesh::poly_corner_prev(polys[poly_index], ml_curr_index); /* The vertex we are "fanning" around. */ const int vert_pivot = corner_verts[ml_curr_index]; - const int2 &edge = edges[corner_edges[ml_curr_index]]; - const int vert_2 = edge_other_vert(edge, vert_pivot); - const int2 &edge_prev = edges[corner_edges[ml_prev_index]]; - const int vert_3 = edge_other_vert(edge_prev, vert_pivot); + const int vert_2 = edge_other_vert(edges[corner_edges[ml_curr_index]], vert_pivot); + const int vert_3 = edge_other_vert(edges[corner_edges[ml_prev_index]], vert_pivot); sub_v3_v3v3(vec_curr, positions[vert_2], positions[vert_pivot]); normalize_v3(vec_curr); @@ -914,7 +881,8 @@ static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopS } static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, - LoopSplitTaskData *data, + const int ml_curr_index, + MLoopNorSpace *lnor_space, BLI_Stack *edge_vectors) { MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr; @@ -930,12 +898,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, const Span loop_to_poly = common_data->loop_to_poly; const Span poly_normals = common_data->poly_normals; - MLoopNorSpace *lnor_space = data->lnor_space; -#if 0 /* Not needed for 'fan' loops. */ - float(*lnor)[3] = data->lnor; -#endif - const int ml_curr_index = data->ml_curr_index; - const int poly_index = data->poly_index; + const int poly_index = loop_to_poly[ml_curr_index]; const int ml_prev_index = poly_corner_prev(polys[poly_index], ml_curr_index); /* Sigh! we have to fan around current vertex, until we find the other non-smooth edge, @@ -1111,42 +1074,6 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, } } -static void loop_split_worker_do(LoopSplitTaskDataCommon *common_data, - LoopSplitTaskData *data, - BLI_Stack *edge_vectors) -{ - if (data->flag == LoopSplitTaskData::Type::Fan) { - BLI_assert((edge_vectors == nullptr) || BLI_stack_is_empty(edge_vectors)); - split_loop_nor_fan_do(common_data, data, edge_vectors); - } - else { - /* No need for edge_vectors for 'single' case! */ - split_loop_nor_single_do(common_data, data); - } -} - -static void loop_split_worker(TaskPool *__restrict pool, void *taskdata) -{ - LoopSplitTaskDataCommon *common_data = (LoopSplitTaskDataCommon *)BLI_task_pool_user_data(pool); - LoopSplitTaskData *data = (LoopSplitTaskData *)taskdata; - - /* Temp edge vectors stack, only used when computing lnor spacearr. */ - BLI_Stack *edge_vectors = common_data->lnors_spacearr ? - BLI_stack_new(sizeof(float[3]), __func__) : - nullptr; - - for (int i = 0; i < LOOP_SPLIT_TASK_BLOCK_SIZE; i++, data++) { - if (data->flag == LoopSplitTaskData::Type::BlockEnd) { - break; - } - loop_split_worker_do(common_data, data, edge_vectors); - } - - if (edge_vectors) { - BLI_stack_free(edge_vectors); - } -} - /** * Check whether given loop is part of an unknown-so-far cyclic smooth fan, or not. * Needed because cyclic smooth fans have no obvious 'entry point', @@ -1218,10 +1145,10 @@ static bool loop_split_generator_check_cyclic_smooth_fan(const Span corner_ } } -static void loop_split_generator(TaskPool *pool, LoopSplitTaskDataCommon *common_data) +static void loop_split_generator(LoopSplitTaskDataCommon *common_data, + Vector &r_single_corners, + Vector &r_fan_corners) { - MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr; - const Span corner_verts = common_data->corner_verts; const Span corner_edges = common_data->corner_edges; const OffsetIndices polys = common_data->polys; @@ -1230,23 +1157,10 @@ static void loop_split_generator(TaskPool *pool, LoopSplitTaskDataCommon *common BitVector<> skip_loops(corner_verts.size(), false); - LoopSplitTaskData *data_buff = nullptr; - int data_idx = 0; - - /* Temp edge vectors stack, only used when computing lnor spacearr - * (and we are not multi-threading). */ - BLI_Stack *edge_vectors = nullptr; - #ifdef DEBUG_TIME SCOPED_TIMER_AVERAGED(__func__); #endif - if (!pool) { - if (lnors_spacearr) { - edge_vectors = BLI_stack_new(sizeof(float[3]), __func__); - } - } - /* We now know edges that can be smoothed (with their vector, and their two loops), * and edges that will be hard! Now, time to generate the normals. */ @@ -1290,30 +1204,11 @@ static void loop_split_generator(TaskPool *pool, LoopSplitTaskDataCommon *common // printf("SKIPPING!\n"); } else { - LoopSplitTaskData *data, data_local; - - // printf("PROCESSING!\n"); - - if (pool) { - if (data_idx == 0) { - data_buff = (LoopSplitTaskData *)MEM_calloc_arrayN( - LOOP_SPLIT_TASK_BLOCK_SIZE, sizeof(*data_buff), __func__); - } - data = &data_buff[data_idx]; - } - else { - data = &data_local; - memset(data, 0, sizeof(*data)); - } - if (IS_EDGE_SHARP(edge_to_loops[corner_edges[ml_curr_index]]) && IS_EDGE_SHARP(edge_to_loops[corner_edges[ml_prev_index]])) { - data->ml_curr_index = ml_curr_index; - data->flag = LoopSplitTaskData::Type::Single; - data->poly_index = poly_index; - if (lnors_spacearr) { - data->lnor_space = BKE_lnor_space_create(lnors_spacearr); - } + /* Simple case (both edges around that vertex are sharp in current polygon), + * this corner just takes its poly normal. */ + r_single_corners.append(ml_curr_index); } else { /* We do not need to check/tag loops as already computed. Due to the fact that a loop @@ -1323,35 +1218,11 @@ static void loop_split_generator(TaskPool *pool, LoopSplitTaskDataCommon *common * current edge, smooth previous edge), and not the alternative (smooth current edge, * sharp previous edge). All this due/thanks to the link between normals and loop * ordering (i.e. winding). */ - data->ml_curr_index = ml_curr_index; - data->flag = LoopSplitTaskData::Type::Fan; - data->poly_index = poly_index; - if (lnors_spacearr) { - data->lnor_space = BKE_lnor_space_create(lnors_spacearr); - } - } - - if (pool) { - data_idx++; - if (data_idx == LOOP_SPLIT_TASK_BLOCK_SIZE) { - BLI_task_pool_push(pool, loop_split_worker, data_buff, true, nullptr); - data_idx = 0; - } - } - else { - loop_split_worker_do(common_data, data, edge_vectors); + r_fan_corners.append(ml_curr_index); } } } } - - if (pool && data_idx) { - BLI_task_pool_push(pool, loop_split_worker, data_buff, true, nullptr); - } - - if (edge_vectors) { - BLI_stack_free(edge_vectors); - } } void normals_calc_loop(const Span vert_positions, @@ -1472,19 +1343,42 @@ void normals_calc_loop(const Span vert_positions, edge_to_loops, {}); - if (corner_verts.size() < LOOP_SPLIT_TASK_BLOCK_SIZE * 8) { - /* Not enough loops to be worth the whole threading overhead. */ - loop_split_generator(nullptr, &common_data); + Vector single_corners; + Vector fan_corners; + loop_split_generator(&common_data, single_corners, fan_corners); + + if (r_lnors_spacearr) { + /** Allocate #MLoopNorSpace structs outside of tasks, since #MemArena is not thread-safe. */ + for (const int corner : single_corners) { + r_lnors_spacearr->lspacearr[corner] = BKE_lnor_space_create(r_lnors_spacearr); + } + for (const int corner : fan_corners) { + r_lnors_spacearr->lspacearr[corner] = BKE_lnor_space_create(r_lnors_spacearr); + } } - else { - TaskPool *task_pool = BLI_task_pool_create(&common_data, TASK_PRIORITY_HIGH); - loop_split_generator(task_pool, &common_data); + threading::parallel_for(single_corners.index_range(), 1024, [&](const IndexRange range) { + for (const int i : range) { + const int corner = single_corners[i]; + lnor_space_for_single_fan( + &common_data, corner, r_lnors_spacearr ? r_lnors_spacearr->lspacearr[corner] : nullptr); + } + }); - BLI_task_pool_work_and_wait(task_pool); - - BLI_task_pool_free(task_pool); - } + threading::parallel_for(fan_corners.index_range(), 1024, [&](const IndexRange range) { + BLI_Stack *edge_vectors = r_lnors_spacearr ? BLI_stack_new(sizeof(float[3]), __func__) : + nullptr; + for (const int i : range) { + const int corner = fan_corners[i]; + split_loop_nor_fan_do(&common_data, + corner, + r_lnors_spacearr ? r_lnors_spacearr->lspacearr[corner] : nullptr, + edge_vectors); + } + if (edge_vectors) { + BLI_stack_free(edge_vectors); + } + }); if (r_lnors_spacearr) { if (r_lnors_spacearr == &_lnors_spacearr) {