From da9a9093ec1c4d6639bd63e469e6c72523a5688b Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 2 Apr 2025 00:18:00 +0200 Subject: [PATCH] Refactor: Eigen: Switch from OpenMP to TBB Only the parallel sparse matrix code was updated. This is used by e.g. LSCM and ABF unwrap, and performance seems about the same or better. Parallel GEMM (dense matrix-matrix multiplication) is used by libmv, for example in libmv_keyframe_selection_test for a 54 x 54 matrix. However, it appears to harm performance: removing parallelization makes that test run 5x faster on an Apple M3 Max. There has been no new Eigen release since 2021; however, there is active development in master and it includes support for a C++ thread pool for GEMM. So we could upgrade, but the algorithm remains the same and, looking at the implementation, it just does not seem designed for modern many-core CPUs. Unless the matrix is much larger, there's too much thread synchronization overhead. So it does not seem useful to enable that thread pool for us. 
Pull Request: https://projects.blender.org/blender/blender/pulls/136865 --- .../cmake/platform/dependency_targets.cmake | 6 ++ .../Eigen/src/SparseCore/SparseDenseProduct.h | 40 +++++----- extern/Eigen3/eigen-update.sh | 1 + extern/Eigen3/patches/blender.patch | 74 +++++++++++++++++++ intern/eigen/CMakeLists.txt | 6 -- 5 files changed, 102 insertions(+), 25 deletions(-) create mode 100644 extern/Eigen3/patches/blender.patch diff --git a/build_files/cmake/platform/dependency_targets.cmake b/build_files/cmake/platform/dependency_targets.cmake index b08acc0f952..8c81b10109f 100644 --- a/build_files/cmake/platform/dependency_targets.cmake +++ b/build_files/cmake/platform/dependency_targets.cmake @@ -31,3 +31,9 @@ add_library(bf_deps_eigen INTERFACE) add_library(bf::dependencies::eigen ALIAS bf_deps_eigen) target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${EIGEN3_INCLUDE_DIRS}) + +if(WITH_TBB) + target_compile_definitions(bf_deps_eigen INTERFACE WITH_TBB) + target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${TBB_INCLUDE_DIRS}) + target_link_libraries(bf_deps_eigen INTERFACE ${TBB_LIBRARIES}) +endif() diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h index f005a18a18e..b1d96494500 100644 --- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h @@ -10,7 +10,11 @@ #ifndef EIGEN_SPARSEDENSEPRODUCT_H #define EIGEN_SPARSEDENSEPRODUCT_H -namespace Eigen { +#ifdef WITH_TBB +#include +#endif + +namespace Eigen { namespace internal { @@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl1 && lhsEval.nonZerosEstimate() > 20000) + if(lhsEval.nonZerosEstimate() > 20000) { - #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) - for(Index i=0; i(0, n, 1024), + [&](const tbb::blocked_range& range) { + for(Index i=range.begin(); i1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) + 
if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000) { - #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) - for(Index i=0; i(0, n, 1024), + [&](const tbb::blocked_range& range) { + for(Index i=range.begin(); i ++#endif ++ ++namespace Eigen { + + namespace internal { + +@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl1 && lhsEval.nonZerosEstimate() > 20000) ++ if(lhsEval.nonZerosEstimate() > 20000) + { +- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) +- for(Index i=0; i(0, n, 1024), ++ [&](const tbb::blocked_range& range) { ++ for(Index i=range.begin(); i1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) ++ if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000) + { +- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) +- for(Index i=0; i(0, n, 1024), ++ [&](const tbb::blocked_range& range) { ++ for(Index i=range.begin(); i