Refactor: Eigen: Switch from OpenMP to TBB

Only the parallel sparse matrix code was updated. This is used by e.g.
LSCM and ABF unwrap, and performance seems about the same or better.

Parallel GEMM (dense matrix-matrix multiplication) is used by libmv,
for example in libmv_keyframe_selection_test for a 54 x 54 matrix.
However it appears to harm performance, removing parallelization makes
that test run 5x faster on an Apple M3 Max.

There has been no new Eigen release since 2021, however there is active
development in master and it includes support for a C++ thread pool for
GEMM. So we could upgrade, but the algorithm remains the same and
looking at the implementation it just does not seem designed for modern
many-core CPUs. Unless the matrix is much larger, there's too much thread
synchronization overhead. So it does not seem useful to enable that
thread pool for us.

Pull Request: https://projects.blender.org/blender/blender/pulls/136865
This commit is contained in:
Brecht Van Lommel
2025-04-02 00:18:00 +02:00
parent 98b3b36411
commit da9a9093ec
5 changed files with 102 additions and 25 deletions

View File

@@ -31,3 +31,9 @@ add_library(bf_deps_eigen INTERFACE)
 add_library(bf::dependencies::eigen ALIAS bf_deps_eigen)
 target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${EIGEN3_INCLUDE_DIRS})
+
+if(WITH_TBB)
+  target_compile_definitions(bf_deps_eigen INTERFACE WITH_TBB)
+  target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${TBB_INCLUDE_DIRS})
+  target_link_libraries(bf_deps_eigen INTERFACE ${TBB_LIBRARIES})
+endif()

View File

@@ -10,7 +10,11 @@
 #ifndef EIGEN_SPARSEDENSEPRODUCT_H
 #define EIGEN_SPARSEDENSEPRODUCT_H
-namespace Eigen {
+#ifdef WITH_TBB
+#include <tbb/parallel_for.h>
+#endif
+
+namespace Eigen {
 namespace internal {
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
 static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
 {
 LhsEval lhsEval(lhs);
 Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
-#endif
 for(Index c=0; c<rhs.cols(); ++c)
 {
-#ifdef EIGEN_HAS_OPENMP
+#ifdef WITH_TBB
 // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
 // It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+ if(lhsEval.nonZerosEstimate() > 20000)
 {
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i,c);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i,c);
+ });
 }
 else
 #endif
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
 Index n = lhs.rows();
 LhsEval lhsEval(lhs);
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
+#ifdef WITH_TBB
 // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
 // It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+ if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
 {
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i);
+ });
 }
 else
 #endif

View File

@@ -24,6 +24,7 @@ then
 cd ..
 rm -rf eigen.git
 find Eigen -type f -exec chmod 644 {} \;
+patch -p3 < patches/blender.patch
 else
 echo "Did you install Git?"
 fi

74
extern/Eigen3/patches/blender.patch vendored Normal file
View File

@@ -0,0 +1,74 @@
diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
index f005a18a18e..b1d96494500 100644
--- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -10,7 +10,11 @@
#ifndef EIGEN_SPARSEDENSEPRODUCT_H
#define EIGEN_SPARSEDENSEPRODUCT_H
-namespace Eigen {
+#ifdef WITH_TBB
+#include <tbb/parallel_for.h>
+#endif
+
+namespace Eigen {
namespace internal {
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
{
LhsEval lhsEval(lhs);
-
+
Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
-#endif
-
+
for(Index c=0; c<rhs.cols(); ++c)
{
-#ifdef EIGEN_HAS_OPENMP
+#ifdef WITH_TBB
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
// It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+ if(lhsEval.nonZerosEstimate() > 20000)
{
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i,c);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i,c);
+ });
}
else
#endif
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
Index n = lhs.rows();
LhsEval lhsEval(lhs);
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
+#ifdef WITH_TBB
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
// It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+ if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
{
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i);
+ });
}
else
#endif

View File

@@ -27,10 +27,4 @@ set(LIB
   PRIVATE bf::dependencies::eigen
 )
-
-if(WITH_OPENMP AND WITH_OPENMP_STATIC)
-  list(APPEND LIB
-    ${OpenMP_LIBRARIES}
-  )
-endif()
 
 blender_add_lib(bf_intern_eigen "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")