From da9a9093ec1c4d6639bd63e469e6c72523a5688b Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Wed, 2 Apr 2025 00:18:00 +0200
Subject: [PATCH] Refactor: Eigen: Switch from OpenMP to TBB

Only the parallel sparse matrix code was updated. This is used by e.g.
LSCM and ABF unwrap, and performance seems about the same or better.

Parallel GEMM (dense matrix-matrix multiplication) is used by libmv,
for example in libmv_keyframe_selection_test for a 54 x 54 matrix.
However, it appears to harm performance: removing parallelization makes
that test run 5x faster on an Apple M3 Max.

There has been no new Eigen release since 2021. However, there is active
development in master, which includes support for a C++ thread pool for
GEMM. So we could upgrade, but the algorithm remains the same, and
looking at the implementation, it just does not seem designed for modern
many-core CPUs. Unless the matrix is much larger, there's too much thread
synchronization overhead. So it does not seem useful to enable that
thread pool for us.

Pull Request: https://projects.blender.org/blender/blender/pulls/136865
---
 .../cmake/platform/dependency_targets.cmake   |  6 ++
 .../Eigen/src/SparseCore/SparseDenseProduct.h | 40 +++++-----
 extern/Eigen3/eigen-update.sh                 |  1 +
 extern/Eigen3/patches/blender.patch           | 74 +++++++++++++++++++
 intern/eigen/CMakeLists.txt                   |  6 --
 5 files changed, 102 insertions(+), 25 deletions(-)
 create mode 100644 extern/Eigen3/patches/blender.patch
diff --git a/build_files/cmake/platform/dependency_targets.cmake b/build_files/cmake/platform/dependency_targets.cmake
index b08acc0f952..8c81b10109f 100644
--- a/build_files/cmake/platform/dependency_targets.cmake
+++ b/build_files/cmake/platform/dependency_targets.cmake
@@ -31,3 +31,9 @@ add_library(bf_deps_eigen INTERFACE)
 add_library(bf::dependencies::eigen ALIAS bf_deps_eigen)
 
 target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${EIGEN3_INCLUDE_DIRS})
+
+if(WITH_TBB)
+  target_compile_definitions(bf_deps_eigen INTERFACE WITH_TBB)
+  target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${TBB_INCLUDE_DIRS})
+  target_link_libraries(bf_deps_eigen INTERFACE ${TBB_LIBRARIES})
+endif()
diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
index f005a18a18e..b1d96494500 100644
--- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -10,7 +10,11 @@
 #ifndef EIGEN_SPARSEDENSEPRODUCT_H
 #define EIGEN_SPARSEDENSEPRODUCT_H
 
-namespace Eigen { 
+#ifdef WITH_TBB
+#include <tbb/parallel_for.h>
+#endif
+
+namespace Eigen {
 
 namespace internal {
 
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
     LhsEval lhsEval(lhs);
-    
+
     Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
-    Index threads = Eigen::nbThreads();
-#endif
-    
+
     for(Index c=0; c<rhs.cols(); ++c)
     {
-#ifdef EIGEN_HAS_OPENMP
+#ifdef WITH_TBB
       // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
       // It basically represents the minimal amount of work to be done to be worth it.
-      if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+      if(lhsEval.nonZerosEstimate() > 20000)
       {
-        #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
-        for(Index i=0; i<n; ++i)
-          processRow(lhsEval,rhs,res,alpha,i,c);
+        tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+          [&](const tbb::blocked_range<Index>& range) {
+            for(Index i=range.begin(); i<range.end(); ++i)
+              processRow(lhsEval,rhs,res,alpha,i,c);
+        });
       }
       else
 #endif
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
     Index n = lhs.rows();
     LhsEval lhsEval(lhs);
 
-#ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
-    Index threads = Eigen::nbThreads();
+#ifdef WITH_TBB
     // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
     // It basically represents the minimal amount of work to be done to be worth it.
-    if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+    if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
     {
-      #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
-      for(Index i=0; i<n; ++i)
-        processRow(lhsEval,rhs,res,alpha,i);
+      tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+        [&](const tbb::blocked_range<Index>& range) {
+          for(Index i=range.begin(); i<range.end(); ++i)
+            processRow(lhsEval,rhs,res,alpha,i);
+      });
     }
     else
 #endif
diff --git a/extern/Eigen3/eigen-update.sh b/extern/Eigen3/eigen-update.sh
index a7ebe0a4d22..c205dda7e39 100755
--- a/extern/Eigen3/eigen-update.sh
+++ b/extern/Eigen3/eigen-update.sh
@@ -24,6 +24,7 @@ then
     cd ..
     rm -rf eigen.git
     find Eigen -type f -exec chmod 644 {} \;
+    patch -p3 < patches/blender.patch
 else
     echo "Did you install Git?"
 fi
diff --git a/extern/Eigen3/patches/blender.patch b/extern/Eigen3/patches/blender.patch
new file mode 100644
index 00000000000..2971ec76a53
--- /dev/null
+++ b/extern/Eigen3/patches/blender.patch
@@ -0,0 +1,74 @@
+diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
+index f005a18a18e..b1d96494500 100644
+--- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
++++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
+@@ -10,7 +10,11 @@
+ #ifndef EIGEN_SPARSEDENSEPRODUCT_H
+ #define EIGEN_SPARSEDENSEPRODUCT_H
+ 
+-namespace Eigen { 
++#ifdef WITH_TBB
++#include <tbb/parallel_for.h>
++#endif
++
++namespace Eigen {
+ 
+ namespace internal {
+ 
+@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
+   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+   {
+     LhsEval lhsEval(lhs);
+-    
++
+     Index n = lhs.outerSize();
+-#ifdef EIGEN_HAS_OPENMP
+-    Eigen::initParallel();
+-    Index threads = Eigen::nbThreads();
+-#endif
+-    
++
+     for(Index c=0; c<rhs.cols(); ++c)
+     {
+-#ifdef EIGEN_HAS_OPENMP
++#ifdef WITH_TBB
+       // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+       // It basically represents the minimal amount of work to be done to be worth it.
+-      if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
++      if(lhsEval.nonZerosEstimate() > 20000)
+       {
+-        #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+-        for(Index i=0; i<n; ++i)
+-          processRow(lhsEval,rhs,res,alpha,i,c);
++        tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
++          [&](const tbb::blocked_range<Index>& range) {
++            for(Index i=range.begin(); i<range.end(); ++i)
++              processRow(lhsEval,rhs,res,alpha,i,c);
++        });
+       }
+       else
+ #endif
+@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
+     Index n = lhs.rows();
+     LhsEval lhsEval(lhs);
+ 
+-#ifdef EIGEN_HAS_OPENMP
+-    Eigen::initParallel();
+-    Index threads = Eigen::nbThreads();
++#ifdef WITH_TBB
+     // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+     // It basically represents the minimal amount of work to be done to be worth it.
+-    if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
++    if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+     {
+-      #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+-      for(Index i=0; i<n; ++i)
+-        processRow(lhsEval,rhs,res,alpha,i);
++      tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
++        [&](const tbb::blocked_range<Index>& range) {
++          for(Index i=range.begin(); i<range.end(); ++i)
++            processRow(lhsEval,rhs,res,alpha,i);
++      });
+     }
+     else
+ #endif
diff --git a/intern/eigen/CMakeLists.txt b/intern/eigen/CMakeLists.txt
index 8e833677ae1..c631eb7a24f 100644
--- a/intern/eigen/CMakeLists.txt
+++ b/intern/eigen/CMakeLists.txt
@@ -27,10 +27,4 @@ set(LIB
   PRIVATE bf::dependencies::eigen
 )
 
-if(WITH_OPENMP AND WITH_OPENMP_STATIC)
-  list(APPEND LIB
-    ${OpenMP_LIBRARIES}
-  )
-endif()
-
 blender_add_lib(bf_intern_eigen "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")