Refactor: Eigen: Switch from OpenMP to TBB

Only the parallel sparse matrix code was updated. This is used by e.g.
LSCM and ABF unwrap, and performance seems about the same or better.

Parallel GEMM (dense matrix-matrix multiplication) is used by libmv,
for example in libmv_keyframe_selection_test for a 54 x 54 matrix.
However it appears to harm performance, removing parallelization makes
that test run 5x faster on an Apple M3 Max.

There has been no new Eigen release since 2021, however there is active
development in master and it includes support for a C++ thread pool for
GEMM. So we could upgrade, but the algorithm remains the same and
looking at the implementation it just does not seem designed for modern
many-core CPUs. Unless the matrix is much larger, there's too much thread
synchronization overhead. So it does not seem useful to enable that
thread pool for us.

Pull Request: https://projects.blender.org/blender/blender/pulls/136865
This commit is contained in:
Brecht Van Lommel
2025-04-02 00:18:00 +02:00
parent 98b3b36411
commit da9a9093ec
5 changed files with 102 additions and 25 deletions

View File

@@ -31,3 +31,9 @@ add_library(bf_deps_eigen INTERFACE)
 add_library(bf::dependencies::eigen ALIAS bf_deps_eigen)
 target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${EIGEN3_INCLUDE_DIRS})
+
+if(WITH_TBB)
+  target_compile_definitions(bf_deps_eigen INTERFACE WITH_TBB)
+  target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${TBB_INCLUDE_DIRS})
+  target_link_libraries(bf_deps_eigen INTERFACE ${TBB_LIBRARIES})
+endif()

View File

@@ -10,7 +10,11 @@
 #ifndef EIGEN_SPARSEDENSEPRODUCT_H
 #define EIGEN_SPARSEDENSEPRODUCT_H
-namespace Eigen {
+#ifdef WITH_TBB
+#include <tbb/parallel_for.h>
+#endif
+
+namespace Eigen {
 namespace internal {
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
 static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
 {
 LhsEval lhsEval(lhs);
 Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
-#endif
 for(Index c=0; c<rhs.cols(); ++c)
 {
-#ifdef EIGEN_HAS_OPENMP
+#ifdef WITH_TBB
 // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
 // It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+ if(lhsEval.nonZerosEstimate() > 20000)
 {
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i,c);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i,c);
+ });
 }
 else
 #endif
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
 Index n = lhs.rows();
 LhsEval lhsEval(lhs);
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
+#ifdef WITH_TBB
 // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
 // It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+ if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
 {
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i);
+ });
 }
 else
 #endif

View File

@@ -24,6 +24,7 @@ then
 cd ..
 rm -rf eigen.git
 find Eigen -type f -exec chmod 644 {} \;
+patch -p3 < patches/blender.patch
 else
 echo "Did you install Git?"
 fi

74
extern/Eigen3/patches/blender.patch vendored Normal file
View File

@@ -0,0 +1,74 @@
diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
index f005a18a18e..b1d96494500 100644
--- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -10,7 +10,11 @@
#ifndef EIGEN_SPARSEDENSEPRODUCT_H
#define EIGEN_SPARSEDENSEPRODUCT_H
-namespace Eigen {
+#ifdef WITH_TBB
+#include <tbb/parallel_for.h>
+#endif
+
+namespace Eigen {
namespace internal {
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
{
LhsEval lhsEval(lhs);
-
+
Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
-#endif
-
+
for(Index c=0; c<rhs.cols(); ++c)
{
-#ifdef EIGEN_HAS_OPENMP
+#ifdef WITH_TBB
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
// It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+ if(lhsEval.nonZerosEstimate() > 20000)
{
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i,c);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i,c);
+ });
}
else
#endif
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
Index n = lhs.rows();
LhsEval lhsEval(lhs);
-#ifdef EIGEN_HAS_OPENMP
- Eigen::initParallel();
- Index threads = Eigen::nbThreads();
+#ifdef WITH_TBB
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
// It basically represents the minimal amount of work to be done to be worth it.
- if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+ if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
{
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
- for(Index i=0; i<n; ++i)
- processRow(lhsEval,rhs,res,alpha,i);
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+ [&](const tbb::blocked_range<Index>& range) {
+ for(Index i=range.begin(); i<range.end(); ++i)
+ processRow(lhsEval,rhs,res,alpha,i);
+ });
}
else
#endif

View File

@@ -27,10 +27,4 @@ set(LIB
   PRIVATE bf::dependencies::eigen
 )
-
-if(WITH_OPENMP AND WITH_OPENMP_STATIC)
-  list(APPEND LIB
-    ${OpenMP_LIBRARIES}
-  )
-endif()
 
 blender_add_lib(bf_intern_eigen "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")