Refactor: Eigen: Switch from OpenMP to TBB
Only the parallel sparse matrix code was updated. This is used by e.g. LSCM and ABF unwrap, and performance seems about the same or better. Parallel GEMM (dense matrix-matrix multiplication) is used by libmv, for example in libmv_keyframe_selection_test for a 54 x 54 matrix. However, it appears to harm performance: removing parallelization makes that test run 5x faster on an Apple M3 Max. There has been no new Eigen release since 2021; however, there is active development in master, and it includes support for a C++ thread pool for GEMM. So we could upgrade, but the algorithm remains the same, and looking at the implementation, it just does not seem designed for modern many-core CPUs. Unless the matrix is much larger, there's too much thread synchronization overhead. So it does not seem useful to enable that thread pool for us. Pull Request: https://projects.blender.org/blender/blender/pulls/136865
This commit is contained in:
@@ -31,3 +31,9 @@ add_library(bf_deps_eigen INTERFACE)
|
|||||||
add_library(bf::dependencies::eigen ALIAS bf_deps_eigen)
|
add_library(bf::dependencies::eigen ALIAS bf_deps_eigen)
|
||||||
|
|
||||||
target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${EIGEN3_INCLUDE_DIRS})
|
target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${EIGEN3_INCLUDE_DIRS})
|
||||||
|
|
||||||
|
if(WITH_TBB)
|
||||||
|
target_compile_definitions(bf_deps_eigen INTERFACE WITH_TBB)
|
||||||
|
target_include_directories(bf_deps_eigen SYSTEM INTERFACE ${TBB_INCLUDE_DIRS})
|
||||||
|
target_link_libraries(bf_deps_eigen INTERFACE ${TBB_LIBRARIES})
|
||||||
|
endif()
|
||||||
|
|||||||
@@ -10,7 +10,11 @@
|
|||||||
#ifndef EIGEN_SPARSEDENSEPRODUCT_H
|
#ifndef EIGEN_SPARSEDENSEPRODUCT_H
|
||||||
#define EIGEN_SPARSEDENSEPRODUCT_H
|
#define EIGEN_SPARSEDENSEPRODUCT_H
|
||||||
|
|
||||||
namespace Eigen {
|
#ifdef WITH_TBB
|
||||||
|
#include <tbb/parallel_for.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
|
|||||||
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
|
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
|
||||||
{
|
{
|
||||||
LhsEval lhsEval(lhs);
|
LhsEval lhsEval(lhs);
|
||||||
|
|
||||||
Index n = lhs.outerSize();
|
Index n = lhs.outerSize();
|
||||||
#ifdef EIGEN_HAS_OPENMP
|
|
||||||
Eigen::initParallel();
|
|
||||||
Index threads = Eigen::nbThreads();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(Index c=0; c<rhs.cols(); ++c)
|
for(Index c=0; c<rhs.cols(); ++c)
|
||||||
{
|
{
|
||||||
#ifdef EIGEN_HAS_OPENMP
|
#ifdef WITH_TBB
|
||||||
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
|
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
|
||||||
// It basically represents the minimal amount of work to be done to be worth it.
|
// It basically represents the minimal amount of work to be done to be worth it.
|
||||||
if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
|
if(lhsEval.nonZerosEstimate() > 20000)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
|
tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
|
||||||
for(Index i=0; i<n; ++i)
|
[&](const tbb::blocked_range<Index>& range) {
|
||||||
processRow(lhsEval,rhs,res,alpha,i,c);
|
for(Index i=range.begin(); i<range.end(); ++i)
|
||||||
|
processRow(lhsEval,rhs,res,alpha,i,c);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
|
|||||||
Index n = lhs.rows();
|
Index n = lhs.rows();
|
||||||
LhsEval lhsEval(lhs);
|
LhsEval lhsEval(lhs);
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_OPENMP
|
#ifdef WITH_TBB
|
||||||
Eigen::initParallel();
|
|
||||||
Index threads = Eigen::nbThreads();
|
|
||||||
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
|
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
|
||||||
// It basically represents the minimal amount of work to be done to be worth it.
|
// It basically represents the minimal amount of work to be done to be worth it.
|
||||||
if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
|
if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
|
||||||
{
|
{
|
||||||
#pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
|
tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
|
||||||
for(Index i=0; i<n; ++i)
|
[&](const tbb::blocked_range<Index>& range) {
|
||||||
processRow(lhsEval,rhs,res,alpha,i);
|
for(Index i=range.begin(); i<range.end(); ++i)
|
||||||
|
processRow(lhsEval,rhs,res,alpha,i);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
1
extern/Eigen3/eigen-update.sh
vendored
1
extern/Eigen3/eigen-update.sh
vendored
@@ -24,6 +24,7 @@ then
|
|||||||
cd ..
|
cd ..
|
||||||
rm -rf eigen.git
|
rm -rf eigen.git
|
||||||
find Eigen -type f -exec chmod 644 {} \;
|
find Eigen -type f -exec chmod 644 {} \;
|
||||||
|
patch -p3 < patches/blender.patch
|
||||||
else
|
else
|
||||||
echo "Did you install Git?"
|
echo "Did you install Git?"
|
||||||
fi
|
fi
|
||||||
|
|||||||
74
extern/Eigen3/patches/blender.patch
vendored
Normal file
74
extern/Eigen3/patches/blender.patch
vendored
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
|
||||||
|
index f005a18a18e..b1d96494500 100644
|
||||||
|
--- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
|
||||||
|
+++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
|
||||||
|
@@ -10,7 +10,11 @@
|
||||||
|
#ifndef EIGEN_SPARSEDENSEPRODUCT_H
|
||||||
|
#define EIGEN_SPARSEDENSEPRODUCT_H
|
||||||
|
|
||||||
|
-namespace Eigen {
|
||||||
|
+#ifdef WITH_TBB
|
||||||
|
+#include <tbb/parallel_for.h>
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+namespace Eigen {
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
|
||||||
|
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
|
||||||
|
{
|
||||||
|
LhsEval lhsEval(lhs);
|
||||||
|
-
|
||||||
|
+
|
||||||
|
Index n = lhs.outerSize();
|
||||||
|
-#ifdef EIGEN_HAS_OPENMP
|
||||||
|
- Eigen::initParallel();
|
||||||
|
- Index threads = Eigen::nbThreads();
|
||||||
|
-#endif
|
||||||
|
-
|
||||||
|
+
|
||||||
|
for(Index c=0; c<rhs.cols(); ++c)
|
||||||
|
{
|
||||||
|
-#ifdef EIGEN_HAS_OPENMP
|
||||||
|
+#ifdef WITH_TBB
|
||||||
|
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
|
||||||
|
// It basically represents the minimal amount of work to be done to be worth it.
|
||||||
|
- if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
|
||||||
|
+ if(lhsEval.nonZerosEstimate() > 20000)
|
||||||
|
{
|
||||||
|
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
|
||||||
|
- for(Index i=0; i<n; ++i)
|
||||||
|
- processRow(lhsEval,rhs,res,alpha,i,c);
|
||||||
|
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
|
||||||
|
+ [&](const tbb::blocked_range<Index>& range) {
|
||||||
|
+ for(Index i=range.begin(); i<range.end(); ++i)
|
||||||
|
+ processRow(lhsEval,rhs,res,alpha,i,c);
|
||||||
|
+ });
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
|
||||||
|
Index n = lhs.rows();
|
||||||
|
LhsEval lhsEval(lhs);
|
||||||
|
|
||||||
|
-#ifdef EIGEN_HAS_OPENMP
|
||||||
|
- Eigen::initParallel();
|
||||||
|
- Index threads = Eigen::nbThreads();
|
||||||
|
+#ifdef WITH_TBB
|
||||||
|
// This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
|
||||||
|
// It basically represents the minimal amount of work to be done to be worth it.
|
||||||
|
- if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
|
||||||
|
+ if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
|
||||||
|
{
|
||||||
|
- #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
|
||||||
|
- for(Index i=0; i<n; ++i)
|
||||||
|
- processRow(lhsEval,rhs,res,alpha,i);
|
||||||
|
+ tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
|
||||||
|
+ [&](const tbb::blocked_range<Index>& range) {
|
||||||
|
+ for(Index i=range.begin(); i<range.end(); ++i)
|
||||||
|
+ processRow(lhsEval,rhs,res,alpha,i);
|
||||||
|
+ });
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
@@ -27,10 +27,4 @@ set(LIB
|
|||||||
PRIVATE bf::dependencies::eigen
|
PRIVATE bf::dependencies::eigen
|
||||||
)
|
)
|
||||||
|
|
||||||
if(WITH_OPENMP AND WITH_OPENMP_STATIC)
|
|
||||||
list(APPEND LIB
|
|
||||||
${OpenMP_LIBRARIES}
|
|
||||||
)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
blender_add_lib(bf_intern_eigen "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
|
blender_add_lib(bf_intern_eigen "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
|
||||||
|
|||||||
Reference in New Issue
Block a user