Only the parallel sparse matrix code was updated. This is used by e.g. LSCM and ABF unwrap, and performance seems about the same or better.

Parallel GEMM (dense matrix-matrix multiplication) is used by libmv, for example in libmv_keyframe_selection_test on a 54 x 54 matrix. There, parallelization appears to harm performance: removing it makes that test run 5x faster on an Apple M3 Max.

There has been no new Eigen release since 2021, but there is active development in master, including support for a C++ thread pool for GEMM. We could upgrade, but the algorithm remains the same, and looking at the implementation it just does not seem designed for modern many-core CPUs: unless the matrix is much larger, there is too much thread synchronization overhead. So enabling that thread pool does not seem useful for us.

Pull Request: https://projects.blender.org/blender/blender/pulls/136865
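As an aside, the GEMM behavior can also be reproduced from the caller side without patching the library, by capping Eigen's internal thread count. A minimal sketch, not part of the change, using only Eigen's public setNbThreads() API; the 54 x 54 size just mirrors the libmv_keyframe_selection_test case mentioned above:

```cpp
#include <Eigen/Dense>

int main()
{
  // Keep dense GEMM on the calling thread; with OpenMP builds this caps
  // Eigen's worker count, and without OpenMP it is a no-op.
  Eigen::setNbThreads(1);

  Eigen::MatrixXd a = Eigen::MatrixXd::Random(54, 54);
  Eigen::MatrixXd b = Eigen::MatrixXd::Random(54, 54);

  // For a matrix this small, thread synchronization overhead would
  // dominate the actual multiplication work.
  Eigen::MatrixXd c = a * b;
  return c.size() > 0 ? 0 : 1;
}
```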
```diff
diff --git a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
index f005a18a18e..b1d96494500 100644
--- a/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/extern/Eigen3/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -10,7 +10,11 @@
 #ifndef EIGEN_SPARSEDENSEPRODUCT_H
 #define EIGEN_SPARSEDENSEPRODUCT_H
 
-namespace Eigen {
+#ifdef WITH_TBB
+#include <tbb/parallel_for.h>
+#endif
+
+namespace Eigen {
 
 namespace internal {
 
@@ -34,23 +38,21 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
     LhsEval lhsEval(lhs);
-
+
     Index n = lhs.outerSize();
-#ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
-    Index threads = Eigen::nbThreads();
-#endif
-
+
     for(Index c=0; c<rhs.cols(); ++c)
     {
-#ifdef EIGEN_HAS_OPENMP
+#ifdef WITH_TBB
       // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
      // It basically represents the minimal amount of work to be done to be worth it.
-      if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
+      if(lhsEval.nonZerosEstimate() > 20000)
      {
-        #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
-        for(Index i=0; i<n; ++i)
-          processRow(lhsEval,rhs,res,alpha,i,c);
+        tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+          [&](const tbb::blocked_range<Index>& range) {
+            for(Index i=range.begin(); i<range.end(); ++i)
+              processRow(lhsEval,rhs,res,alpha,i,c);
+          });
      }
      else
 #endif
@@ -119,16 +121,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
     Index n = lhs.rows();
     LhsEval lhsEval(lhs);
 
-#ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
-    Index threads = Eigen::nbThreads();
+#ifdef WITH_TBB
     // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
     // It basically represents the minimal amount of work to be done to be worth it.
-    if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+    if(lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
     {
-      #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
-      for(Index i=0; i<n; ++i)
-        processRow(lhsEval,rhs,res,alpha,i);
+      tbb::parallel_for(tbb::blocked_range<Index>(0, n, 1024),
+        [&](const tbb::blocked_range<Index>& range) {
+          for(Index i=range.begin(); i<range.end(); ++i)
+            processRow(lhsEval,rhs,res,alpha,i);
+        });
     }
     else
 #endif
```
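For context, this is the kind of caller code that reaches the patched sparse-times-dense path. A minimal sketch, not part of the change, sized so that the estimated nonzero count exceeds the 20000 work threshold from the diff, so that with WITH_TBB defined the tbb::parallel_for branch is taken:

```cpp
#include <Eigen/Dense>
#include <Eigen/Sparse>
#include <vector>

int main()
{
  // Tridiagonal matrix with roughly 3n nonzeros; n = 10000 gives ~30000,
  // comfortably above the 20000 threshold used in the patched code.
  const int n = 10000;
  std::vector<Eigen::Triplet<double>> triplets;
  triplets.reserve(3 * n);
  for (int i = 0; i < n; ++i) {
    triplets.emplace_back(i, i, 2.0);
    if (i > 0) {
      triplets.emplace_back(i, i - 1, -1.0);
    }
    if (i + 1 < n) {
      triplets.emplace_back(i, i + 1, -1.0);
    }
  }
  Eigen::SparseMatrix<double> lhs(n, n);
  lhs.setFromTriplets(triplets.begin(), triplets.end());

  Eigen::VectorXd rhs = Eigen::VectorXd::Ones(n);

  // Sparse * dense product; this dispatches through
  // sparse_time_dense_product_impl in SparseDenseProduct.h.
  Eigen::VectorXd res = lhs * rhs;
  return res.size() == n ? 0 : 1;
}
```

The grain size of 1024 rows per tbb::blocked_range chunk in the patch keeps per-task scheduling overhead low, in contrast to the old OpenMP dynamic schedule that split the rows into chunks of roughly n / (4 * threads).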