BLI: move tbb part of parallel_for to implementation file

Previously, `tbb::parallel_for` was instantiated every time `threading::parallel_for` was used. However, when actual parallelism is used, the overhead of a function call is negligible. Therefore, it is possible to move that part out of the header without causing noticeable performance regressions. This reduces the size of the Blender binary from 308.2 to 303.5 MB, which is a reduction of about 1.5%.
2023-05-21 13:31:21 +02:00
parent 57f593b7ca
commit f6d824bca6
2 changed files with 38 additions and 15 deletions
--- a/source/blender/blenlib/BLI_task.hh
+++ b/source/blender/blenlib/BLI_task.hh
@@ -30,6 +30,7 @@
 #  endif
 #endif

+#include "BLI_function_ref.hh"
 #include "BLI_index_range.hh"
 #include "BLI_lazy_threading.hh"
 #include "BLI_utildefines.h"
@@ -48,27 +49,23 @@ void parallel_for_each(Range &&range, const Function &function)
 #endif
 }

+namespace detail {
+void parallel_for_impl(IndexRange range,
+                       int64_t grain_size,
+                       FunctionRef<void(IndexRange)> function);
+}  // namespace detail
+
 template<typename Function>
-void parallel_for(IndexRange range, int64_t grain_size, const Function &function)
+inline void parallel_for(IndexRange range, int64_t grain_size, const Function &function)
 {
-  if (range.size() == 0) {
+  if (range.is_empty()) {
    return;
  }
-#ifdef WITH_TBB
-  /* Invoking tbb for small workloads has a large overhead. */
-  if (range.size() >= grain_size) {
-    lazy_threading::send_hint();
-    tbb::parallel_for(
-        tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
-        [&](const tbb::blocked_range<int64_t> &subrange) {
-          function(IndexRange(subrange.begin(), subrange.size()));
-        });
+  if (range.size() <= grain_size) {
+    function(range);
    return;
  }
-#else
-  UNUSED_VARS(grain_size);
-#endif
-  function(range);
+  detail::parallel_for_impl(range, grain_size, function);
 }

 /**
--- a/source/blender/blenlib/intern/task_range.cc
+++ b/source/blender/blenlib/intern/task_range.cc
@@ -14,6 +14,7 @@

 #include "BLI_lazy_threading.hh"
 #include "BLI_task.h"
+#include "BLI_task.hh"
 #include "BLI_threads.h"

 #include "atomic_ops.h"
@@ -154,3 +155,28 @@ int BLI_task_parallel_thread_id(const TaskParallelTLS * /*tls*/)
  return 0;
 #endif
 }
+
+namespace blender::threading::detail {
+
+void parallel_for_impl(const IndexRange range,
+                       const int64_t grain_size,
+                       const FunctionRef<void(IndexRange)> function)
+{
+#ifdef WITH_TBB
+  /* Invoking tbb for small workloads has a large overhead. */
+  if (range.size() >= grain_size) {
+    lazy_threading::send_hint();
+    tbb::parallel_for(
+        tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
+        [function](const tbb::blocked_range<int64_t> &subrange) {
+          function(IndexRange(subrange.begin(), subrange.size()));
+        });
+    return;
+  }
+#else
+  UNUSED_VARS(grain_size);
+#endif
+  function(range);
+}
+
+}  // namespace blender::threading::detail