Fix: Blur is two times slower in 4.4

The symmetric separable blur operation in the compositor is two times slower in 4.4 compared to 4.3. On Linux, this only happens when Blender is compiled with GCC, because Clang inlines a small function that GCC doesn't. To fix this, we turn the runtime `extend_bounds` check into a template parameter tested with `if constexpr`, which helps GCC inline the function. This results in 3.5 times faster execution. Pull Request: https://projects.blender.org/blender/blender/pulls/134336
2025-02-10 12:58:02 +01:00
parent f75126a253
commit d006924ebc
1 changed file with 27 additions and 10 deletions
--- a/source/blender/compositor/algorithms/intern/symmetric_separable_blur.cc
+++ b/source/blender/compositor/algorithms/intern/symmetric_separable_blur.cc
@@ -19,11 +19,8 @@

 namespace blender::compositor {

-template<typename T>
-static void blur_pass(const Result &input,
-                      const Result &weights,
-                      Result &output,
-                      const bool extend_bounds)
+template<typename T, bool ExtendBounds>
+static void blur_pass(const Result &input, const Result &weights, Result &output)
 {
  /* Loads the input color of the pixel at the given texel. If bounds are extended, then the input
   * is treated as padded by a blur size amount of pixels of zero color, and the given texel is
@@ -33,7 +30,7 @@ static void blur_pass(const Result &input,
   * thus zero, hence the introduced offset. */
  auto load_input = [&](const int2 texel) {
    T color;
-    if (extend_bounds) {
+    if constexpr (ExtendBounds) {
      /* Notice that we subtract 1 because the weights result have an extra center weight, see the
       * SymmetricBlurWeights class for more information. */
      int2 blur_radius = weights.domain().size - 1;
@@ -167,11 +164,21 @@ static Result horizontal_pass_cpu(Context &context,

  switch (input.type()) {
    case ResultType::Float:
-      blur_pass<float>(input, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float, true>(input, weights, output);
+      }
+      else {
+        blur_pass<float, false>(input, weights, output);
+      }
      break;
    case ResultType::Vector:
    case ResultType::Color:
-      blur_pass<float4>(input, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float4, true>(input, weights, output);
+      }
+      else {
+        blur_pass<float4, false>(input, weights, output);
+      }
      break;
    case ResultType::Float2:
    case ResultType::Float3:
@@ -255,11 +262,21 @@ static void vertical_pass_cpu(Context &context,

  switch (original_input.type()) {
    case ResultType::Float:
-      blur_pass<float>(horizontal_pass_result, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float, true>(horizontal_pass_result, weights, output);
+      }
+      else {
+        blur_pass<float, false>(horizontal_pass_result, weights, output);
+      }
      break;
    case ResultType::Vector:
    case ResultType::Color:
-      blur_pass<float4>(horizontal_pass_result, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float4, true>(horizontal_pass_result, weights, output);
+      }
+      else {
+        blur_pass<float4, false>(horizontal_pass_result, weights, output);
+      }
      break;
    case ResultType::Float2:
    case ResultType::Float3: