From d006924ebca91afe420cd2e80a6a57b64d92e3d1 Mon Sep 17 00:00:00 2001
From: Omar Emara <mail@OmarEmara.dev>
Date: Mon, 10 Feb 2025 12:58:02 +0100
Subject: [PATCH] Fix: Blur is two times slower in 4.4

The symmetric separable blur operation in the compositor is two times
slower in 4.4 compared to 4.3. On Linux, this only happens when Blender
is compiled with GCC, because Clang inlines a small function that GCC
doesn't.

To fix this, we specialize an if statement using templates to help GCC
inline the function. This makes execution 3.5 times faster.

Pull Request: https://projects.blender.org/blender/blender/pulls/134336
---
 .../intern/symmetric_separable_blur.cc        | 37 ++++++++++++++-----
 1 file changed, 27 insertions(+), 10 deletions(-)
diff --git a/source/blender/compositor/algorithms/intern/symmetric_separable_blur.cc b/source/blender/compositor/algorithms/intern/symmetric_separable_blur.cc
index afa62a40dbc..b5f9f2e68ee 100644
--- a/source/blender/compositor/algorithms/intern/symmetric_separable_blur.cc
+++ b/source/blender/compositor/algorithms/intern/symmetric_separable_blur.cc
@@ -19,11 +19,8 @@
 
 namespace blender::compositor {
 
-template<typename T>
-static void blur_pass(const Result &input,
-                      const Result &weights,
-                      Result &output,
-                      const bool extend_bounds)
+template<typename T, bool ExtendBounds>
+static void blur_pass(const Result &input, const Result &weights, Result &output)
 {
   /* Loads the input color of the pixel at the given texel. If bounds are extended, then the input
    * is treated as padded by a blur size amount of pixels of zero color, and the given texel is
@@ -33,7 +30,7 @@ static void blur_pass(const Result &input,
    * thus zero, hence the introduced offset. */
   auto load_input = [&](const int2 texel) {
     T color;
-    if (extend_bounds) {
+    if constexpr (ExtendBounds) {
       /* Notice that we subtract 1 because the weights result have an extra center weight, see the
        * SymmetricBlurWeights class for more information. */
       int2 blur_radius = weights.domain().size - 1;
@@ -167,11 +164,21 @@ static Result horizontal_pass_cpu(Context &context,
 
   switch (input.type()) {
     case ResultType::Float:
-      blur_pass<float>(input, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float, true>(input, weights, output);
+      }
+      else {
+        blur_pass<float, false>(input, weights, output);
+      }
       break;
     case ResultType::Vector:
     case ResultType::Color:
-      blur_pass<float4>(input, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float4, true>(input, weights, output);
+      }
+      else {
+        blur_pass<float4, false>(input, weights, output);
+      }
       break;
     case ResultType::Float2:
     case ResultType::Float3:
@@ -255,11 +262,21 @@ static void vertical_pass_cpu(Context &context,
 
   switch (original_input.type()) {
     case ResultType::Float:
-      blur_pass<float>(horizontal_pass_result, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float, true>(horizontal_pass_result, weights, output);
+      }
+      else {
+        blur_pass<float, false>(horizontal_pass_result, weights, output);
+      }
       break;
     case ResultType::Vector:
     case ResultType::Color:
-      blur_pass<float4>(horizontal_pass_result, weights, output, extend_bounds);
+      if (extend_bounds) {
+        blur_pass<float4, true>(horizontal_pass_result, weights, output);
+      }
+      else {
+        blur_pass<float4, false>(horizontal_pass_result, weights, output);
+      }
       break;
     case ResultType::Float2:
     case ResultType::Float3: