Fix: Blur is two times slower in 4.4

The symmetric separable blur operation in the compositor is two times
slower in 4.4 than in 4.3. On Linux, this only happens when Blender
is compiled with GCC, because Clang inlines a small function that GCC
doesn't.

To fix this, we turn the runtime extend_bounds argument of blur_pass into
a template parameter that is tested with if constexpr, which helps GCC
inline the function. This makes execution 3.5 times faster.

Pull Request: https://projects.blender.org/blender/blender/pulls/134336
This commit is contained in:
Omar Emara
2025-02-10 12:58:02 +01:00
committed by Omar Emara
parent f75126a253
commit d006924ebc

View File

@@ -19,11 +19,8 @@
namespace blender::compositor {
template<typename T>
static void blur_pass(const Result &input,
const Result &weights,
Result &output,
const bool extend_bounds)
template<typename T, bool ExtendBounds>
static void blur_pass(const Result &input, const Result &weights, Result &output)
{
/* Loads the input color of the pixel at the given texel. If bounds are extended, then the input
* is treated as padded by a blur size amount of pixels of zero color, and the given texel is
@@ -33,7 +30,7 @@ static void blur_pass(const Result &input,
* thus zero, hence the introduced offset. */
auto load_input = [&](const int2 texel) {
T color;
if (extend_bounds) {
if constexpr (ExtendBounds) {
/* Notice that we subtract 1 because the weights result have an extra center weight, see the
* SymmetricBlurWeights class for more information. */
int2 blur_radius = weights.domain().size - 1;
@@ -167,11 +164,21 @@ static Result horizontal_pass_cpu(Context &context,
switch (input.type()) {
case ResultType::Float:
blur_pass<float>(input, weights, output, extend_bounds);
if (extend_bounds) {
blur_pass<float, true>(input, weights, output);
}
else {
blur_pass<float, false>(input, weights, output);
}
break;
case ResultType::Vector:
case ResultType::Color:
blur_pass<float4>(input, weights, output, extend_bounds);
if (extend_bounds) {
blur_pass<float4, true>(input, weights, output);
}
else {
blur_pass<float4, false>(input, weights, output);
}
break;
case ResultType::Float2:
case ResultType::Float3:
@@ -255,11 +262,21 @@ static void vertical_pass_cpu(Context &context,
switch (original_input.type()) {
case ResultType::Float:
blur_pass<float>(horizontal_pass_result, weights, output, extend_bounds);
if (extend_bounds) {
blur_pass<float, true>(horizontal_pass_result, weights, output);
}
else {
blur_pass<float, false>(horizontal_pass_result, weights, output);
}
break;
case ResultType::Vector:
case ResultType::Color:
blur_pass<float4>(horizontal_pass_result, weights, output, extend_bounds);
if (extend_bounds) {
blur_pass<float4, true>(horizontal_pass_result, weights, output);
}
else {
blur_pass<float4, false>(horizontal_pass_result, weights, output);
}
break;
case ResultType::Float2:
case ResultType::Float3: