Compositor: Add Convolve node

This patch adds a simple Convolve node to the compositor.

Pull Request: https://projects.blender.org/blender/blender/pulls/144619
This commit is contained in:
Omar Emara
2025-08-28 15:58:39 +02:00
committed by Omar Emara
parent 90a55a3e0b
commit fdf95b53fa
9 changed files with 473 additions and 2 deletions

View File

@@ -139,6 +139,7 @@ class NODE_MT_category_compositor_filter(Menu):
layout.menu("NODE_MT_category_compositor_filter_blur")
layout.separator()
node_add_menu.add_node_type(layout, "CompositorNodeAntiAliasing")
node_add_menu.add_node_type(layout, "CompositorNodeConvolve")
node_add_menu.add_node_type(layout, "CompositorNodeDenoise")
node_add_menu.add_node_type(layout, "CompositorNodeDespeckle")
layout.separator()

View File

@@ -69,6 +69,7 @@ set(SRC
intern/utilities.cc
algorithms/intern/compute_preview.cc
algorithms/intern/convolve.cc
algorithms/intern/deriche_gaussian_blur.cc
algorithms/intern/extract_alpha.cc
algorithms/intern/jump_flooding.cc
@@ -86,6 +87,7 @@ set(SRC
algorithms/intern/van_vliet_gaussian_blur.cc
algorithms/COM_algorithm_compute_preview.hh
algorithms/COM_algorithm_convolve.hh
algorithms/COM_algorithm_deriche_gaussian_blur.hh
algorithms/COM_algorithm_extract_alpha.hh
algorithms/COM_algorithm_jump_flooding.hh

View File

@@ -238,7 +238,12 @@ class Result {
/* Creates and allocates a new result that matches the type and precision of this result and
* uploads the CPU data that exist in this result. The result is assumed to be allocated on the
* CPU. See the allocate_data method for more information on the from_pool parameters. */
Result upload_to_gpu(const bool from_pool);
Result upload_to_gpu(const bool from_pool) const;
/* Creates and allocates a new result that matches the type and precision of this result and
* downloads the GPU data that exist in this result. The result is assumed to be allocated on the
* GPU. */
Result download_to_cpu() const;
/* Bind the GPU texture of the result to the texture image unit with the given name in the
* currently bound given shader. This also inserts a memory barrier for texture fetches to ensure
@@ -273,6 +278,10 @@ class Result {
* actual output of the operation. See the uses of the method for a practical example of use. */
void steal_data(Result &source);
/* Similar to the Result variant of steal_data, but steals from a raw data buffer. The buffer is
* assumed to be allocated using Blender's guarded allocator. */
void steal_data(void *data, int2 size);
/* Set up the result to wrap an external GPU texture that is not allocated nor managed by the
* result. The is_external_ member will be set to true, the domain will be set to have the same
* size as the texture, and the texture will be set to the given texture. See the is_external_
@@ -351,6 +360,9 @@ class Result {
/* Computes the number of channels of the result based on its type. */
int64_t channels_count() const;
/* Computes the size of the result's data in bytes. */
int64_t size_in_bytes() const;
blender::gpu::Texture *gpu_texture() const;
GSpan cpu_data() const;

View File

@@ -0,0 +1,22 @@
/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "COM_context.hh"
#include "COM_result.hh"
namespace blender::compositor {
/* Convolves the given color input by the given float or color kernel and write the result to the
* given output. If normalize_kernel is true, the kernel will be normalized such that it integrates
* to 1. The output will be allocated internally and is thus expected not to be previously
* allocated. */
void convolve(Context &context,
const Result &input,
const Result &kernel,
Result &output,
const bool normalize_kernel);
} // namespace blender::compositor

View File

@@ -0,0 +1,274 @@
/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include <complex>
#include <numeric>
#include "BLI_array.hh"
#include "BLI_assert.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_fftw.hh"
#include "BLI_index_range.hh"
#include "BLI_memory_utils.hh"
#include "BLI_task.hh"
#if defined(WITH_FFTW3)
# include <fftw3.h>
#endif
#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"
#include "COM_algorithm_convolve.hh"
namespace blender::compositor {
void convolve(Context &context,
const Result &input,
const Result &kernel,
Result &output,
const bool normalize_kernel)
{
#if defined(WITH_FFTW3)
BLI_assert(input.type() == ResultType::Color);
BLI_assert(kernel.type() == ResultType::Float || kernel.type() == ResultType::Color);
BLI_assert(output.type() == ResultType::Color);
/* Since we will be doing a circular convolution, we need to zero pad the input image by the
* kernel size and vice versa to avoid the kernel affecting the pixels at the other side of
* image. The kernel size is limited by the image size since it will have no effect on the image
* during convolution. */
const int2 image_size = input.domain().size;
const int2 kernel_size = kernel.domain().size;
const int2 needed_padding_amount = math::max(kernel_size, image_size);
const int2 needed_spatial_size = image_size + needed_padding_amount - 1;
const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size);
/* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and
* stores only half the output since the other half is redundant, so we only allocate half of
* the first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for
* more information. */
const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y);
constexpr int input_channels_count = 4;
const int64_t spatial_pixels_count = int64_t(spatial_size.x) * spatial_size.y;
const int64_t frequency_pixels_count = int64_t(frequency_size.x) * frequency_size.y;
/* A structure to gather all buffers that need to be forward transformed from the real to the
* frequency domain. */
struct ForwardTransformTask {
float *input;
std::complex<float> *output;
};
Vector<ForwardTransformTask> forward_transform_tasks;
/* Allocate a real buffer and a complex buffer for each of the input channels for the FFT input
* and output respectively, then add a forward transform task for it. */
Array<float *> image_spatial_domain_channels(input_channels_count);
Array<std::complex<float> *> image_frequency_domain_channels(input_channels_count);
for (const int channel : image_spatial_domain_channels.index_range()) {
image_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
image_frequency_domain_channels[channel] = reinterpret_cast<std::complex<float> *>(
fftwf_alloc_complex(frequency_pixels_count));
forward_transform_tasks.append(ForwardTransformTask{image_spatial_domain_channels[channel],
image_frequency_domain_channels[channel]});
}
BLI_SCOPED_DEFER([&]() {
for (const int channel : image_spatial_domain_channels.index_range()) {
fftwf_free(image_spatial_domain_channels[channel]);
fftwf_free(image_frequency_domain_channels[channel]);
}
});
const int kernel_channels_count = kernel.channels_count();
const bool is_color_kernel = kernel_channels_count == 4;
/* Allocate a real buffer and a complex buffer for each of the kernel channels for the FFT input
* and output respectively, then add a forward transform task for it. */
Array<float *> kernel_spatial_domain_channels(kernel_channels_count);
Array<std::complex<float> *> kernel_frequency_domain_channels(kernel_channels_count);
for (const int channel : kernel_spatial_domain_channels.index_range()) {
kernel_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
kernel_frequency_domain_channels[channel] = reinterpret_cast<std::complex<float> *>(
fftwf_alloc_complex(frequency_pixels_count));
forward_transform_tasks.append(ForwardTransformTask{
kernel_spatial_domain_channels[channel], kernel_frequency_domain_channels[channel]});
}
BLI_SCOPED_DEFER([&]() {
for (const int channel : kernel_spatial_domain_channels.index_range()) {
fftwf_free(kernel_spatial_domain_channels[channel]);
fftwf_free(kernel_frequency_domain_channels[channel]);
}
});
/* Create a real to complex and complex to real plans to transform the image to the frequency
* domain.
*
* Notice that FFTW provides an advanced interface as per Section 4.4.2 Advanced Real-data DFTs
* to transform all image channels simultaneously with interleaved pixel layouts. But profiling
* showed better performance when running a single plan in parallel for all image channels with a
* planner pixel format, so this is what we will be doing.
*
* The input and output buffers here are dummy buffers and still not initialized, because they
* are required by the planner internally for planning and their data will be overwritten. So
* make sure not to initialize the buffers before creating the plan. */
fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d(
spatial_size.y,
spatial_size.x,
image_spatial_domain_channels[0],
reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[0]),
FFTW_ESTIMATE);
fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d(
spatial_size.y,
spatial_size.x,
reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[0]),
image_spatial_domain_channels[0],
FFTW_ESTIMATE);
BLI_SCOPED_DEFER([&]() {
fftwf_destroy_plan(forward_plan);
fftwf_destroy_plan(backward_plan);
});
/* Download GPU results to CPU for GPU contexts. */
Result input_cpu = context.use_gpu() ? input.download_to_cpu() : input;
Result kernel_cpu = context.use_gpu() ? kernel.download_to_cpu() : kernel;
BLI_SCOPED_DEFER([&]() {
if (context.use_gpu()) {
input_cpu.release();
kernel_cpu.release();
}
});
/* Zero pad the image to the required spatial domain size, storing each channel in planar
* format for better cache locality, that is, RRRR...GGGG...BBBB...AAAA. */
threading::memory_bandwidth_bound_task(spatial_pixels_count * sizeof(float), [&]() {
parallel_for(spatial_size, [&](const int2 texel) {
const float4 pixel_color = input_cpu.load_pixel_zero<float4>(texel);
for (const int channel : IndexRange(input_channels_count)) {
float *buffer = image_spatial_domain_channels[channel];
const int64_t index = texel.y * int64_t(spatial_size.x) + texel.x;
buffer[index] = pixel_color[channel];
}
});
});
/* Use doubles to sum the kernel since floats are not stable with threaded summation. We always
* use a double4 even for float kernels for generality, in that case, only the first component
* is initialized. */
threading::EnumerableThreadSpecific<double4> sum_by_thread([]() { return double4(0.0); });
/* Compute the kernel while zero padding to match the spatial size. */
const int2 kernel_center = kernel_size / 2;
parallel_for(spatial_size, [&](const int2 texel) {
/* We offset the computed kernel with wrap around such that it is centered at the zero
* point, which is the expected format for doing circular convolutions in the frequency
* domain. */
const int2 centered_texel = kernel_center - texel;
const int2 wrapped_texel = int2(mod_i(centered_texel.x, spatial_size.x),
mod_i(centered_texel.y, spatial_size.y));
const float4 kernel_value = is_color_kernel ?
kernel_cpu.load_pixel_zero<float4>(wrapped_texel) :
float4(kernel_cpu.load_pixel_zero<float>(wrapped_texel));
for (const int channel : IndexRange(kernel_channels_count)) {
float *buffer = kernel_spatial_domain_channels[channel];
buffer[texel.x + texel.y * int64_t(spatial_size.x)] = kernel_value[channel];
}
sum_by_thread.local() += double4(kernel_value);
});
/* The computed kernel is not normalized and should be normalized, but instead of normalizing the
* kernel during computation, we normalize it in the frequency domain when convolving the kernel
* to the image since we will be doing sample normalization anyways. This is okay since the
* Fourier transform is linear. */
const float4 sum = float4(
std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), double4(0.0)));
const float4 sanitized_sum = float4(sum[0] == 0.0f ? 1.0f : sum[0],
sum[1] == 0.0f ? 1.0f : sum[1],
sum[2] == 0.0f ? 1.0f : sum[2],
sum[3] == 0.0f ? 1.0f : sum[3]);
const float4 normalization_factor = normalize_kernel ? sanitized_sum : float4(1.0f);
/* Transform all necessary data from the real domain to the frequency domain. */
threading::parallel_for(
forward_transform_tasks.index_range(), 1, [&](const IndexRange sub_range) {
for (const int64_t i : sub_range) {
fftwf_execute_dft_r2c(
forward_plan,
forward_transform_tasks[i].input,
reinterpret_cast<fftwf_complex *>(forward_transform_tasks[i].output));
}
});
/* Multiply the kernel and the image in the frequency domain to perform the convolution. The
* FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result
* in an image that is scaled by a factor of the product of the width and height, so we take
* that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms
* of the FFTW manual for more information. */
const float4 normalization_scale = float(spatial_size.x) * spatial_size.y * normalization_factor;
threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) {
for (const int64_t channel : IndexRange(input_channels_count)) {
const int kernel_channel = is_color_kernel ? channel : 0;
std::complex<float> *image_buffer = image_frequency_domain_channels[channel];
const std::complex<float> *kernel_buffer = kernel_frequency_domain_channels[kernel_channel];
for (const int64_t y : sub_y_range) {
for (const int64_t x : IndexRange(frequency_size.x)) {
const int64_t index = x + y * int64_t(frequency_size.x);
image_buffer[index] *= kernel_buffer[index] / normalization_scale[kernel_channel];
}
}
}
});
/* Transform channels from the frequency domain to the real domain. */
threading::parallel_for(IndexRange(input_channels_count), 1, [&](const IndexRange sub_range) {
for (const int64_t channel : sub_range) {
fftwf_execute_dft_c2r(
backward_plan,
reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[channel]),
image_spatial_domain_channels[channel]);
}
});
Result output_cpu = context.create_result(input.type());
output_cpu.allocate_texture(input.domain(), true, ResultStorageType::CPU);
/* Copy the result to the output. */
threading::memory_bandwidth_bound_task(input.size_in_bytes(), [&]() {
parallel_for(image_size, [&](const int2 texel) {
float4 color = float4(0.0f);
for (const int channel : IndexRange(input_channels_count)) {
const int64_t index = texel.x + texel.y * int64_t(spatial_size.x);
color[channel] = image_spatial_domain_channels[channel][index];
}
output_cpu.store_pixel(texel, color);
});
});
if (context.use_gpu()) {
output = output_cpu.upload_to_gpu(true);
output_cpu.release();
}
else {
output.steal_data(output_cpu);
}
#else
output.allocate_texture(input.domain());
if (context.use_gpu()) {
GPU_texture_copy(output, input);
}
else {
parallel_for(output.domain().size, [&](const int2 texel) {
output.store_pixel(texel, input.load_pixel<float4>(texel));
});
}
#endif
}
} // namespace blender::compositor

View File

@@ -459,7 +459,7 @@ void Result::allocate_invalid()
this->allocate_single_value();
}
Result Result::upload_to_gpu(const bool from_pool)
Result Result::upload_to_gpu(const bool from_pool) const
{
BLI_assert(storage_type_ == ResultStorageType::CPU);
BLI_assert(this->is_allocated());
@@ -471,6 +471,19 @@ Result Result::upload_to_gpu(const bool from_pool)
return result;
}
Result Result::download_to_cpu() const
{
BLI_assert(storage_type_ == ResultStorageType::GPU);
BLI_assert(this->is_allocated());
Result result = Result(*context_, this->type(), this->precision());
GPU_memory_barrier(GPU_BARRIER_TEXTURE_UPDATE);
void *data = GPU_texture_read(*this, this->get_gpu_data_format(), 0);
result.steal_data(data, this->domain().size);
return result;
}
void Result::bind_as_texture(gpu::Shader *shader, const char *texture_name) const
{
BLI_assert(storage_type_ == ResultStorageType::GPU);
@@ -538,6 +551,17 @@ void Result::steal_data(Result &source)
source = Result(*context_, type_, precision_);
}
void Result::steal_data(void *data, int2 size)
{
BLI_assert(!this->is_allocated());
const int64_t array_size = int64_t(size.x) * int64_t(size.y);
cpu_data_ = GMutableSpan(this->get_cpp_type(), data, array_size);
storage_type_ = ResultStorageType::CPU;
domain_ = Domain(size);
data_reference_count_ = new int(1);
}
/* Returns true if the given GPU texture is compatible with the type and precision of the given
* result. */
[[maybe_unused]] static bool is_compatible_texture(const blender::gpu::Texture *texture,
@@ -753,6 +777,16 @@ int Result::reference_count() const
return reference_count_;
}
int64_t Result::size_in_bytes() const
{
const int64_t pixel_size = this->get_cpp_type().size;
if (this->is_single_value()) {
return pixel_size;
}
const int2 image_size = this->domain().size;
return pixel_size * image_size.x * image_size.y;
}
GPointer Result::single_value() const
{
return std::visit([](const auto &value) { return GPointer(&value); }, single_value_);

View File

@@ -9907,6 +9907,7 @@ static void rna_def_nodes(BlenderRNA *brna)
define("CompositorNode", "CompositorNodeColorCorrection");
define("CompositorNode", "CompositorNodeColorMatte");
define("CompositorNode", "CompositorNodeColorSpill", def_cmp_color_spill);
define("CompositorNode", "CompositorNodeConvolve");
define("CompositorNode", "CompositorNodeCombHSVA");
define("CompositorNode", "CompositorNodeCombineColor", def_cmp_combsep_color);
define("CompositorNode", "CompositorNodeCombRGBA");

View File

@@ -37,6 +37,7 @@ set(SRC
nodes/node_composite_color_spill.cc
nodes/node_composite_colorbalance.cc
nodes/node_composite_colorcorrection.cc
nodes/node_composite_convolve.cc
nodes/node_composite_common.cc
nodes/node_composite_convert_color_space.cc
nodes/node_composite_cornerpin.cc

View File

@@ -0,0 +1,124 @@
/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include <cstdint>
#include "COM_algorithm_convolve.hh"
#include "COM_node_operation.hh"
#include "node_composite_util.hh"
namespace blender::nodes::node_composite_convolve_cc {
enum class KernelDataType : uint8_t {
Float = 0,
Color = 1,
};
static const EnumPropertyItem kernel_data_type_items[] = {
{int(KernelDataType::Float),
"FLOAT",
0,
"Float",
"The kernel is a float and will be convolved with all input channels"},
{int(KernelDataType::Color),
"COLOR",
0,
"Color",
"The kernel is a color and each channel of the kernel will be convolved with each respective "
"channel in the input"},
{0, nullptr, 0, nullptr, nullptr},
};
static void node_declare(NodeDeclarationBuilder &b)
{
b.add_input<decl::Color>("Image").hide_value().structure_type(StructureType::Dynamic);
b.add_input<decl::Menu>("Kernel Data Type")
.default_value(KernelDataType::Float)
.static_items(kernel_data_type_items);
b.add_input<decl::Float>("Kernel", "Float Kernel")
.hide_value()
.structure_type(StructureType::Dynamic)
.usage_by_single_menu(int(KernelDataType::Float))
.compositor_realization_mode(CompositorInputRealizationMode::Transforms);
b.add_input<decl::Color>("Kernel", "Color Kernel")
.hide_value()
.structure_type(StructureType::Dynamic)
.usage_by_single_menu(int(KernelDataType::Color))
.compositor_realization_mode(CompositorInputRealizationMode::Transforms);
b.add_input<decl::Bool>("Normalize Kernel")
.default_value(true)
.description("Normalizes the kernel such that it integrates to one");
b.add_output<decl::Color>("Image").structure_type(StructureType::Dynamic);
}
using namespace blender::compositor;
class ConvolveOperation : public NodeOperation {
public:
using NodeOperation::NodeOperation;
void execute() override
{
const Result &input = this->get_input("Image");
const Result &kernel = this->get_kernel_input();
Result &output = this->get_result("Image");
if (input.is_single_value() || kernel.is_single_value()) {
output.share_data(input);
return;
}
convolve(this->context(), input, kernel, output, this->get_normalize_kernel());
}
const Result &get_kernel_input()
{
switch (this->get_kernel_data_type()) {
case KernelDataType::Float:
return this->get_input("Float Kernel");
case KernelDataType::Color:
return this->get_input("Color Kernel");
}
BLI_assert_unreachable();
return this->get_input("Float Kernel");
}
KernelDataType get_kernel_data_type()
{
const Result &input = this->get_input("Kernel Data Type");
const MenuValue default_menu_value = MenuValue(KernelDataType::Float);
const MenuValue menu_value = input.get_single_value_default(default_menu_value);
return static_cast<KernelDataType>(menu_value.value);
}
bool get_normalize_kernel()
{
return this->get_input("Normalize Kernel").get_single_value_default(true);
}
};
static NodeOperation *get_compositor_operation(Context &context, DNode node)
{
return new ConvolveOperation(context, node);
}
static void node_register()
{
static blender::bke::bNodeType ntype;
cmp_node_type_base(&ntype, "CompositorNodeConvolve");
ntype.ui_name = "Convolve";
ntype.ui_description = "Convolves an image with a kernel";
ntype.nclass = NODE_CLASS_OP_FILTER;
ntype.declare = node_declare;
ntype.get_compositor_operation = get_compositor_operation;
blender::bke::node_register_type(ntype);
}
NOD_REGISTER_NODE(node_register)
} // namespace blender::nodes::node_composite_convolve_cc