Files
test/source/blender/compositor/algorithms/intern/convolve.cc
2025-08-29 14:40:48 +10:00

276 lines
12 KiB
C++

/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include <complex>
#include <numeric>
#include "BLI_array.hh"
#include "BLI_assert.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_fftw.hh"
#include "BLI_index_range.hh"
#include "BLI_memory_utils.hh"
#include "BLI_task.hh"
#if defined(WITH_FFTW3)
# include <fftw3.h>
#endif
#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"
#include "COM_algorithm_convolve.hh"
namespace blender::compositor {
void convolve(Context &context,
const Result &input,
const Result &kernel,
Result &output,
const bool normalize_kernel)
{
#if defined(WITH_FFTW3)
BLI_assert(input.type() == ResultType::Color);
BLI_assert(kernel.type() == ResultType::Float || kernel.type() == ResultType::Color);
BLI_assert(output.type() == ResultType::Color);
/* Since we will be doing a circular convolution, we need to zero pad the input image by the
* kernel size and vice versa to avoid the kernel affecting the pixels at the other side of
* image. The kernel size is limited by the image size since it will have no effect on the image
* during convolution. */
const int2 image_size = input.domain().size;
const int2 kernel_size = kernel.domain().size;
const int2 needed_padding_amount = math::max(kernel_size, image_size);
const int2 needed_spatial_size = image_size + needed_padding_amount - 1;
const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size);
/* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and
* stores only half the output since the other half is redundant, so we only allocate half of
* the first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for
* more information. */
const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y);
constexpr int input_channels_count = 4;
const int64_t spatial_pixels_count = int64_t(spatial_size.x) * spatial_size.y;
const int64_t frequency_pixels_count = int64_t(frequency_size.x) * frequency_size.y;
/* A structure to gather all buffers that need to be forward transformed from the real to the
* frequency domain. */
struct ForwardTransformTask {
float *input;
std::complex<float> *output;
};
Vector<ForwardTransformTask> forward_transform_tasks;
/* Allocate a real buffer and a complex buffer for each of the input channels for the FFT input
* and output respectively, then add a forward transform task for it. */
Array<float *> image_spatial_domain_channels(input_channels_count);
Array<std::complex<float> *> image_frequency_domain_channels(input_channels_count);
for (const int channel : image_spatial_domain_channels.index_range()) {
image_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
image_frequency_domain_channels[channel] = reinterpret_cast<std::complex<float> *>(
fftwf_alloc_complex(frequency_pixels_count));
forward_transform_tasks.append(ForwardTransformTask{image_spatial_domain_channels[channel],
image_frequency_domain_channels[channel]});
}
BLI_SCOPED_DEFER([&]() {
for (const int channel : image_spatial_domain_channels.index_range()) {
fftwf_free(image_spatial_domain_channels[channel]);
fftwf_free(image_frequency_domain_channels[channel]);
}
});
const int kernel_channels_count = kernel.channels_count();
const bool is_color_kernel = kernel_channels_count == 4;
/* Allocate a real buffer and a complex buffer for each of the kernel channels for the FFT input
* and output respectively, then add a forward transform task for it. */
Array<float *> kernel_spatial_domain_channels(kernel_channels_count);
Array<std::complex<float> *> kernel_frequency_domain_channels(kernel_channels_count);
for (const int channel : kernel_spatial_domain_channels.index_range()) {
kernel_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
kernel_frequency_domain_channels[channel] = reinterpret_cast<std::complex<float> *>(
fftwf_alloc_complex(frequency_pixels_count));
forward_transform_tasks.append(ForwardTransformTask{
kernel_spatial_domain_channels[channel], kernel_frequency_domain_channels[channel]});
}
BLI_SCOPED_DEFER([&]() {
for (const int channel : kernel_spatial_domain_channels.index_range()) {
fftwf_free(kernel_spatial_domain_channels[channel]);
fftwf_free(kernel_frequency_domain_channels[channel]);
}
});
/* Create a real to complex and complex to real plans to transform the image to the frequency
* domain.
*
* Notice that FFTW provides an advanced interface as per Section 4.4.2 Advanced Real-data DFTs
* to transform all image channels simultaneously with interleaved pixel layouts. But profiling
* showed better performance when running a single plan in parallel for all image channels with a
* planner pixel format, so this is what we will be doing.
*
* The input and output buffers here are dummy buffers and still not initialized, because they
* are required by the planner internally for planning and their data will be overwritten. So
* make sure not to initialize the buffers before creating the plan. */
fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d(
spatial_size.y,
spatial_size.x,
image_spatial_domain_channels[0],
reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[0]),
FFTW_ESTIMATE);
fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d(
spatial_size.y,
spatial_size.x,
reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[0]),
image_spatial_domain_channels[0],
FFTW_ESTIMATE);
BLI_SCOPED_DEFER([&]() {
fftwf_destroy_plan(forward_plan);
fftwf_destroy_plan(backward_plan);
});
/* Download GPU results to CPU for GPU contexts. */
Result input_cpu = context.use_gpu() ? input.download_to_cpu() : input;
Result kernel_cpu = context.use_gpu() ? kernel.download_to_cpu() : kernel;
BLI_SCOPED_DEFER([&]() {
if (context.use_gpu()) {
input_cpu.release();
kernel_cpu.release();
}
});
/* Zero pad the image to the required spatial domain size, storing each channel in planar
* format for better cache locality, that is, RRRR...GGGG...BBBB...AAAA. */
threading::memory_bandwidth_bound_task(spatial_pixels_count * sizeof(float), [&]() {
parallel_for(spatial_size, [&](const int2 texel) {
const float4 pixel_color = input_cpu.load_pixel_zero<float4>(texel);
for (const int channel : IndexRange(input_channels_count)) {
float *buffer = image_spatial_domain_channels[channel];
const int64_t index = texel.y * int64_t(spatial_size.x) + texel.x;
buffer[index] = pixel_color[channel];
}
});
});
/* Use doubles to sum the kernel since floats are not stable with threaded summation. We always
* use a double4 even for float kernels for generality, in that case, only the first component
* is initialized. */
threading::EnumerableThreadSpecific<double4> sum_by_thread([]() { return double4(0.0); });
/* Compute the kernel while zero padding to match the spatial size. */
const int2 kernel_center = kernel_size / 2;
parallel_for(spatial_size, [&](const int2 texel) {
/* We offset the computed kernel with wrap around such that it is centered at the zero
* point, which is the expected format for doing circular convolutions in the frequency
* domain. */
const int2 centered_texel = kernel_center - texel;
const int2 wrapped_texel = int2(mod_i(centered_texel.x, spatial_size.x),
mod_i(centered_texel.y, spatial_size.y));
const float4 kernel_value = is_color_kernel ?
kernel_cpu.load_pixel_zero<float4>(wrapped_texel) :
float4(kernel_cpu.load_pixel_zero<float>(wrapped_texel));
for (const int channel : IndexRange(kernel_channels_count)) {
float *buffer = kernel_spatial_domain_channels[channel];
buffer[texel.x + texel.y * int64_t(spatial_size.x)] = kernel_value[channel];
}
sum_by_thread.local() += double4(kernel_value);
});
/* The computed kernel is not normalized and should be normalized, but instead of normalizing the
* kernel during computation, we normalize it in the frequency domain when convolving the kernel
* to the image since we will be doing sample normalization anyways. This is okay since the
* Fourier transform is linear. */
const float4 sum = float4(
std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), double4(0.0)));
const float4 sanitized_sum = float4(sum[0] == 0.0f ? 1.0f : sum[0],
sum[1] == 0.0f ? 1.0f : sum[1],
sum[2] == 0.0f ? 1.0f : sum[2],
sum[3] == 0.0f ? 1.0f : sum[3]);
const float4 normalization_factor = normalize_kernel ? sanitized_sum : float4(1.0f);
/* Transform all necessary data from the real domain to the frequency domain. */
threading::parallel_for(
forward_transform_tasks.index_range(), 1, [&](const IndexRange sub_range) {
for (const int64_t i : sub_range) {
fftwf_execute_dft_r2c(
forward_plan,
forward_transform_tasks[i].input,
reinterpret_cast<fftwf_complex *>(forward_transform_tasks[i].output));
}
});
/* Multiply the kernel and the image in the frequency domain to perform the convolution. The
* FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result
* in an image that is scaled by a factor of the product of the width and height, so we take
* that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms
* of the FFTW manual for more information. */
const float4 normalization_scale = float(spatial_size.x) * spatial_size.y * normalization_factor;
threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) {
for (const int64_t channel : IndexRange(input_channels_count)) {
const int kernel_channel = is_color_kernel ? channel : 0;
std::complex<float> *image_buffer = image_frequency_domain_channels[channel];
const std::complex<float> *kernel_buffer = kernel_frequency_domain_channels[kernel_channel];
for (const int64_t y : sub_y_range) {
for (const int64_t x : IndexRange(frequency_size.x)) {
const int64_t index = x + y * int64_t(frequency_size.x);
image_buffer[index] *= kernel_buffer[index] / normalization_scale[kernel_channel];
}
}
}
});
/* Transform channels from the frequency domain to the real domain. */
threading::parallel_for(IndexRange(input_channels_count), 1, [&](const IndexRange sub_range) {
for (const int64_t channel : sub_range) {
fftwf_execute_dft_c2r(
backward_plan,
reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[channel]),
image_spatial_domain_channels[channel]);
}
});
Result output_cpu = context.create_result(input.type());
output_cpu.allocate_texture(input.domain(), true, ResultStorageType::CPU);
/* Copy the result to the output. */
threading::memory_bandwidth_bound_task(input.size_in_bytes(), [&]() {
parallel_for(image_size, [&](const int2 texel) {
float4 color = float4(0.0f);
for (const int channel : IndexRange(input_channels_count)) {
const int64_t index = texel.x + texel.y * int64_t(spatial_size.x);
color[channel] = image_spatial_domain_channels[channel][index];
}
output_cpu.store_pixel(texel, color);
});
});
if (context.use_gpu()) {
output = output_cpu.upload_to_gpu(true);
output_cpu.release();
}
else {
output.steal_data(output_cpu);
}
#else
UNUSED_VARS(kernel, normalize_kernel);
output.allocate_texture(input.domain());
if (context.use_gpu()) {
GPU_texture_copy(output, input);
}
else {
parallel_for(output.domain().size, [&](const int2 texel) {
output.store_pixel(texel, input.load_pixel<float4>(texel));
});
}
#endif
}
} // namespace blender::compositor