/* SPDX-FileCopyrightText: 2025 Blender Authors * * SPDX-License-Identifier: GPL-2.0-or-later */ #include #include #include "BLI_array.hh" #include "BLI_assert.h" #include "BLI_enumerable_thread_specific.hh" #include "BLI_fftw.hh" #include "BLI_index_range.hh" #include "BLI_memory_utils.hh" #include "BLI_task.hh" #if defined(WITH_FFTW3) # include #endif #include "COM_context.hh" #include "COM_result.hh" #include "COM_utilities.hh" #include "COM_algorithm_convolve.hh" namespace blender::compositor { void convolve(Context &context, const Result &input, const Result &kernel, Result &output, const bool normalize_kernel) { #if defined(WITH_FFTW3) BLI_assert(input.type() == ResultType::Color); BLI_assert(kernel.type() == ResultType::Float || kernel.type() == ResultType::Color); BLI_assert(output.type() == ResultType::Color); /* Since we will be doing a circular convolution, we need to zero pad the input image by the * kernel size and vice versa to avoid the kernel affecting the pixels at the other side of * image. The kernel size is limited by the image size since it will have no effect on the image * during convolution. */ const int2 image_size = input.domain().size; const int2 kernel_size = kernel.domain().size; const int2 needed_padding_amount = math::max(kernel_size, image_size); const int2 needed_spatial_size = image_size + needed_padding_amount - 1; const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size); /* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and * stores only half the output since the other half is redundant, so we only allocate half of * the first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for * more information. */ const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y); constexpr int input_channels_count = 4; const int64_t spatial_pixels_count = int64_t(spatial_size.x) * spatial_size.y; const int64_t frequency_pixels_count = int64_t(frequency_size.x) * frequency_size.y; /* A structure to gather all buffers that need to be forward transformed from the real to the * frequency domain. */ struct ForwardTransformTask { float *input; std::complex *output; }; Vector forward_transform_tasks; /* Allocate a real buffer and a complex buffer for each of the input channels for the FFT input * and output respectively, then add a forward transform task for it. */ Array image_spatial_domain_channels(input_channels_count); Array *> image_frequency_domain_channels(input_channels_count); for (const int channel : image_spatial_domain_channels.index_range()) { image_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count); image_frequency_domain_channels[channel] = reinterpret_cast *>( fftwf_alloc_complex(frequency_pixels_count)); forward_transform_tasks.append(ForwardTransformTask{image_spatial_domain_channels[channel], image_frequency_domain_channels[channel]}); } BLI_SCOPED_DEFER([&]() { for (const int channel : image_spatial_domain_channels.index_range()) { fftwf_free(image_spatial_domain_channels[channel]); fftwf_free(image_frequency_domain_channels[channel]); } }); const int kernel_channels_count = kernel.channels_count(); const bool is_color_kernel = kernel_channels_count == 4; /* Allocate a real buffer and a complex buffer for each of the kernel channels for the FFT input * and output respectively, then add a forward transform task for it. */ Array kernel_spatial_domain_channels(kernel_channels_count); Array *> kernel_frequency_domain_channels(kernel_channels_count); for (const int channel : kernel_spatial_domain_channels.index_range()) { kernel_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count); kernel_frequency_domain_channels[channel] = reinterpret_cast *>( fftwf_alloc_complex(frequency_pixels_count)); forward_transform_tasks.append(ForwardTransformTask{ kernel_spatial_domain_channels[channel], kernel_frequency_domain_channels[channel]}); } BLI_SCOPED_DEFER([&]() { for (const int channel : kernel_spatial_domain_channels.index_range()) { fftwf_free(kernel_spatial_domain_channels[channel]); fftwf_free(kernel_frequency_domain_channels[channel]); } }); /* Create a real to complex and complex to real plans to transform the image to the frequency * domain. * * Notice that FFTW provides an advanced interface as per Section 4.4.2 Advanced Real-data DFTs * to transform all image channels simultaneously with interleaved pixel layouts. But profiling * showed better performance when running a single plan in parallel for all image channels with a * planner pixel format, so this is what we will be doing. * * The input and output buffers here are dummy buffers and still not initialized, because they * are required by the planner internally for planning and their data will be overwritten. So * make sure not to initialize the buffers before creating the plan. */ fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d( spatial_size.y, spatial_size.x, image_spatial_domain_channels[0], reinterpret_cast(image_frequency_domain_channels[0]), FFTW_ESTIMATE); fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d( spatial_size.y, spatial_size.x, reinterpret_cast(image_frequency_domain_channels[0]), image_spatial_domain_channels[0], FFTW_ESTIMATE); BLI_SCOPED_DEFER([&]() { fftwf_destroy_plan(forward_plan); fftwf_destroy_plan(backward_plan); }); /* Download GPU results to CPU for GPU contexts. */ Result input_cpu = context.use_gpu() ? input.download_to_cpu() : input; Result kernel_cpu = context.use_gpu() ? kernel.download_to_cpu() : kernel; BLI_SCOPED_DEFER([&]() { if (context.use_gpu()) { input_cpu.release(); kernel_cpu.release(); } }); /* Zero pad the image to the required spatial domain size, storing each channel in planar * format for better cache locality, that is, RRRR...GGGG...BBBB...AAAA. */ threading::memory_bandwidth_bound_task(spatial_pixels_count * sizeof(float), [&]() { parallel_for(spatial_size, [&](const int2 texel) { const float4 pixel_color = input_cpu.load_pixel_zero(texel); for (const int channel : IndexRange(input_channels_count)) { float *buffer = image_spatial_domain_channels[channel]; const int64_t index = texel.y * int64_t(spatial_size.x) + texel.x; buffer[index] = pixel_color[channel]; } }); }); /* Use doubles to sum the kernel since floats are not stable with threaded summation. We always * use a double4 even for float kernels for generality, in that case, only the first component * is initialized. */ threading::EnumerableThreadSpecific sum_by_thread([]() { return double4(0.0); }); /* Compute the kernel while zero padding to match the spatial size. */ const int2 kernel_center = kernel_size / 2; parallel_for(spatial_size, [&](const int2 texel) { /* We offset the computed kernel with wrap around such that it is centered at the zero * point, which is the expected format for doing circular convolutions in the frequency * domain. */ const int2 centered_texel = kernel_center - texel; const int2 wrapped_texel = int2(mod_i(centered_texel.x, spatial_size.x), mod_i(centered_texel.y, spatial_size.y)); const float4 kernel_value = is_color_kernel ? kernel_cpu.load_pixel_zero(wrapped_texel) : float4(kernel_cpu.load_pixel_zero(wrapped_texel)); for (const int channel : IndexRange(kernel_channels_count)) { float *buffer = kernel_spatial_domain_channels[channel]; buffer[texel.x + texel.y * int64_t(spatial_size.x)] = kernel_value[channel]; } sum_by_thread.local() += double4(kernel_value); }); /* The computed kernel is not normalized and should be normalized, but instead of normalizing the * kernel during computation, we normalize it in the frequency domain when convolving the kernel * to the image since we will be doing sample normalization anyways. This is okay since the * Fourier transform is linear. */ const float4 sum = float4( std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), double4(0.0))); const float4 sanitized_sum = float4(sum[0] == 0.0f ? 1.0f : sum[0], sum[1] == 0.0f ? 1.0f : sum[1], sum[2] == 0.0f ? 1.0f : sum[2], sum[3] == 0.0f ? 1.0f : sum[3]); const float4 normalization_factor = normalize_kernel ? sanitized_sum : float4(1.0f); /* Transform all necessary data from the real domain to the frequency domain. */ threading::parallel_for( forward_transform_tasks.index_range(), 1, [&](const IndexRange sub_range) { for (const int64_t i : sub_range) { fftwf_execute_dft_r2c( forward_plan, forward_transform_tasks[i].input, reinterpret_cast(forward_transform_tasks[i].output)); } }); /* Multiply the kernel and the image in the frequency domain to perform the convolution. The * FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result * in an image that is scaled by a factor of the product of the width and height, so we take * that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms * of the FFTW manual for more information. */ const float4 normalization_scale = float(spatial_size.x) * spatial_size.y * normalization_factor; threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) { for (const int64_t channel : IndexRange(input_channels_count)) { const int kernel_channel = is_color_kernel ? channel : 0; std::complex *image_buffer = image_frequency_domain_channels[channel]; const std::complex *kernel_buffer = kernel_frequency_domain_channels[kernel_channel]; for (const int64_t y : sub_y_range) { for (const int64_t x : IndexRange(frequency_size.x)) { const int64_t index = x + y * int64_t(frequency_size.x); image_buffer[index] *= kernel_buffer[index] / normalization_scale[kernel_channel]; } } } }); /* Transform channels from the frequency domain to the real domain. */ threading::parallel_for(IndexRange(input_channels_count), 1, [&](const IndexRange sub_range) { for (const int64_t channel : sub_range) { fftwf_execute_dft_c2r( backward_plan, reinterpret_cast(image_frequency_domain_channels[channel]), image_spatial_domain_channels[channel]); } }); Result output_cpu = context.create_result(input.type()); output_cpu.allocate_texture(input.domain(), true, ResultStorageType::CPU); /* Copy the result to the output. */ threading::memory_bandwidth_bound_task(input.size_in_bytes(), [&]() { parallel_for(image_size, [&](const int2 texel) { float4 color = float4(0.0f); for (const int channel : IndexRange(input_channels_count)) { const int64_t index = texel.x + texel.y * int64_t(spatial_size.x); color[channel] = image_spatial_domain_channels[channel][index]; } output_cpu.store_pixel(texel, color); }); }); if (context.use_gpu()) { output = output_cpu.upload_to_gpu(true); output_cpu.release(); } else { output.steal_data(output_cpu); } #else output.allocate_texture(input.domain()); if (context.use_gpu()) { GPU_texture_copy(output, input); } else { parallel_for(output.domain().size, [&](const int2 texel) { output.store_pixel(texel, input.load_pixel(texel)); }); } #endif } } // namespace blender::compositor