diff --git a/CMakeLists.txt b/CMakeLists.txt index 78918ba085f..22cadad665b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,7 +306,7 @@ option(WITH_IK_SOLVER "\ Enable Legacy IK solver (only disable for development)" ON ) -option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke, ocean sim, and audio effects)" ON) +option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke, ocean sim, glare, and audio effects)" ON) option(WITH_PUGIXML "Enable PugiXML support (Used for OpenImageIO, Grease Pencil SVG export)" ON) option(WITH_BULLET "Enable Bullet (Physics Engine)" ON) option(WITH_SYSTEM_BULLET "\ diff --git a/build_files/cmake/Modules/FindFftw3.cmake b/build_files/cmake/Modules/FindFftw3.cmake index 5950c2f61c6..3a13d35f368 100644 --- a/build_files/cmake/Modules/FindFftw3.cmake +++ b/build_files/cmake/Modules/FindFftw3.cmake @@ -11,9 +11,11 @@ # FFTW3_ROOT_DIR, The base directory to search for Fftw3. # This can also be an environment variable. # FFTW3_FOUND, If false, do not try to use Fftw3. +# WITH_FFTW3_THREADS_F_SUPPORT, if true, single precision Fftw3 supports threads. # # also defined, but not for general use are # FFTW3_LIBRARY_F, where to find the Fftw3 library (single precision float). +# FFTW3_LIBRARY_THREADS_F, where to find the Fftw3 threads library (single precision float). # FFTW3_LIBRARY_D, where to find the Fftw3 library (double precision float). # If `FFTW3_ROOT_DIR` was defined in the environment, use it. @@ -49,6 +51,15 @@ find_library(FFTW3_LIBRARY_F lib64 lib ) +find_library(FFTW3_LIBRARY_THREADS_F + NAMES + fftw3f_threads + HINTS + ${_fftw3_SEARCH_DIRS} + PATH_SUFFIXES + lib64 lib + ) + find_library(FFTW3_LIBRARY_D NAMES fftw3 @@ -60,6 +71,9 @@ find_library(FFTW3_LIBRARY_D list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_F}") list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_D}") +if(FFTW3_LIBRARY_THREADS_F) + list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_THREADS_F}") +endif() # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if # all listed variables are TRUE @@ -70,12 +84,18 @@ find_package_handle_standard_args(Fftw3 DEFAULT_MSG if(FFTW3_FOUND) set(FFTW3_LIBRARIES ${_FFTW3_LIBRARIES}) set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR}) + if(FFTW3_LIBRARY_THREADS_F) + set(WITH_FFTW3_THREADS_F_SUPPORT ON) + else() + set(WITH_FFTW3_THREADS_F_SUPPORT OFF) + endif() endif() mark_as_advanced( FFTW3_INCLUDE_DIR FFTW3_LIBRARY_F + FFTW3_LIBRARY_THREADS_F FFTW3_LIBRARY_D ) diff --git a/source/blender/blenlib/BLI_fftw.hh b/source/blender/blenlib/BLI_fftw.hh new file mode 100644 index 00000000000..f04a9627658 --- /dev/null +++ b/source/blender/blenlib/BLI_fftw.hh @@ -0,0 +1,22 @@ +/* SPDX-FileCopyrightText: 2024 Blender Authors + * + * SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "BLI_math_vector_types.hh" + +#pragma once + +namespace blender::fftw { + +/* FFTW's real to complex and complex to real transforms are more efficient when their input has a + * specific size. This function finds the most optimal size that is more than or equal the given + * size. The input data can then be zero padded to the optimal size for better performance. See + * Section 4.3.3 Real-data DFTs in the FFTW manual for more information. */ +int optimal_size_for_real_transform(int size); +int2 optimal_size_for_real_transform(int2 size); + +/* Initialize the float variant of FFTW. This essentially setup the multithreading hooks to enable + * multithreading using TBB's parallel_for and makes the FFTW planner thread safe. */ +void initialize_float(); + +} // namespace blender::fftw diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt index 412044ecabb..59cef66f814 100644 --- a/source/blender/blenlib/CMakeLists.txt +++ b/source/blender/blenlib/CMakeLists.txt @@ -65,6 +65,7 @@ set(SRC intern/easing.c intern/endian_switch.c intern/expr_pylike_eval.c + intern/fftw.cc intern/fileops.cc intern/fileops_c.cc intern/filereader_file.c @@ -225,6 +226,7 @@ set(SRC BLI_endian_switch_inline.h BLI_enumerable_thread_specific.hh BLI_expr_pylike_eval.h + BLI_fftw.hh BLI_fileops.h BLI_fileops.hh BLI_fileops_types.h @@ -448,6 +450,19 @@ if(WITH_GMP) ) endif() +if(WITH_FFTW3) + list(APPEND INC_SYS + ${FFTW3_INCLUDE_DIRS} + ) + list(APPEND LIB + ${FFTW3_LIBRARIES} + ) + add_definitions(-DWITH_FFTW3) + if (WITH_FFTW3_THREADS_F_SUPPORT) + add_definitions(-DWITH_FFTW3_THREADS_F_SUPPORT) + endif() +endif() + if(WIN32) if(WITH_BLENDER_THUMBNAILER) # Needed for querying the `thumbnailer .dll` in `winstuff.c`. diff --git a/source/blender/blenlib/intern/fftw.cc b/source/blender/blenlib/intern/fftw.cc new file mode 100644 index 00000000000..cdae65a7f2f --- /dev/null +++ b/source/blender/blenlib/intern/fftw.cc @@ -0,0 +1,93 @@ +/* SPDX-FileCopyrightText: 2024 Blender Authors + * + * SPDX-License-Identifier: GPL-2.0-or-later */ + +#if defined(WITH_FFTW3_THREADS_F_SUPPORT) +# include +#endif + +#include "BLI_index_range.hh" +#include "BLI_math_vector_types.hh" +#include "BLI_task.hh" +#include "BLI_threads.h" + +namespace blender::fftw { + +/* Identifies if the given number is a 7-smooth number. */ +static bool is_humble_number(int n) +{ + if (n <= 1) { + return true; + } + if (n % 2 == 0) { + return is_humble_number(n / 2); + } + if (n % 3 == 0) { + return is_humble_number(n / 3); + } + if (n % 5 == 0) { + return is_humble_number(n / 5); + } + if (n % 7 == 0) { + return is_humble_number(n / 7); + } + return false; +} + +/* Finds the even humble number larger than or equal the given number. */ +static int find_next_even_humble_number(int n) +{ + if (n % 2 == 1) { + n++; + } + + while (true) { + if (is_humble_number(n)) { + return n; + } + n += 2; + } + + return n; +} + +/* FFTW is best at handling sizes of the form 2^a * 3^b * 5^c * 7^d * 11^e * 13^f, where e + f is + * either 0 or 1, and the other exponents are arbitrary. And it is beneficial for the size to be + * evene for real transforms. To simplify computation, we ignore the 11 and 13 factors and find the + * even humble number that is more then or equal the given size. See Section 4.3.3 Real-data DFTs + * in the FFTW manual for more information. */ +int optimal_size_for_real_transform(int size) +{ + return find_next_even_humble_number(size); +} + +int2 optimal_size_for_real_transform(int2 size) +{ + return int2(optimal_size_for_real_transform(size.x), optimal_size_for_real_transform(size.y)); +} + +/* See Section 5.2 Usage of Multi-threaded FFTW in the FFTW manual for more information. */ +[[maybe_unused]] static void tbb_parallel_loop_for_fftw(void *(*work)(char *), + char *job_data, + size_t element_size, + int number_of_jobs, + void * /* data */) +{ + threading::parallel_for(IndexRange(number_of_jobs), 1, [&](const IndexRange sub_range) { + for (const int64_t i : sub_range) { + work(job_data + element_size * i); + } + }); +} + +void initialize_float() +{ +#if defined(WITH_FFTW3_THREADS_F_SUPPORT) + fftwf_init_threads(); + fftwf_make_planner_thread_safe(); + fftwf_plan_with_nthreads(BLI_system_thread_count()); + fftwf_threads_set_callback(tbb_parallel_loop_for_fftw, nullptr); +#endif +} + +} // namespace blender::fftw diff --git a/source/blender/compositor/CMakeLists.txt b/source/blender/compositor/CMakeLists.txt index d6ab78678b4..c58371cd3ea 100644 --- a/source/blender/compositor/CMakeLists.txt +++ b/source/blender/compositor/CMakeLists.txt @@ -588,6 +588,16 @@ if(WITH_COMPOSITOR_CPU) ) endif() + if(WITH_FFTW3) + list(APPEND INC_SYS + ${FFTW3_INCLUDE_DIRS} + ) + list(APPEND LIB + ${FFTW3_LIBRARIES} + ) + add_definitions(-DWITH_FFTW3) + endif() + blender_add_lib(bf_compositor "${SRC}" "${INC}" "${INC_SYS}" "${LIB}") blender_set_target_unity_build(bf_compositor 10) diff --git a/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc b/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc index 283c3e09d5c..bf3684de2b3 100644 --- a/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc +++ b/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc @@ -2,423 +2,221 @@ * * SPDX-License-Identifier: GPL-2.0-or-later */ +#include +#include +#include +#include + +#if defined(WITH_FFTW3) +# include +#endif + +#include "BLI_enumerable_thread_specific.hh" +#include "BLI_fftw.hh" +#include "BLI_index_range.hh" #include "BLI_math_base.hh" -#include "BLI_math_vector.h" +#include "BLI_math_vector_types.hh" +#include "BLI_task.hh" #include "COM_GlareFogGlowOperation.h" namespace blender::compositor { -/* - * 2D Fast Hartley Transform, used for convolution - */ - -using fREAL = float; - -/* Returns next highest power of 2 of x, as well its log2 in L2. */ -static uint next_pow2(uint x, uint *L2) +/* Given the x and y location in the range from 0 to kernel_size - 1, where kernel_size is odd, + * compute the fog glow kernel value. The equations are arbitrary and were chosen using visual + * judgement. The kernel is not normalized and need normalization. */ +[[maybe_unused]] static float compute_fog_glow_kernel_value(int x, int y, int kernel_size) { - uint pw, x_notpow2 = x & (x - 1); - *L2 = 0; - while (x >>= 1) { - ++(*L2); - } - pw = 1 << (*L2); - if (x_notpow2) { - (*L2)++; - pw <<= 1; - } - return pw; + const int half_kernel_size = kernel_size / 2; + const float scale = 0.25f * math::sqrt(math::square(kernel_size)); + const float v = ((y - half_kernel_size) / float(half_kernel_size)); + const float u = ((x - half_kernel_size) / float(half_kernel_size)); + const float r = (math::square(u) + math::square(v)) * scale; + const float d = -math::sqrt(math::sqrt(math::sqrt(r))) * 9.0f; + const float kernel_value = math::exp(d); + + const float window = (0.5f + 0.5f * math::cos(u * math::numbers::pi)) * + (0.5f + 0.5f * math::cos(v * math::numbers::pi)); + const float windowed_kernel_value = window * kernel_value; + + return windowed_kernel_value; } -//------------------------------------------------------------------------------ - -/* From FXT library by Joerg Arndt, faster in order bit-reversal - * use: `r = revbin_upd(r, h)` where `h = N>>1`. */ -static uint revbin_upd(uint r, uint h) -{ - while (!((r ^= h) & h)) { - h >>= 1; - } - return r; -} -//------------------------------------------------------------------------------ -static void FHT(fREAL *data, uint M, uint inverse) -{ - double tt, fc, dc, fs, ds, a = M_PI; - fREAL t1, t2; - int n2, bd, bl, istep, k, len = 1 << M, n = 1; - - int i, j = 0; - uint Nh = len >> 1; - for (i = 1; i < (len - 1); i++) { - j = revbin_upd(j, Nh); - if (j > i) { - t1 = data[i]; - data[i] = data[j]; - data[j] = t1; - } - } - - do { - fREAL *data_n = &data[n]; - - istep = n << 1; - for (k = 0; k < len; k += istep) { - t1 = data_n[k]; - data_n[k] = data[k] - t1; - data[k] += t1; - } - - n2 = n >> 1; - if (n > 2) { - fc = dc = cos(a); - fs = ds = sqrt(1.0 - fc * fc); // sin(a); - bd = n - 2; - for (bl = 1; bl < n2; bl++) { - fREAL *data_nbd = &data_n[bd]; - fREAL *data_bd = &data[bd]; - for (k = bl; k < len; k += istep) { - t1 = fc * double(data_n[k]) + fs * double(data_nbd[k]); - t2 = fs * double(data_n[k]) - fc * double(data_nbd[k]); - data_n[k] = data[k] - t1; - data_nbd[k] = data_bd[k] - t2; - data[k] += t1; - data_bd[k] += t2; - } - tt = fc * dc - fs * ds; - fs = fs * dc + fc * ds; - fc = tt; - bd -= 2; - } - } - - if (n > 1) { - for (k = n2; k < len; k += istep) { - t1 = data_n[k]; - data_n[k] = data[k] - t1; - data[k] += t1; - } - } - - n = istep; - a *= 0.5; - } while (n < len); - - if (inverse) { - fREAL sc = (fREAL)1 / (fREAL)len; - for (k = 0; k < len; k++) { - data[k] *= sc; - } - } -} -//------------------------------------------------------------------------------ -/* 2D Fast Hartley Transform, Mx/My -> log2 of width/height, - * nzp -> the row where zero pad data starts, - * inverse -> see above. */ -static void FHT2D(fREAL *data, uint Mx, uint My, uint nzp, uint inverse) -{ - uint i, j, Nx, Ny, maxy; - - Nx = 1 << Mx; - Ny = 1 << My; - - /* Rows (forward transform skips 0 pad data). */ - maxy = inverse ? Ny : nzp; - for (j = 0; j < maxy; j++) { - FHT(&data[Nx * j], Mx, inverse); - } - - /* Transpose data. */ - if (Nx == Ny) { /* Square. */ - for (j = 0; j < Ny; j++) { - for (i = j + 1; i < Nx; i++) { - uint op = i + (j << Mx), np = j + (i << My); - std::swap(data[op], data[np]); - } - } - } - else { /* Rectangular. */ - uint k, Nym = Ny - 1, stm = 1 << (Mx + My); - for (i = 0; stm > 0; i++) { -#define PRED(k) (((k & Nym) << Mx) + (k >> My)) - for (j = PRED(i); j > i; j = PRED(j)) { - /* Pass. */ - } - if (j < i) { - continue; - } - for (k = i, j = PRED(i); j != i; k = j, j = PRED(j), stm--) { - std::swap(data[j], data[k]); - } -#undef PRED - stm--; - } - } - - std::swap(Nx, Ny); - std::swap(Mx, My); - - /* Now columns == transposed rows. */ - for (j = 0; j < Ny; j++) { - FHT(&data[Nx * j], Mx, inverse); - } - - /* Finalize. */ - for (j = 0; j <= (Ny >> 1); j++) { - uint jm = (Ny - j) & (Ny - 1); - uint ji = j << Mx; - uint jmi = jm << Mx; - for (i = 0; i <= (Nx >> 1); i++) { - uint im = (Nx - i) & (Nx - 1); - fREAL A = data[ji + i]; - fREAL B = data[jmi + i]; - fREAL C = data[ji + im]; - fREAL D = data[jmi + im]; - fREAL E = (fREAL)0.5 * ((A + D) - (B + C)); - data[ji + i] = A - E; - data[jmi + i] = B + E; - data[ji + im] = C + E; - data[jmi + im] = D - E; - } - } -} - -//------------------------------------------------------------------------------ - -/* 2D convolution calc, d1 *= d2, M/N - > log2 of width/height. */ -static void fht_convolve(fREAL *d1, const fREAL *d2, uint M, uint N) -{ - fREAL a, b; - uint i, j, k, L, mj, mL; - uint m = 1 << M, n = 1 << N; - uint m2 = 1 << (M - 1), n2 = 1 << (N - 1); - uint mn2 = m << (N - 1); - - d1[0] *= d2[0]; - d1[mn2] *= d2[mn2]; - d1[m2] *= d2[m2]; - d1[m2 + mn2] *= d2[m2 + mn2]; - for (i = 1; i < m2; i++) { - k = m - i; - a = d1[i] * d2[i] - d1[k] * d2[k]; - b = d1[k] * d2[i] + d1[i] * d2[k]; - d1[i] = (b + a) * (fREAL)0.5; - d1[k] = (b - a) * (fREAL)0.5; - a = d1[i + mn2] * d2[i + mn2] - d1[k + mn2] * d2[k + mn2]; - b = d1[k + mn2] * d2[i + mn2] + d1[i + mn2] * d2[k + mn2]; - d1[i + mn2] = (b + a) * (fREAL)0.5; - d1[k + mn2] = (b - a) * (fREAL)0.5; - } - for (j = 1; j < n2; j++) { - L = n - j; - mj = j << M; - mL = L << M; - a = d1[mj] * d2[mj] - d1[mL] * d2[mL]; - b = d1[mL] * d2[mj] + d1[mj] * d2[mL]; - d1[mj] = (b + a) * (fREAL)0.5; - d1[mL] = (b - a) * (fREAL)0.5; - a = d1[m2 + mj] * d2[m2 + mj] - d1[m2 + mL] * d2[m2 + mL]; - b = d1[m2 + mL] * d2[m2 + mj] + d1[m2 + mj] * d2[m2 + mL]; - d1[m2 + mj] = (b + a) * (fREAL)0.5; - d1[m2 + mL] = (b - a) * (fREAL)0.5; - } - for (i = 1; i < m2; i++) { - k = m - i; - for (j = 1; j < n2; j++) { - L = n - j; - mj = j << M; - mL = L << M; - a = d1[i + mj] * d2[i + mj] - d1[k + mL] * d2[k + mL]; - b = d1[k + mL] * d2[i + mj] + d1[i + mj] * d2[k + mL]; - d1[i + mj] = (b + a) * (fREAL)0.5; - d1[k + mL] = (b - a) * (fREAL)0.5; - a = d1[i + mL] * d2[i + mL] - d1[k + mj] * d2[k + mj]; - b = d1[k + mj] * d2[i + mL] + d1[i + mL] * d2[k + mj]; - d1[i + mL] = (b + a) * (fREAL)0.5; - d1[k + mj] = (b - a) * (fREAL)0.5; - } - } -} -//------------------------------------------------------------------------------ - -static void convolve(float *dst, MemoryBuffer *in1, MemoryBuffer *in2) -{ - fREAL *data1, *data2, *fp; - uint w2, h2, hw, hh, log2_w, log2_h; - fRGB wt, *colp; - int x, y, ch; - int xbl, ybl, nxb, nyb, xbsz, ybsz; - bool in2done = false; - const uint kernel_width = in2->get_width(); - const uint kernel_height = in2->get_height(); - const uint image_width = in1->get_width(); - const uint image_height = in1->get_height(); - float *kernel_buffer = in2->get_buffer(); - float *image_buffer = in1->get_buffer(); - - MemoryBuffer *rdst = new MemoryBuffer(DataType::Color, in1->get_rect()); - memset(rdst->get_buffer(), - 0, - rdst->get_width() * rdst->get_height() * COM_DATA_TYPE_COLOR_CHANNELS * sizeof(float)); - - /* Convolution result width & height. */ - w2 = 2 * kernel_width - 1; - h2 = 2 * kernel_height - 1; - /* FFT pow2 required size & log2. */ - w2 = next_pow2(w2, &log2_w); - h2 = next_pow2(h2, &log2_h); - - /* Allocate space. */ - data1 = (fREAL *)MEM_callocN(3 * w2 * h2 * sizeof(fREAL), "convolve_fast FHT data1"); - data2 = (fREAL *)MEM_callocN(w2 * h2 * sizeof(fREAL), "convolve_fast FHT data2"); - - /* Normalize convolution. */ - wt[0] = wt[1] = wt[2] = 0.0f; - for (y = 0; y < kernel_height; y++) { - colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS]; - for (x = 0; x < kernel_width; x++) { - add_v3_v3(wt, colp[x]); - } - } - if (wt[0] != 0.0f) { - wt[0] = 1.0f / wt[0]; - } - if (wt[1] != 0.0f) { - wt[1] = 1.0f / wt[1]; - } - if (wt[2] != 0.0f) { - wt[2] = 1.0f / wt[2]; - } - for (y = 0; y < kernel_height; y++) { - colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS]; - for (x = 0; x < kernel_width; x++) { - mul_v3_v3(colp[x], wt); - } - } - - /* Copy image data, unpacking interleaved RGBA into separate channels - * only need to calc data1 once. */ - - /* Block add-overlap. */ - hw = kernel_width >> 1; - hh = kernel_height >> 1; - xbsz = (w2 + 1) - kernel_width; - ybsz = (h2 + 1) - kernel_height; - nxb = image_width / xbsz; - if (image_width % xbsz) { - nxb++; - } - nyb = image_height / ybsz; - if (image_height % ybsz) { - nyb++; - } - for (ybl = 0; ybl < nyb; ybl++) { - for (xbl = 0; xbl < nxb; xbl++) { - - /* Each channel one by one. */ - for (ch = 0; ch < 3; ch++) { - fREAL *data1ch = &data1[ch * w2 * h2]; - - /* Only need to calc fht data from in2 once, can re-use for every block. */ - if (!in2done) { - /* in2, channel ch -> data1 */ - for (y = 0; y < kernel_height; y++) { - fp = &data1ch[y * w2]; - colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS]; - for (x = 0; x < kernel_width; x++) { - fp[x] = colp[x][ch]; - } - } - } - - /* in1, channel ch -> data2 */ - memset(data2, 0, w2 * h2 * sizeof(fREAL)); - for (y = 0; y < ybsz; y++) { - int yy = ybl * ybsz + y; - if (yy >= image_height) { - continue; - } - fp = &data2[y * w2]; - colp = (fRGB *)&image_buffer[yy * image_width * COM_DATA_TYPE_COLOR_CHANNELS]; - for (x = 0; x < xbsz; x++) { - int xx = xbl * xbsz + x; - if (xx >= image_width) { - continue; - } - fp[x] = colp[xx][ch]; - } - } - - /* Forward FHT - * zero pad data start is different for each == height+1. */ - if (!in2done) { - FHT2D(data1ch, log2_w, log2_h, kernel_height + 1, 0); - } - FHT2D(data2, log2_w, log2_h, kernel_height + 1, 0); - - /* FHT2D transposed data, row/col now swapped - * convolve & inverse FHT. */ - fht_convolve(data2, data1ch, log2_h, log2_w); - FHT2D(data2, log2_h, log2_w, 0, 1); - /* Data again transposed, so in order again. */ - - /* Overlap-add result. */ - for (y = 0; y < int(h2); y++) { - const int yy = ybl * ybsz + y - hh; - if ((yy < 0) || (yy >= image_height)) { - continue; - } - fp = &data2[y * w2]; - colp = (fRGB *)&rdst->get_buffer()[yy * image_width * COM_DATA_TYPE_COLOR_CHANNELS]; - for (x = 0; x < int(w2); x++) { - const int xx = xbl * xbsz + x - hw; - if ((xx < 0) || (xx >= image_width)) { - continue; - } - colp[xx][ch] += fp[x]; - } - } - } - in2done = true; - } - } - - MEM_freeN(data2); - MEM_freeN(data1); - memcpy(dst, - rdst->get_buffer(), - sizeof(float) * image_width * image_height * COM_DATA_TYPE_COLOR_CHANNELS); - delete (rdst); -} - -void GlareFogGlowOperation::generate_glare(float *data, - MemoryBuffer *input_image, +void GlareFogGlowOperation::generate_glare(float *output, + MemoryBuffer *image, const NodeGlare *settings) { - const int kernel_size = 1 << settings->size; - MemoryBuffer kernel = MemoryBuffer(DataType::Color, kernel_size, kernel_size); +#if defined(WITH_FFTW3) + fftw::initialize_float(); - const float scale = 0.25f * math::sqrt(math::square(kernel_size)); + /* We use an odd sized kernel since an even one will typically introduce a tiny offset as it has + * no exact center value. */ + const int kernel_size = (1 << settings->size) + 1; - for (int y = 0; y < kernel_size; y++) { - const float v = 2.0f * (y / float(kernel_size)) - 1.0f; - for (int x = 0; x < kernel_size; x++) { - const float u = 2.0f * (x / float(kernel_size)) - 1.0f; - const float r = (math::square(u) + math::square(v)) * scale; - const float d = -math::sqrt(math::sqrt(math::sqrt(r))) * 9.0f; - const float kernel_value = math::exp(d); + /* Since we will be doing a circular convolution, we need to zero pad our input image by half the + * kernel size to avoid the kernel affecting the pixels at the other side of image. Therefore, + * zero boundary is assumed. */ + const int needed_padding_amount = kernel_size / 2; + const int2 image_size = int2(image->get_width(), image->get_height()); + const int2 needed_spatial_size = image_size + needed_padding_amount; + const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size); - const float window = (0.5f + 0.5f * math::cos(u * math::numbers::pi)) * - (0.5f + 0.5f * math::cos(v * math::numbers::pi)); - const float windowed_kernel_value = window * kernel_value; + /* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and + * stores only half the output since the other half is redundant, so we only allocate half of the + * first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for more + * information. */ + const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y); - copy_v3_fl(kernel.get_elem(x, y), windowed_kernel_value); - kernel.get_elem(x, y)[3] = 1.0f; + float *kernel_spatial_domain = fftwf_alloc_real(spatial_size.x * spatial_size.y); + std::complex *kernel_frequency_domain = reinterpret_cast *>( + fftwf_alloc_complex(frequency_size.x * frequency_size.y)); + + /* Create a real to complex plan to transform the kernel to the frequency domain, but the same + * plan will be used for the image since they both have the same dimensions. */ + fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d( + spatial_size.y, + spatial_size.x, + kernel_spatial_domain, + reinterpret_cast(kernel_frequency_domain), + FFTW_ESTIMATE); + + /* Use a double to sum the kernel since floats are not stable with threaded summation. */ + threading::EnumerableThreadSpecific sum_by_thread([]() { return 0.0; }); + + /* Compute the kernel while zero padding to match the padded image size. */ + threading::parallel_for(IndexRange(spatial_size.y), 1, [&](const IndexRange sub_y_range) { + for (const int64_t y : sub_y_range) { + for (const int64_t x : IndexRange(spatial_size.x)) { + /* We offset the computed kernel with wrap around such that it is centered at the zero + * point, which is the expected format for doing circular convolutions in the frequency + * domain. */ + const int half_kernel_size = kernel_size / 2; + int64_t output_x = mod_i(x - half_kernel_size, spatial_size.x); + int64_t output_y = mod_i(y - half_kernel_size, spatial_size.y); + + const bool is_inside_kernel = x < kernel_size && y < kernel_size; + if (is_inside_kernel) { + const float kernel_value = compute_fog_glow_kernel_value(x, y, kernel_size); + kernel_spatial_domain[output_x + output_y * spatial_size.x] = kernel_value; + sum_by_thread.local() += kernel_value; + } + else { + kernel_spatial_domain[output_x + output_y * spatial_size.x] = 0.0f; + } + } } - } + }); - convolve(data, input_image, &kernel); + /* Instead of normalizing the kernel now, we normalize it in the frequency domain since we will + * be doing sample normalization anyways. This is okay since the Fourier transform is linear. */ + const float kernel_normalization_factor = float( + std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), 0.0)); + + fftwf_execute_dft_r2c(forward_plan, + kernel_spatial_domain, + reinterpret_cast(kernel_frequency_domain)); + fftwf_free(kernel_spatial_domain); + + /* We only process the color channels, the alpha channel is written to the output as is. */ + const int channels_count = 3; + const int64_t spatial_pixels_per_channel = int64_t(spatial_size.x) * spatial_size.y; + const int64_t frequency_pixels_per_channel = int64_t(spatial_size.x) * spatial_size.y; + const int64_t spatial_pixels_count = spatial_pixels_per_channel * channels_count; + const int64_t frequency_pixels_count = frequency_pixels_per_channel * channels_count; + + float *image_spatial_domain = fftwf_alloc_real(spatial_pixels_count); + std::complex *image_frequency_domain = reinterpret_cast *>( + fftwf_alloc_complex(frequency_pixels_count)); + + /* Zero pad the image to the required spatial domain size, storing each channel in planar + * format for better cache locality, that is, RRRR...GGGG...BBBB. */ + threading::parallel_for(IndexRange(spatial_size.y), 1, [&](const IndexRange sub_y_range) { + for (const int64_t y : sub_y_range) { + for (const int64_t x : IndexRange(spatial_size.x)) { + const bool is_inside_image = x < image_size.x && y < image_size.y; + for (const int64_t channel : IndexRange(channels_count)) { + const int64_t base_index = x + y * spatial_size.x; + const int64_t output_index = base_index + spatial_pixels_per_channel * channel; + if (is_inside_image) { + image_spatial_domain[output_index] = image->get_elem(x, y)[channel]; + } + else { + image_spatial_domain[output_index] = 0.0f; + } + } + } + } + }); + + threading::parallel_for(IndexRange(channels_count), 1, [&](const IndexRange sub_range) { + for (const int64_t channel : sub_range) { + fftwf_execute_dft_r2c(forward_plan, + image_spatial_domain + spatial_pixels_per_channel * channel, + reinterpret_cast(image_frequency_domain) + + frequency_pixels_per_channel * channel); + } + }); + + /* Multiply the kernel and the image in the frequency domain to perform the convolution. The + * FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result + * in an image that is scaled by a factor of the product of the width and height, so we take + * that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms of + * the FFTW manual for more information. */ + const float normalization_scale = float(spatial_size.x) * spatial_size.y * + kernel_normalization_factor; + threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) { + for (const int64_t channel : IndexRange(channels_count)) { + for (const int64_t y : sub_y_range) { + for (const int64_t x : IndexRange(frequency_size.x)) { + const int64_t base_index = x + y * frequency_size.x; + const int64_t output_index = base_index + frequency_pixels_per_channel * channel; + const std::complex kernel_value = kernel_frequency_domain[base_index]; + image_frequency_domain[output_index] *= kernel_value / normalization_scale; + } + } + } + }); + + /* Create a complex to real plan to transform the image to the real domain. */ + fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d( + spatial_size.y, + spatial_size.x, + reinterpret_cast(image_frequency_domain), + image_spatial_domain, + FFTW_ESTIMATE); + + threading::parallel_for(IndexRange(channels_count), 1, [&](const IndexRange sub_range) { + for (const int64_t channel : sub_range) { + fftwf_execute_dft_c2r(backward_plan, + reinterpret_cast(image_frequency_domain) + + frequency_pixels_per_channel * channel, + image_spatial_domain + spatial_pixels_per_channel * channel); + } + }); + + /* Copy the result to the output. */ + threading::parallel_for(IndexRange(image_size.y), 1, [&](const IndexRange sub_y_range) { + for (const int64_t y : sub_y_range) { + for (const int64_t x : IndexRange(image_size.x)) { + for (const int64_t channel : IndexRange(channels_count)) { + const int64_t output_index = (x + y * image_size.x) * image->get_num_channels(); + const int64_t base_index = x + y * spatial_size.x; + const int64_t input_index = base_index + spatial_pixels_per_channel * channel; + output[output_index + channel] = image_spatial_domain[input_index]; + output[output_index + 3] = image->get_buffer()[output_index + 3]; + } + } + } + }); + + fftwf_destroy_plan(forward_plan); + fftwf_destroy_plan(backward_plan); + fftwf_free(image_spatial_domain); + fftwf_free(image_frequency_domain); + fftwf_free(kernel_frequency_domain); +#else + UNUSED_VARS(output, image, settings); +#endif } } // namespace blender::compositor diff --git a/source/blender/nodes/composite/CMakeLists.txt b/source/blender/nodes/composite/CMakeLists.txt index 90acb4b0d40..9ed62e073a7 100644 --- a/source/blender/nodes/composite/CMakeLists.txt +++ b/source/blender/nodes/composite/CMakeLists.txt @@ -157,6 +157,11 @@ if(WITH_OPENIMAGEDENOISE) ) endif() + +if(WITH_FFTW3) + add_definitions(-DWITH_FFTW3) +endif() + blender_add_lib(bf_nodes_composite "${SRC}" "${INC}" "${INC_SYS}" "${LIB}") blender_set_target_unity_build(bf_nodes_composite 10) diff --git a/source/blender/nodes/composite/nodes/node_composite_glare.cc b/source/blender/nodes/composite/nodes/node_composite_glare.cc index 3b74960eb50..2f3a7d34b6f 100644 --- a/source/blender/nodes/composite/nodes/node_composite_glare.cc +++ b/source/blender/nodes/composite/nodes/node_composite_glare.cc @@ -68,10 +68,16 @@ static void node_composit_init_glare(bNodeTree * /*ntree*/, bNode *node) static void node_composit_buts_glare(uiLayout *layout, bContext * /*C*/, PointerRNA *ptr) { + const int glare_type = RNA_enum_get(ptr, "glare_type"); +#ifndef WITH_FFTW3 + if (glare_type == CMP_NODE_GLARE_FOG_GLOW) { + uiItemL(layout, RPT_("Disabled, built without FFTW"), ICON_ERROR); + } +#endif + uiItemR(layout, ptr, "glare_type", UI_ITEM_R_SPLIT_EMPTY_NAME, "", ICON_NONE); uiItemR(layout, ptr, "quality", UI_ITEM_R_SPLIT_EMPTY_NAME, "", ICON_NONE); - const int glare_type = RNA_enum_get(ptr, "glare_type"); if (ELEM(glare_type, CMP_NODE_GLARE_SIMPLE_STAR, CMP_NODE_GLARE_GHOST, CMP_NODE_GLARE_STREAKS)) { uiItemR(layout, ptr, "iterations", UI_ITEM_R_SPLIT_EMPTY_NAME, nullptr, ICON_NONE); }