diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78918ba085f..22cadad665b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -306,7 +306,7 @@ option(WITH_IK_SOLVER "\
 Enable Legacy IK solver (only disable for development)"
   ON
 )
-option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke, ocean sim, and audio effects)" ON)
+option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke, ocean sim, glare, and audio effects)" ON)
 option(WITH_PUGIXML "Enable PugiXML support (Used for OpenImageIO, Grease Pencil SVG export)" ON)
 option(WITH_BULLET "Enable Bullet (Physics Engine)" ON)
 option(WITH_SYSTEM_BULLET "\
diff --git a/build_files/cmake/Modules/FindFftw3.cmake b/build_files/cmake/Modules/FindFftw3.cmake
index 5950c2f61c6..3a13d35f368 100644
--- a/build_files/cmake/Modules/FindFftw3.cmake
+++ b/build_files/cmake/Modules/FindFftw3.cmake
@@ -11,9 +11,11 @@
 #  FFTW3_ROOT_DIR, The base directory to search for Fftw3.
 #                    This can also be an environment variable.
 #  FFTW3_FOUND, If false, do not try to use Fftw3.
+#  WITH_FFTW3_THREADS_F_SUPPORT, if true, single precision Fftw3 supports threads.
 #
 # also defined, but not for general use are
 #  FFTW3_LIBRARY_F, where to find the Fftw3 library (single precision float).
+#  FFTW3_LIBRARY_THREADS_F, where to find the Fftw3 threads library (single precision float).
 #  FFTW3_LIBRARY_D, where to find the Fftw3 library (double precision float).
 
 # If `FFTW3_ROOT_DIR` was defined in the environment, use it.
@@ -49,6 +51,15 @@ find_library(FFTW3_LIBRARY_F
     lib64 lib
   )
 
+find_library(FFTW3_LIBRARY_THREADS_F
+  NAMES
+    fftw3f_threads
+  HINTS
+    ${_fftw3_SEARCH_DIRS}
+  PATH_SUFFIXES
+    lib64 lib
+  )
+
 find_library(FFTW3_LIBRARY_D
   NAMES
     fftw3
@@ -60,6 +71,9 @@ find_library(FFTW3_LIBRARY_D
 
 list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_F}")
 list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_D}")
+if(FFTW3_LIBRARY_THREADS_F)
+  list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_THREADS_F}")
+endif()
 
 # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if
 # all listed variables are TRUE
@@ -70,12 +84,18 @@ find_package_handle_standard_args(Fftw3 DEFAULT_MSG
 if(FFTW3_FOUND)
   set(FFTW3_LIBRARIES ${_FFTW3_LIBRARIES})
   set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
+  if(FFTW3_LIBRARY_THREADS_F)
+    set(WITH_FFTW3_THREADS_F_SUPPORT ON)
+  else()
+    set(WITH_FFTW3_THREADS_F_SUPPORT OFF)
+  endif()
 endif()
 
 
 mark_as_advanced(
   FFTW3_INCLUDE_DIR
   FFTW3_LIBRARY_F
+  FFTW3_LIBRARY_THREADS_F
   FFTW3_LIBRARY_D
 )
 
diff --git a/source/blender/blenlib/BLI_fftw.hh b/source/blender/blenlib/BLI_fftw.hh
new file mode 100644
index 00000000000..f04a9627658
--- /dev/null
+++ b/source/blender/blenlib/BLI_fftw.hh
@@ -0,0 +1,22 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "BLI_math_vector_types.hh"
+
+#pragma once
+
+namespace blender::fftw {
+
+/* FFTW's real to complex and complex to real transforms are more efficient when their input has a
+ * specific size. This function finds the most optimal size that is more than or equal the given
+ * size. The input data can then be zero padded to the optimal size for better performance. See
+ * Section 4.3.3 Real-data DFTs in the FFTW manual for more information. */
+int optimal_size_for_real_transform(int size);
+int2 optimal_size_for_real_transform(int2 size);
+
+/* Initialize the float variant of FFTW. This essentially setup the multithreading hooks to enable
+ * multithreading using TBB's parallel_for and makes the FFTW planner thread safe. */
+void initialize_float();
+
+}  // namespace blender::fftw
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 412044ecabb..59cef66f814 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -65,6 +65,7 @@ set(SRC
   intern/easing.c
   intern/endian_switch.c
   intern/expr_pylike_eval.c
+  intern/fftw.cc
   intern/fileops.cc
   intern/fileops_c.cc
   intern/filereader_file.c
@@ -225,6 +226,7 @@ set(SRC
   BLI_endian_switch_inline.h
   BLI_enumerable_thread_specific.hh
   BLI_expr_pylike_eval.h
+  BLI_fftw.hh
   BLI_fileops.h
   BLI_fileops.hh
   BLI_fileops_types.h
@@ -448,6 +450,19 @@ if(WITH_GMP)
   )
 endif()
 
+if(WITH_FFTW3)
+  list(APPEND INC_SYS
+    ${FFTW3_INCLUDE_DIRS}
+  )
+  list(APPEND LIB
+    ${FFTW3_LIBRARIES}
+  )
+  add_definitions(-DWITH_FFTW3)
+  if (WITH_FFTW3_THREADS_F_SUPPORT)
+    add_definitions(-DWITH_FFTW3_THREADS_F_SUPPORT)
+  endif()
+endif()
+
 if(WIN32)
   if(WITH_BLENDER_THUMBNAILER)
     # Needed for querying the `thumbnailer .dll` in `winstuff.c`.
diff --git a/source/blender/blenlib/intern/fftw.cc b/source/blender/blenlib/intern/fftw.cc
new file mode 100644
index 00000000000..cdae65a7f2f
--- /dev/null
+++ b/source/blender/blenlib/intern/fftw.cc
@@ -0,0 +1,93 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#if defined(WITH_FFTW3_THREADS_F_SUPPORT)
+#  include <fftw3.h>
+#endif
+
+#include "BLI_index_range.hh"
+#include "BLI_math_vector_types.hh"
+#include "BLI_task.hh"
+#include "BLI_threads.h"
+
+namespace blender::fftw {
+
+/* Identifies if the given number is a 7-smooth number. */
+static bool is_humble_number(int n)
+{
+  if (n <= 1) {
+    return true;
+  }
+  if (n % 2 == 0) {
+    return is_humble_number(n / 2);
+  }
+  if (n % 3 == 0) {
+    return is_humble_number(n / 3);
+  }
+  if (n % 5 == 0) {
+    return is_humble_number(n / 5);
+  }
+  if (n % 7 == 0) {
+    return is_humble_number(n / 7);
+  }
+  return false;
+}
+
+/* Finds the even humble number larger than or equal the given number. */
+static int find_next_even_humble_number(int n)
+{
+  if (n % 2 == 1) {
+    n++;
+  }
+
+  while (true) {
+    if (is_humble_number(n)) {
+      return n;
+    }
+    n += 2;
+  }
+
+  return n;
+}
+
+/* FFTW is best at handling sizes of the form 2^a * 3^b * 5^c * 7^d * 11^e * 13^f, where e + f is
+ * either 0 or 1, and the other exponents are arbitrary. And it is beneficial for the size to be
+ * evene for real transforms. To simplify computation, we ignore the 11 and 13 factors and find the
+ * even humble number that is more then or equal the given size. See Section 4.3.3 Real-data DFTs
+ * in the FFTW manual for more information. */
+int optimal_size_for_real_transform(int size)
+{
+  return find_next_even_humble_number(size);
+}
+
+int2 optimal_size_for_real_transform(int2 size)
+{
+  return int2(optimal_size_for_real_transform(size.x), optimal_size_for_real_transform(size.y));
+}
+
+/* See Section 5.2 Usage of Multi-threaded FFTW in the FFTW manual for more information. */
+[[maybe_unused]] static void tbb_parallel_loop_for_fftw(void *(*work)(char *),
+                                                        char *job_data,
+                                                        size_t element_size,
+                                                        int number_of_jobs,
+                                                        void * /* data */)
+{
+  threading::parallel_for(IndexRange(number_of_jobs), 1, [&](const IndexRange sub_range) {
+    for (const int64_t i : sub_range) {
+      work(job_data + element_size * i);
+    }
+  });
+}
+
+void initialize_float()
+{
+#if defined(WITH_FFTW3_THREADS_F_SUPPORT)
+  fftwf_init_threads();
+  fftwf_make_planner_thread_safe();
+  fftwf_plan_with_nthreads(BLI_system_thread_count());
+  fftwf_threads_set_callback(tbb_parallel_loop_for_fftw, nullptr);
+#endif
+}
+
+}  // namespace blender::fftw
diff --git a/source/blender/compositor/CMakeLists.txt b/source/blender/compositor/CMakeLists.txt
index d6ab78678b4..c58371cd3ea 100644
--- a/source/blender/compositor/CMakeLists.txt
+++ b/source/blender/compositor/CMakeLists.txt
@@ -588,6 +588,16 @@ if(WITH_COMPOSITOR_CPU)
     )
   endif()
 
+  if(WITH_FFTW3)
+    list(APPEND INC_SYS
+      ${FFTW3_INCLUDE_DIRS}
+    )
+    list(APPEND LIB
+      ${FFTW3_LIBRARIES}
+    )
+    add_definitions(-DWITH_FFTW3)
+  endif()
+
   blender_add_lib(bf_compositor "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
   blender_set_target_unity_build(bf_compositor 10)
diff --git a/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc b/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc
index 283c3e09d5c..bf3684de2b3 100644
--- a/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc
+++ b/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc
@@ -2,423 +2,221 @@
  *
  * SPDX-License-Identifier: GPL-2.0-or-later */
 
+#include <complex>
+#include <cstring>
+#include <memory>
+#include <numeric>
+
+#if defined(WITH_FFTW3)
+#  include <fftw3.h>
+#endif
+
+#include "BLI_enumerable_thread_specific.hh"
+#include "BLI_fftw.hh"
+#include "BLI_index_range.hh"
 #include "BLI_math_base.hh"
-#include "BLI_math_vector.h"
+#include "BLI_math_vector_types.hh"
+#include "BLI_task.hh"
 
 #include "COM_GlareFogGlowOperation.h"
 
 namespace blender::compositor {
 
-/*
- *  2D Fast Hartley Transform, used for convolution
- */
-
-using fREAL = float;
-
-/* Returns next highest power of 2 of x, as well its log2 in L2. */
-static uint next_pow2(uint x, uint *L2)
+/* Given the x and y location in the range from 0 to kernel_size - 1, where kernel_size is odd,
+ * compute the fog glow kernel value. The equations are arbitrary and were chosen using visual
+ * judgement. The kernel is not normalized and need normalization. */
+[[maybe_unused]] static float compute_fog_glow_kernel_value(int x, int y, int kernel_size)
 {
-  uint pw, x_notpow2 = x & (x - 1);
-  *L2 = 0;
-  while (x >>= 1) {
-    ++(*L2);
-  }
-  pw = 1 << (*L2);
-  if (x_notpow2) {
-    (*L2)++;
-    pw <<= 1;
-  }
-  return pw;
+  const int half_kernel_size = kernel_size / 2;
+  const float scale = 0.25f * math::sqrt(math::square(kernel_size));
+  const float v = ((y - half_kernel_size) / float(half_kernel_size));
+  const float u = ((x - half_kernel_size) / float(half_kernel_size));
+  const float r = (math::square(u) + math::square(v)) * scale;
+  const float d = -math::sqrt(math::sqrt(math::sqrt(r))) * 9.0f;
+  const float kernel_value = math::exp(d);
+
+  const float window = (0.5f + 0.5f * math::cos(u * math::numbers::pi)) *
+                       (0.5f + 0.5f * math::cos(v * math::numbers::pi));
+  const float windowed_kernel_value = window * kernel_value;
+
+  return windowed_kernel_value;
 }
 
-//------------------------------------------------------------------------------
-
-/* From FXT library by Joerg Arndt, faster in order bit-reversal
- * use: `r = revbin_upd(r, h)` where `h = N>>1`. */
-static uint revbin_upd(uint r, uint h)
-{
-  while (!((r ^= h) & h)) {
-    h >>= 1;
-  }
-  return r;
-}
-//------------------------------------------------------------------------------
-static void FHT(fREAL *data, uint M, uint inverse)
-{
-  double tt, fc, dc, fs, ds, a = M_PI;
-  fREAL t1, t2;
-  int n2, bd, bl, istep, k, len = 1 << M, n = 1;
-
-  int i, j = 0;
-  uint Nh = len >> 1;
-  for (i = 1; i < (len - 1); i++) {
-    j = revbin_upd(j, Nh);
-    if (j > i) {
-      t1 = data[i];
-      data[i] = data[j];
-      data[j] = t1;
-    }
-  }
-
-  do {
-    fREAL *data_n = &data[n];
-
-    istep = n << 1;
-    for (k = 0; k < len; k += istep) {
-      t1 = data_n[k];
-      data_n[k] = data[k] - t1;
-      data[k] += t1;
-    }
-
-    n2 = n >> 1;
-    if (n > 2) {
-      fc = dc = cos(a);
-      fs = ds = sqrt(1.0 - fc * fc);  // sin(a);
-      bd = n - 2;
-      for (bl = 1; bl < n2; bl++) {
-        fREAL *data_nbd = &data_n[bd];
-        fREAL *data_bd = &data[bd];
-        for (k = bl; k < len; k += istep) {
-          t1 = fc * double(data_n[k]) + fs * double(data_nbd[k]);
-          t2 = fs * double(data_n[k]) - fc * double(data_nbd[k]);
-          data_n[k] = data[k] - t1;
-          data_nbd[k] = data_bd[k] - t2;
-          data[k] += t1;
-          data_bd[k] += t2;
-        }
-        tt = fc * dc - fs * ds;
-        fs = fs * dc + fc * ds;
-        fc = tt;
-        bd -= 2;
-      }
-    }
-
-    if (n > 1) {
-      for (k = n2; k < len; k += istep) {
-        t1 = data_n[k];
-        data_n[k] = data[k] - t1;
-        data[k] += t1;
-      }
-    }
-
-    n = istep;
-    a *= 0.5;
-  } while (n < len);
-
-  if (inverse) {
-    fREAL sc = (fREAL)1 / (fREAL)len;
-    for (k = 0; k < len; k++) {
-      data[k] *= sc;
-    }
-  }
-}
-//------------------------------------------------------------------------------
-/* 2D Fast Hartley Transform, Mx/My -> log2 of width/height,
- * nzp -> the row where zero pad data starts,
- * inverse -> see above. */
-static void FHT2D(fREAL *data, uint Mx, uint My, uint nzp, uint inverse)
-{
-  uint i, j, Nx, Ny, maxy;
-
-  Nx = 1 << Mx;
-  Ny = 1 << My;
-
-  /* Rows (forward transform skips 0 pad data). */
-  maxy = inverse ? Ny : nzp;
-  for (j = 0; j < maxy; j++) {
-    FHT(&data[Nx * j], Mx, inverse);
-  }
-
-  /* Transpose data. */
-  if (Nx == Ny) { /* Square. */
-    for (j = 0; j < Ny; j++) {
-      for (i = j + 1; i < Nx; i++) {
-        uint op = i + (j << Mx), np = j + (i << My);
-        std::swap(data[op], data[np]);
-      }
-    }
-  }
-  else { /* Rectangular. */
-    uint k, Nym = Ny - 1, stm = 1 << (Mx + My);
-    for (i = 0; stm > 0; i++) {
-#define PRED(k) (((k & Nym) << Mx) + (k >> My))
-      for (j = PRED(i); j > i; j = PRED(j)) {
-        /* Pass. */
-      }
-      if (j < i) {
-        continue;
-      }
-      for (k = i, j = PRED(i); j != i; k = j, j = PRED(j), stm--) {
-        std::swap(data[j], data[k]);
-      }
-#undef PRED
-      stm--;
-    }
-  }
-
-  std::swap(Nx, Ny);
-  std::swap(Mx, My);
-
-  /* Now columns == transposed rows. */
-  for (j = 0; j < Ny; j++) {
-    FHT(&data[Nx * j], Mx, inverse);
-  }
-
-  /* Finalize. */
-  for (j = 0; j <= (Ny >> 1); j++) {
-    uint jm = (Ny - j) & (Ny - 1);
-    uint ji = j << Mx;
-    uint jmi = jm << Mx;
-    for (i = 0; i <= (Nx >> 1); i++) {
-      uint im = (Nx - i) & (Nx - 1);
-      fREAL A = data[ji + i];
-      fREAL B = data[jmi + i];
-      fREAL C = data[ji + im];
-      fREAL D = data[jmi + im];
-      fREAL E = (fREAL)0.5 * ((A + D) - (B + C));
-      data[ji + i] = A - E;
-      data[jmi + i] = B + E;
-      data[ji + im] = C + E;
-      data[jmi + im] = D - E;
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-
-/* 2D convolution calc, d1 *= d2, M/N - > log2 of width/height. */
-static void fht_convolve(fREAL *d1, const fREAL *d2, uint M, uint N)
-{
-  fREAL a, b;
-  uint i, j, k, L, mj, mL;
-  uint m = 1 << M, n = 1 << N;
-  uint m2 = 1 << (M - 1), n2 = 1 << (N - 1);
-  uint mn2 = m << (N - 1);
-
-  d1[0] *= d2[0];
-  d1[mn2] *= d2[mn2];
-  d1[m2] *= d2[m2];
-  d1[m2 + mn2] *= d2[m2 + mn2];
-  for (i = 1; i < m2; i++) {
-    k = m - i;
-    a = d1[i] * d2[i] - d1[k] * d2[k];
-    b = d1[k] * d2[i] + d1[i] * d2[k];
-    d1[i] = (b + a) * (fREAL)0.5;
-    d1[k] = (b - a) * (fREAL)0.5;
-    a = d1[i + mn2] * d2[i + mn2] - d1[k + mn2] * d2[k + mn2];
-    b = d1[k + mn2] * d2[i + mn2] + d1[i + mn2] * d2[k + mn2];
-    d1[i + mn2] = (b + a) * (fREAL)0.5;
-    d1[k + mn2] = (b - a) * (fREAL)0.5;
-  }
-  for (j = 1; j < n2; j++) {
-    L = n - j;
-    mj = j << M;
-    mL = L << M;
-    a = d1[mj] * d2[mj] - d1[mL] * d2[mL];
-    b = d1[mL] * d2[mj] + d1[mj] * d2[mL];
-    d1[mj] = (b + a) * (fREAL)0.5;
-    d1[mL] = (b - a) * (fREAL)0.5;
-    a = d1[m2 + mj] * d2[m2 + mj] - d1[m2 + mL] * d2[m2 + mL];
-    b = d1[m2 + mL] * d2[m2 + mj] + d1[m2 + mj] * d2[m2 + mL];
-    d1[m2 + mj] = (b + a) * (fREAL)0.5;
-    d1[m2 + mL] = (b - a) * (fREAL)0.5;
-  }
-  for (i = 1; i < m2; i++) {
-    k = m - i;
-    for (j = 1; j < n2; j++) {
-      L = n - j;
-      mj = j << M;
-      mL = L << M;
-      a = d1[i + mj] * d2[i + mj] - d1[k + mL] * d2[k + mL];
-      b = d1[k + mL] * d2[i + mj] + d1[i + mj] * d2[k + mL];
-      d1[i + mj] = (b + a) * (fREAL)0.5;
-      d1[k + mL] = (b - a) * (fREAL)0.5;
-      a = d1[i + mL] * d2[i + mL] - d1[k + mj] * d2[k + mj];
-      b = d1[k + mj] * d2[i + mL] + d1[i + mL] * d2[k + mj];
-      d1[i + mL] = (b + a) * (fREAL)0.5;
-      d1[k + mj] = (b - a) * (fREAL)0.5;
-    }
-  }
-}
-//------------------------------------------------------------------------------
-
-static void convolve(float *dst, MemoryBuffer *in1, MemoryBuffer *in2)
-{
-  fREAL *data1, *data2, *fp;
-  uint w2, h2, hw, hh, log2_w, log2_h;
-  fRGB wt, *colp;
-  int x, y, ch;
-  int xbl, ybl, nxb, nyb, xbsz, ybsz;
-  bool in2done = false;
-  const uint kernel_width = in2->get_width();
-  const uint kernel_height = in2->get_height();
-  const uint image_width = in1->get_width();
-  const uint image_height = in1->get_height();
-  float *kernel_buffer = in2->get_buffer();
-  float *image_buffer = in1->get_buffer();
-
-  MemoryBuffer *rdst = new MemoryBuffer(DataType::Color, in1->get_rect());
-  memset(rdst->get_buffer(),
-         0,
-         rdst->get_width() * rdst->get_height() * COM_DATA_TYPE_COLOR_CHANNELS * sizeof(float));
-
-  /* Convolution result width & height. */
-  w2 = 2 * kernel_width - 1;
-  h2 = 2 * kernel_height - 1;
-  /* FFT pow2 required size & log2. */
-  w2 = next_pow2(w2, &log2_w);
-  h2 = next_pow2(h2, &log2_h);
-
-  /* Allocate space. */
-  data1 = (fREAL *)MEM_callocN(3 * w2 * h2 * sizeof(fREAL), "convolve_fast FHT data1");
-  data2 = (fREAL *)MEM_callocN(w2 * h2 * sizeof(fREAL), "convolve_fast FHT data2");
-
-  /* Normalize convolution. */
-  wt[0] = wt[1] = wt[2] = 0.0f;
-  for (y = 0; y < kernel_height; y++) {
-    colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS];
-    for (x = 0; x < kernel_width; x++) {
-      add_v3_v3(wt, colp[x]);
-    }
-  }
-  if (wt[0] != 0.0f) {
-    wt[0] = 1.0f / wt[0];
-  }
-  if (wt[1] != 0.0f) {
-    wt[1] = 1.0f / wt[1];
-  }
-  if (wt[2] != 0.0f) {
-    wt[2] = 1.0f / wt[2];
-  }
-  for (y = 0; y < kernel_height; y++) {
-    colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS];
-    for (x = 0; x < kernel_width; x++) {
-      mul_v3_v3(colp[x], wt);
-    }
-  }
-
-  /* Copy image data, unpacking interleaved RGBA into separate channels
-   * only need to calc data1 once. */
-
-  /* Block add-overlap. */
-  hw = kernel_width >> 1;
-  hh = kernel_height >> 1;
-  xbsz = (w2 + 1) - kernel_width;
-  ybsz = (h2 + 1) - kernel_height;
-  nxb = image_width / xbsz;
-  if (image_width % xbsz) {
-    nxb++;
-  }
-  nyb = image_height / ybsz;
-  if (image_height % ybsz) {
-    nyb++;
-  }
-  for (ybl = 0; ybl < nyb; ybl++) {
-    for (xbl = 0; xbl < nxb; xbl++) {
-
-      /* Each channel one by one. */
-      for (ch = 0; ch < 3; ch++) {
-        fREAL *data1ch = &data1[ch * w2 * h2];
-
-        /* Only need to calc fht data from in2 once, can re-use for every block. */
-        if (!in2done) {
-          /* in2, channel ch -> data1 */
-          for (y = 0; y < kernel_height; y++) {
-            fp = &data1ch[y * w2];
-            colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS];
-            for (x = 0; x < kernel_width; x++) {
-              fp[x] = colp[x][ch];
-            }
-          }
-        }
-
-        /* in1, channel ch -> data2 */
-        memset(data2, 0, w2 * h2 * sizeof(fREAL));
-        for (y = 0; y < ybsz; y++) {
-          int yy = ybl * ybsz + y;
-          if (yy >= image_height) {
-            continue;
-          }
-          fp = &data2[y * w2];
-          colp = (fRGB *)&image_buffer[yy * image_width * COM_DATA_TYPE_COLOR_CHANNELS];
-          for (x = 0; x < xbsz; x++) {
-            int xx = xbl * xbsz + x;
-            if (xx >= image_width) {
-              continue;
-            }
-            fp[x] = colp[xx][ch];
-          }
-        }
-
-        /* Forward FHT
-         * zero pad data start is different for each == height+1. */
-        if (!in2done) {
-          FHT2D(data1ch, log2_w, log2_h, kernel_height + 1, 0);
-        }
-        FHT2D(data2, log2_w, log2_h, kernel_height + 1, 0);
-
-        /* FHT2D transposed data, row/col now swapped
-         * convolve & inverse FHT. */
-        fht_convolve(data2, data1ch, log2_h, log2_w);
-        FHT2D(data2, log2_h, log2_w, 0, 1);
-        /* Data again transposed, so in order again. */
-
-        /* Overlap-add result. */
-        for (y = 0; y < int(h2); y++) {
-          const int yy = ybl * ybsz + y - hh;
-          if ((yy < 0) || (yy >= image_height)) {
-            continue;
-          }
-          fp = &data2[y * w2];
-          colp = (fRGB *)&rdst->get_buffer()[yy * image_width * COM_DATA_TYPE_COLOR_CHANNELS];
-          for (x = 0; x < int(w2); x++) {
-            const int xx = xbl * xbsz + x - hw;
-            if ((xx < 0) || (xx >= image_width)) {
-              continue;
-            }
-            colp[xx][ch] += fp[x];
-          }
-        }
-      }
-      in2done = true;
-    }
-  }
-
-  MEM_freeN(data2);
-  MEM_freeN(data1);
-  memcpy(dst,
-         rdst->get_buffer(),
-         sizeof(float) * image_width * image_height * COM_DATA_TYPE_COLOR_CHANNELS);
-  delete (rdst);
-}
-
-void GlareFogGlowOperation::generate_glare(float *data,
-                                           MemoryBuffer *input_image,
+void GlareFogGlowOperation::generate_glare(float *output,
+                                           MemoryBuffer *image,
                                            const NodeGlare *settings)
 {
-  const int kernel_size = 1 << settings->size;
-  MemoryBuffer kernel = MemoryBuffer(DataType::Color, kernel_size, kernel_size);
+#if defined(WITH_FFTW3)
+  fftw::initialize_float();
 
-  const float scale = 0.25f * math::sqrt(math::square(kernel_size));
+  /* We use an odd sized kernel since an even one will typically introduce a tiny offset as it has
+   * no exact center value. */
+  const int kernel_size = (1 << settings->size) + 1;
 
-  for (int y = 0; y < kernel_size; y++) {
-    const float v = 2.0f * (y / float(kernel_size)) - 1.0f;
-    for (int x = 0; x < kernel_size; x++) {
-      const float u = 2.0f * (x / float(kernel_size)) - 1.0f;
-      const float r = (math::square(u) + math::square(v)) * scale;
-      const float d = -math::sqrt(math::sqrt(math::sqrt(r))) * 9.0f;
-      const float kernel_value = math::exp(d);
+  /* Since we will be doing a circular convolution, we need to zero pad our input image by half the
+   * kernel size to avoid the kernel affecting the pixels at the other side of image. Therefore,
+   * zero boundary is assumed. */
+  const int needed_padding_amount = kernel_size / 2;
+  const int2 image_size = int2(image->get_width(), image->get_height());
+  const int2 needed_spatial_size = image_size + needed_padding_amount;
+  const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size);
 
-      const float window = (0.5f + 0.5f * math::cos(u * math::numbers::pi)) *
-                           (0.5f + 0.5f * math::cos(v * math::numbers::pi));
-      const float windowed_kernel_value = window * kernel_value;
+  /* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and
+   * stores only half the output since the other half is redundant, so we only allocate half of the
+   * first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for more
+   * information. */
+  const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y);
 
-      copy_v3_fl(kernel.get_elem(x, y), windowed_kernel_value);
-      kernel.get_elem(x, y)[3] = 1.0f;
+  float *kernel_spatial_domain = fftwf_alloc_real(spatial_size.x * spatial_size.y);
+  std::complex<float> *kernel_frequency_domain = reinterpret_cast<std::complex<float> *>(
+      fftwf_alloc_complex(frequency_size.x * frequency_size.y));
+
+  /* Create a real to complex plan to transform the kernel to the frequency domain, but the same
+   * plan will be used for the image since they both have the same dimensions. */
+  fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d(
+      spatial_size.y,
+      spatial_size.x,
+      kernel_spatial_domain,
+      reinterpret_cast<fftwf_complex *>(kernel_frequency_domain),
+      FFTW_ESTIMATE);
+
+  /* Use a double to sum the kernel since floats are not stable with threaded summation. */
+  threading::EnumerableThreadSpecific<double> sum_by_thread([]() { return 0.0; });
+
+  /* Compute the kernel while zero padding to match the padded image size. */
+  threading::parallel_for(IndexRange(spatial_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t y : sub_y_range) {
+      for (const int64_t x : IndexRange(spatial_size.x)) {
+        /* We offset the computed kernel with wrap around such that it is centered at the zero
+         * point, which is the expected format for doing circular convolutions in the frequency
+         * domain. */
+        const int half_kernel_size = kernel_size / 2;
+        int64_t output_x = mod_i(x - half_kernel_size, spatial_size.x);
+        int64_t output_y = mod_i(y - half_kernel_size, spatial_size.y);
+
+        const bool is_inside_kernel = x < kernel_size && y < kernel_size;
+        if (is_inside_kernel) {
+          const float kernel_value = compute_fog_glow_kernel_value(x, y, kernel_size);
+          kernel_spatial_domain[output_x + output_y * spatial_size.x] = kernel_value;
+          sum_by_thread.local() += kernel_value;
+        }
+        else {
+          kernel_spatial_domain[output_x + output_y * spatial_size.x] = 0.0f;
+        }
+      }
     }
-  }
+  });
 
-  convolve(data, input_image, &kernel);
+  /* Instead of normalizing the kernel now, we normalize it in the frequency domain since we will
+   * be doing sample normalization anyways. This is okay since the Fourier transform is linear. */
+  const float kernel_normalization_factor = float(
+      std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), 0.0));
+
+  fftwf_execute_dft_r2c(forward_plan,
+                        kernel_spatial_domain,
+                        reinterpret_cast<fftwf_complex *>(kernel_frequency_domain));
+  fftwf_free(kernel_spatial_domain);
+
+  /* We only process the color channels, the alpha channel is written to the output as is. */
+  const int channels_count = 3;
+  const int64_t spatial_pixels_per_channel = int64_t(spatial_size.x) * spatial_size.y;
+  const int64_t frequency_pixels_per_channel = int64_t(spatial_size.x) * spatial_size.y;
+  const int64_t spatial_pixels_count = spatial_pixels_per_channel * channels_count;
+  const int64_t frequency_pixels_count = frequency_pixels_per_channel * channels_count;
+
+  float *image_spatial_domain = fftwf_alloc_real(spatial_pixels_count);
+  std::complex<float> *image_frequency_domain = reinterpret_cast<std::complex<float> *>(
+      fftwf_alloc_complex(frequency_pixels_count));
+
+  /* Zero pad the image to the required spatial domain size, storing each channel in planar
+   * format for better cache locality, that is, RRRR...GGGG...BBBB. */
+  threading::parallel_for(IndexRange(spatial_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t y : sub_y_range) {
+      for (const int64_t x : IndexRange(spatial_size.x)) {
+        const bool is_inside_image = x < image_size.x && y < image_size.y;
+        for (const int64_t channel : IndexRange(channels_count)) {
+          const int64_t base_index = x + y * spatial_size.x;
+          const int64_t output_index = base_index + spatial_pixels_per_channel * channel;
+          if (is_inside_image) {
+            image_spatial_domain[output_index] = image->get_elem(x, y)[channel];
+          }
+          else {
+            image_spatial_domain[output_index] = 0.0f;
+          }
+        }
+      }
+    }
+  });
+
+  threading::parallel_for(IndexRange(channels_count), 1, [&](const IndexRange sub_range) {
+    for (const int64_t channel : sub_range) {
+      fftwf_execute_dft_r2c(forward_plan,
+                            image_spatial_domain + spatial_pixels_per_channel * channel,
+                            reinterpret_cast<fftwf_complex *>(image_frequency_domain) +
+                                frequency_pixels_per_channel * channel);
+    }
+  });
+
+  /* Multiply the kernel and the image in the frequency domain to perform the convolution. The
+   * FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result
+   * in an image that is scaled by a factor of the product of the width and height, so we take
+   * that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms of
+   * the FFTW manual for more information. */
+  const float normalization_scale = float(spatial_size.x) * spatial_size.y *
+                                    kernel_normalization_factor;
+  threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t channel : IndexRange(channels_count)) {
+      for (const int64_t y : sub_y_range) {
+        for (const int64_t x : IndexRange(frequency_size.x)) {
+          const int64_t base_index = x + y * frequency_size.x;
+          const int64_t output_index = base_index + frequency_pixels_per_channel * channel;
+          const std::complex<float> kernel_value = kernel_frequency_domain[base_index];
+          image_frequency_domain[output_index] *= kernel_value / normalization_scale;
+        }
+      }
+    }
+  });
+
+  /* Create a complex to real plan to transform the image to the real domain. */
+  fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d(
+      spatial_size.y,
+      spatial_size.x,
+      reinterpret_cast<fftwf_complex *>(image_frequency_domain),
+      image_spatial_domain,
+      FFTW_ESTIMATE);
+
+  threading::parallel_for(IndexRange(channels_count), 1, [&](const IndexRange sub_range) {
+    for (const int64_t channel : sub_range) {
+      fftwf_execute_dft_c2r(backward_plan,
+                            reinterpret_cast<fftwf_complex *>(image_frequency_domain) +
+                                frequency_pixels_per_channel * channel,
+                            image_spatial_domain + spatial_pixels_per_channel * channel);
+    }
+  });
+
+  /* Copy the result to the output. */
+  threading::parallel_for(IndexRange(image_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t y : sub_y_range) {
+      for (const int64_t x : IndexRange(image_size.x)) {
+        for (const int64_t channel : IndexRange(channels_count)) {
+          const int64_t output_index = (x + y * image_size.x) * image->get_num_channels();
+          const int64_t base_index = x + y * spatial_size.x;
+          const int64_t input_index = base_index + spatial_pixels_per_channel * channel;
+          output[output_index + channel] = image_spatial_domain[input_index];
+          output[output_index + 3] = image->get_buffer()[output_index + 3];
+        }
+      }
+    }
+  });
+
+  fftwf_destroy_plan(forward_plan);
+  fftwf_destroy_plan(backward_plan);
+  fftwf_free(image_spatial_domain);
+  fftwf_free(image_frequency_domain);
+  fftwf_free(kernel_frequency_domain);
+#else
+  UNUSED_VARS(output, image, settings);
+#endif
 }
 
 }  // namespace blender::compositor
diff --git a/source/blender/nodes/composite/CMakeLists.txt b/source/blender/nodes/composite/CMakeLists.txt
index 90acb4b0d40..9ed62e073a7 100644
--- a/source/blender/nodes/composite/CMakeLists.txt
+++ b/source/blender/nodes/composite/CMakeLists.txt
@@ -157,6 +157,11 @@ if(WITH_OPENIMAGEDENOISE)
   )
 endif()
 
+
+if(WITH_FFTW3)
+  add_definitions(-DWITH_FFTW3)
+endif()
+
 blender_add_lib(bf_nodes_composite "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
 blender_set_target_unity_build(bf_nodes_composite 10)
diff --git a/source/blender/nodes/composite/nodes/node_composite_glare.cc b/source/blender/nodes/composite/nodes/node_composite_glare.cc
index 3b74960eb50..2f3a7d34b6f 100644
--- a/source/blender/nodes/composite/nodes/node_composite_glare.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_glare.cc
@@ -68,10 +68,16 @@ static void node_composit_init_glare(bNodeTree * /*ntree*/, bNode *node)
 
 static void node_composit_buts_glare(uiLayout *layout, bContext * /*C*/, PointerRNA *ptr)
 {
+  const int glare_type = RNA_enum_get(ptr, "glare_type");
+#ifndef WITH_FFTW3
+  if (glare_type == CMP_NODE_GLARE_FOG_GLOW) {
+    uiItemL(layout, RPT_("Disabled, built without FFTW"), ICON_ERROR);
+  }
+#endif
+
   uiItemR(layout, ptr, "glare_type", UI_ITEM_R_SPLIT_EMPTY_NAME, "", ICON_NONE);
   uiItemR(layout, ptr, "quality", UI_ITEM_R_SPLIT_EMPTY_NAME, "", ICON_NONE);
 
-  const int glare_type = RNA_enum_get(ptr, "glare_type");
   if (ELEM(glare_type, CMP_NODE_GLARE_SIMPLE_STAR, CMP_NODE_GLARE_GHOST, CMP_NODE_GLARE_STREAKS)) {
     uiItemR(layout, ptr, "iterations", UI_ITEM_R_SPLIT_EMPTY_NAME, nullptr, ICON_NONE);
   }