VSE: reduce effects code duplication, making gaussian blur faster in the process

Now that the code is in C++, quite a bit of the duplication between the "byte" and "float" effect code paths can be removed (more easily than in the C days). So I did that, removing about 400 lines of code. In the process I accidentally made Gaussian Blur faster: while reducing the amount of code I noticed it was doing some things sub-optimally (e.g. recalculating the kernel tables for each job). Applying a 100x100 gaussian blur to a 4K UHD resolution image strip on a Ryzen 5950X went from 630ms to 450ms. Pull Request: https://projects.blender.org/blender/blender/pulls/116089
2023-12-14 17:31:05 +01:00
parent 4a34dcbb69
commit 5cac8e2bb4
1 changed files with 314 additions and 736 deletions
--- a/source/blender/sequencer/intern/effects.cc
+++ b/source/blender/sequencer/intern/effects.cc
@@ -64,6 +64,8 @@
 #include "strip_time.hh"
 #include "utils.hh"

+using blender::float4;
+
 static SeqEffectHandle get_sequence_effect_impl(int seq_type);

 /* -------------------------------------------------------------------- */
@@ -120,6 +122,44 @@ static void slice_get_float_buffers(const SeqRenderData *context,
  }
 }

+static float4 load_premul_pixel(const uchar *ptr)
+{
+  float4 res;
+  straight_uchar_to_premul_float(res, ptr);
+  return res;
+}
+
+static float4 load_premul_pixel(const float *ptr)
+{
+  return float4(ptr);
+}
+
+static void store_premul_pixel(const float4 &pix, uchar *dst)
+{
+  premul_float_to_straight_uchar(dst, pix);
+}
+
+static void store_premul_pixel(const float4 &pix, float *dst)
+{
+  *reinterpret_cast<float4 *>(dst) = pix;
+}
+
+static void store_opaque_black_pixel(uchar *dst)
+{
+  dst[0] = 0;
+  dst[1] = 0;
+  dst[2] = 0;
+  dst[3] = 255;
+}
+
+static void store_opaque_black_pixel(float *dst)
+{
+  dst[0] = 0.0f;
+  dst[1] = 0.0f;
+  dst[2] = 0.0f;
+  dst[3] = 1.0f;
+}
+
 /** \} */

 /* -------------------------------------------------------------------- */
@@ -203,72 +243,32 @@ static void init_alpha_over_or_under(Sequence *seq)
  seq->seq1 = seq2;
 }

-static void do_alphaover_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+/* dst = src1 over src2 (alpha from src1) */
+template<typename T>
+static void do_alphaover_effect(
+    float fac, int width, int height, const T *src1, const T *src2, T *dst)
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
-  uchar *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 over rt2  (alpha from rt1) */
-
-      float tempc[4], rt1[4], rt2[4];
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      float mfac = 1.0f - fac * rt1[3];
-
-      if (fac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp2);
-      }
-      else if (mfac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp1);
-      }
-      else {
-        tempc[0] = fac * rt1[0] + mfac * rt2[0];
-        tempc[1] = fac * rt1[1] + mfac * rt2[1];
-        tempc[2] = fac * rt1[2] + mfac * rt2[2];
-        tempc[3] = fac * rt1[3] + mfac * rt2[3];
-
-        premul_float_to_straight_uchar(rt, tempc);
-      }
-      cp1 += 4;
-      cp2 += 4;
-      rt += 4;
-    }
+  if (fac <= 0.0f) {
+    memcpy(dst, src2, sizeof(T) * 4 * width * height);
+    return;
  }
-}

-static void do_alphaover_effect_float(
-    float fac, int x, int y, float *rect1, float *rect2, float *out)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 col1 = load_premul_pixel(src1);
+      float mfac = 1.0f - fac * col1.w;

-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 over rt2  (alpha from rt1) */
-
-      float mfac = 1.0f - (fac * rt1[3]);
-
-      if (fac <= 0.0f) {
-        memcpy(rt, rt2, sizeof(float[4]));
-      }
-      else if (mfac <= 0) {
-        memcpy(rt, rt1, sizeof(float[4]));
+      if (mfac <= 0.0f) {
+        memcpy(dst, src1, sizeof(T) * 4);
      }
      else {
-        rt[0] = fac * rt1[0] + mfac * rt2[0];
-        rt[1] = fac * rt1[1] + mfac * rt2[1];
-        rt[2] = fac * rt1[2] + mfac * rt2[2];
-        rt[3] = fac * rt1[3] + mfac * rt2[3];
+        float4 col2 = load_premul_pixel(src2);
+        float4 col = fac * col1 + mfac * col2;
+        store_premul_pixel(col, dst);
      }
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
    }
  }
 }
@@ -290,7 +290,7 @@ static void do_alphaover_effect(const SeqRenderData *context,
    slice_get_float_buffers(
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

-    do_alphaover_effect_float(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
  }
  else {
    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -298,7 +298,7 @@ static void do_alphaover_effect(const SeqRenderData *context,
    slice_get_byte_buffers(
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

-    do_alphaover_effect_byte(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
  }
 }

@@ -308,88 +308,34 @@ static void do_alphaover_effect(const SeqRenderData *context,
 /** \name Alpha Under Effect
 * \{ */

-static void do_alphaunder_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+/* dst = src1 under src2 (alpha from src2); fac scales how much of src1 shows through. */
+template<typename T>
+static void do_alphaunder_effect(
+    float fac, int width, int height, const T *src1, const T *src2, T *dst)
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
-  uchar *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 under rt2  (alpha from rt2) */
-
-      float tempc[4], rt1[4], rt2[4];
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      /* this complex optimization is because the
-       * 'skybuf' can be crossed in
-       */
-      if (rt2[3] <= 0.0f && fac >= 1.0f) {
-        *((uint *)rt) = *((uint *)cp1);
-      }
-      else if (rt2[3] >= 1.0f) {
-        *((uint *)rt) = *((uint *)cp2);
-      }
-      else {
-        float temp_fac = (fac * (1.0f - rt2[3]));
-
-        if (fac <= 0) {
-          *((uint *)rt) = *((uint *)cp2);
-        }
-        else {
-          tempc[0] = (temp_fac * rt1[0] + rt2[0]);
-          tempc[1] = (temp_fac * rt1[1] + rt2[1]);
-          tempc[2] = (temp_fac * rt1[2] + rt2[2]);
-          tempc[3] = (temp_fac * rt1[3] + rt2[3]);
-
-          premul_float_to_straight_uchar(rt, tempc);
-        }
-      }
-      cp1 += 4;
-      cp2 += 4;
-      rt += 4;
-    }
+  if (fac <= 0.0f) {
+    memcpy(dst, src2, sizeof(T) * 4 * width * height);
+    return;
  }
-}

-static void do_alphaunder_effect_float(
-    float fac, int x, int y, float *rect1, float *rect2, float *out)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 under rt2  (alpha from rt2) */
-
-      /* this complex optimization is because the
-       * 'skybuf' can be crossed in
-       */
-      if (rt2[3] <= 0 && fac >= 1.0f) {
-        memcpy(rt, rt1, sizeof(float[4]));
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 col2 = load_premul_pixel(src2);
+      if (col2.w <= 0.0f && fac >= 1.0f) {
+        memcpy(dst, src1, sizeof(T) * 4);
      }
-      else if (rt2[3] >= 1.0f) {
-        memcpy(rt, rt2, sizeof(float[4]));
+      else if (col2.w >= 1.0f) {
+        memcpy(dst, src2, sizeof(T) * 4);
      }
      else {
-        float temp_fac = fac * (1.0f - rt2[3]);
-
-        if (fac == 0) {
-          memcpy(rt, rt2, sizeof(float[4]));
-        }
-        else {
-          rt[0] = temp_fac * rt1[0] + rt2[0];
-          rt[1] = temp_fac * rt1[1] + rt2[1];
-          rt[2] = temp_fac * rt1[2] + rt2[2];
-          rt[3] = temp_fac * rt1[3] + rt2[3];
-        }
+        float mfac = fac * (1.0f - col2.w);
+        float4 col1 = load_premul_pixel(src1);
+        float4 col = mfac * col1 + col2;
+        store_premul_pixel(col, dst);
      }
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
    }
  }
 }
@@ -411,7 +357,7 @@ static void do_alphaunder_effect(const SeqRenderData *context,
    slice_get_float_buffers(
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

-    do_alphaunder_effect_float(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaunder_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
  }
  else {
    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -419,7 +365,7 @@ static void do_alphaunder_effect(const SeqRenderData *context,
    slice_get_byte_buffers(
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

-    do_alphaunder_effect_byte(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaunder_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
  }
 }

@@ -527,53 +473,24 @@ static float invGammaCorrect(float c)
  return sqrtf_signed(c);
 }

-static void do_gammacross_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+template<typename T>
+static void do_gammacross_effect(
+    float fac, int width, int height, const T *src1, const T *src2, T *dst)
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
-  uchar *rt = out;
-
  float mfac = 1.0f - fac;

-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      float rt1[4], rt2[4], tempc[4];
-
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      tempc[0] = gammaCorrect(mfac * invGammaCorrect(rt1[0]) + fac * invGammaCorrect(rt2[0]));
-      tempc[1] = gammaCorrect(mfac * invGammaCorrect(rt1[1]) + fac * invGammaCorrect(rt2[1]));
-      tempc[2] = gammaCorrect(mfac * invGammaCorrect(rt1[2]) + fac * invGammaCorrect(rt2[2]));
-      tempc[3] = gammaCorrect(mfac * invGammaCorrect(rt1[3]) + fac * invGammaCorrect(rt2[3]));
-
-      premul_float_to_straight_uchar(rt, tempc);
-      cp1 += 4;
-      cp2 += 4;
-      rt += 4;
-    }
-  }
-}
-
-static void do_gammacross_effect_float(
-    float fac, int x, int y, float *rect1, float *rect2, float *out)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
-
-  float mfac = 1.0f - fac;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      rt[0] = gammaCorrect(mfac * invGammaCorrect(rt1[0]) + fac * invGammaCorrect(rt2[0]));
-      rt[1] = gammaCorrect(mfac * invGammaCorrect(rt1[1]) + fac * invGammaCorrect(rt2[1]));
-      rt[2] = gammaCorrect(mfac * invGammaCorrect(rt1[2]) + fac * invGammaCorrect(rt2[2]));
-      rt[3] = gammaCorrect(mfac * invGammaCorrect(rt1[3]) + fac * invGammaCorrect(rt2[3]));
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 col1 = load_premul_pixel(src1);
+      float4 col2 = load_premul_pixel(src2);
+      float4 col;
+      for (int c = 0; c < 4; ++c) {
+        col[c] = gammaCorrect(mfac * invGammaCorrect(col1[c]) + fac * invGammaCorrect(col2[c]));
+      }
+      store_premul_pixel(col, dst);
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
    }
  }
 }
@@ -604,7 +521,7 @@ static void do_gammacross_effect(const SeqRenderData *context,
    slice_get_float_buffers(
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

-    do_gammacross_effect_float(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_gammacross_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
  }
  else {
    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -612,7 +529,7 @@ static void do_gammacross_effect(const SeqRenderData *context,
    slice_get_byte_buffers(
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

-    do_gammacross_effect_byte(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_gammacross_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
  }
 }

@@ -943,57 +860,21 @@ static void do_mul_effect(const SeqRenderData *context,
 /** \name Blend Mode Effect
 * \{ */

-using IMB_blend_func_byte = void (*)(uchar *dst, const uchar *src1, const uchar *src2);
-using IMB_blend_func_float = void (*)(float *dst, const float *src1, const float *src2);
-
-BLI_INLINE void apply_blend_function_byte(float fac,
-                                          int x,
-                                          int y,
-                                          uchar *rect1,
-                                          uchar *rect2,
-                                          uchar *out,
-                                          IMB_blend_func_byte blend_function)
+/* blend_function must be callable as: void(T *dst, const T *src1, const T *src2) */
+template<typename T, typename Func>
+static void apply_blend_function(
+    float fac, int width, int height, const T *src1, T *src2, T *dst, Func blend_function)
 {
-  uchar *rt1 = rect1;
-  uchar *rt2 = rect2;
-  uchar *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      uint achannel = rt2[3];
-      rt2[3] = uint(achannel) * fac;
-      blend_function(rt, rt1, rt2);
-      rt2[3] = achannel;
-      rt[3] = rt1[3];
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
-    }
-  }
-}
-
-BLI_INLINE void apply_blend_function_float(float fac,
-                                           int x,
-                                           int y,
-                                           float *rect1,
-                                           float *rect2,
-                                           float *out,
-                                           IMB_blend_func_float blend_function)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      float achannel = rt2[3];
-      rt2[3] = achannel * fac;
-      blend_function(rt, rt1, rt2);
-      rt2[3] = achannel;
-      rt[3] = rt1[3];
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      T achannel = src2[3];
+      src2[3] = T(achannel * fac);
+      blend_function(dst, src1, src2);
+      src2[3] = achannel;
+      dst[3] = src1[3];
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
    }
  }
 }
@@ -1003,67 +884,67 @@ static void do_blend_effect_float(
 {
  switch (btype) {
    case SEQ_TYPE_ADD:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_add_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_add_float);
      break;
    case SEQ_TYPE_SUB:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_sub_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_sub_float);
      break;
    case SEQ_TYPE_MUL:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_mul_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_mul_float);
      break;
    case SEQ_TYPE_DARKEN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_darken_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_darken_float);
      break;
    case SEQ_TYPE_COLOR_BURN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_burn_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_burn_float);
      break;
    case SEQ_TYPE_LINEAR_BURN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_linearburn_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearburn_float);
      break;
    case SEQ_TYPE_SCREEN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_screen_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_screen_float);
      break;
    case SEQ_TYPE_LIGHTEN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_lighten_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_lighten_float);
      break;
    case SEQ_TYPE_DODGE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_dodge_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_dodge_float);
      break;
    case SEQ_TYPE_OVERLAY:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_overlay_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_overlay_float);
      break;
    case SEQ_TYPE_SOFT_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_softlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_softlight_float);
      break;
    case SEQ_TYPE_HARD_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_hardlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hardlight_float);
      break;
    case SEQ_TYPE_PIN_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_pinlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_pinlight_float);
      break;
    case SEQ_TYPE_LIN_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_linearlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearlight_float);
      break;
    case SEQ_TYPE_VIVID_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_vividlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_vividlight_float);
      break;
    case SEQ_TYPE_BLEND_COLOR:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_color_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_color_float);
      break;
    case SEQ_TYPE_HUE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_hue_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hue_float);
      break;
    case SEQ_TYPE_SATURATION:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_saturation_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_saturation_float);
      break;
    case SEQ_TYPE_VALUE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_luminosity_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_luminosity_float);
      break;
    case SEQ_TYPE_DIFFERENCE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_difference_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_difference_float);
      break;
    case SEQ_TYPE_EXCLUSION:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_exclusion_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_exclusion_float);
      break;
    default:
      break;
@@ -1075,67 +956,67 @@ static void do_blend_effect_byte(
 {
  switch (btype) {
    case SEQ_TYPE_ADD:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_add_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_add_byte);
      break;
    case SEQ_TYPE_SUB:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_sub_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_sub_byte);
      break;
    case SEQ_TYPE_MUL:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_mul_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_mul_byte);
      break;
    case SEQ_TYPE_DARKEN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_darken_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_darken_byte);
      break;
    case SEQ_TYPE_COLOR_BURN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_burn_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_burn_byte);
      break;
    case SEQ_TYPE_LINEAR_BURN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_linearburn_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearburn_byte);
      break;
    case SEQ_TYPE_SCREEN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_screen_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_screen_byte);
      break;
    case SEQ_TYPE_LIGHTEN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_lighten_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_lighten_byte);
      break;
    case SEQ_TYPE_DODGE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_dodge_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_dodge_byte);
      break;
    case SEQ_TYPE_OVERLAY:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_overlay_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_overlay_byte);
      break;
    case SEQ_TYPE_SOFT_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_softlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_softlight_byte);
      break;
    case SEQ_TYPE_HARD_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_hardlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hardlight_byte);
      break;
    case SEQ_TYPE_PIN_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_pinlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_pinlight_byte);
      break;
    case SEQ_TYPE_LIN_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_linearlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearlight_byte);
      break;
    case SEQ_TYPE_VIVID_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_vividlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_vividlight_byte);
      break;
    case SEQ_TYPE_BLEND_COLOR:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_color_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_color_byte);
      break;
    case SEQ_TYPE_HUE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_hue_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hue_byte);
      break;
    case SEQ_TYPE_SATURATION:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_saturation_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_saturation_byte);
      break;
    case SEQ_TYPE_VALUE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_luminosity_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_luminosity_byte);
      break;
    case SEQ_TYPE_DIFFERENCE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_difference_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_difference_byte);
      break;
    case SEQ_TYPE_EXCLUSION:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_exclusion_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_exclusion_byte);
      break;
    default:
      break;
@@ -1516,58 +1397,38 @@ static void copy_wipe_effect(Sequence *dst, Sequence *src, const int /*flag*/)
  dst->effectdata = MEM_dupallocN(src->effectdata);
 }

-static void do_wipe_effect_byte(const Sequence *seq,
-                                float fac,
-                                int width,
-                                int height,
-                                const uchar *rect1,
-                                const uchar *rect2,
-                                uchar *out)
+template<typename T>
+static void do_wipe_effect(
+    const Sequence *seq, float fac, int width, int height, const T *rect1, const T *rect2, T *out)
 {
  using namespace blender;
  const WipeVars *wipe = (const WipeVars *)seq->effectdata;
  const WipeZone wipezone = precalc_wipe_zone(wipe, width, height);

  threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) {
-    const uchar *cp1 = rect1 + y_range.first() * width * 4;
-    const uchar *cp2 = rect2 + y_range.first() * width * 4;
-    uchar *rt = out + y_range.first() * width * 4;
+    const T *cp1 = rect1 ? rect1 + y_range.first() * width * 4 : nullptr;
+    const T *cp2 = rect2 ? rect2 + y_range.first() * width * 4 : nullptr;
+    T *rt = out + y_range.first() * width * 4;
    for (const int y : y_range) {
      for (int x = 0; x < width; x++) {
        float check = check_zone(&wipezone, x, y, fac);
        if (check) {
          if (cp1) {
-            float rt1[4], rt2[4], tempc[4];
-
-            straight_uchar_to_premul_float(rt1, cp1);
-            straight_uchar_to_premul_float(rt2, cp2);
-
-            tempc[0] = rt1[0] * check + rt2[0] * (1 - check);
-            tempc[1] = rt1[1] * check + rt2[1] * (1 - check);
-            tempc[2] = rt1[2] * check + rt2[2] * (1 - check);
-            tempc[3] = rt1[3] * check + rt2[3] * (1 - check);
-
-            premul_float_to_straight_uchar(rt, tempc);
+            float4 col1 = load_premul_pixel(cp1);
+            float4 col2 = load_premul_pixel(cp2);
+            float4 col = col1 * check + col2 * (1.0f - check);
+            store_premul_pixel(col, rt);
          }
          else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 255;
+            store_opaque_black_pixel(rt);
          }
        }
        else {
          if (cp2) {
-            rt[0] = cp2[0];
-            rt[1] = cp2[1];
-            rt[2] = cp2[2];
-            rt[3] = cp2[3];
+            memcpy(rt, cp2, sizeof(T) * 4);
          }
          else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 255;
+            store_opaque_black_pixel(rt);
          }
        }

@@ -1583,66 +1444,6 @@ static void do_wipe_effect_byte(const Sequence *seq,
  });
 }

-static void do_wipe_effect_float(Sequence *seq,
-                                 float fac,
-                                 int width,
-                                 int height,
-                                 const float *rect1,
-                                 const float *rect2,
-                                 float *out)
-{
-  using namespace blender;
-  const WipeVars *wipe = (const WipeVars *)seq->effectdata;
-  const WipeZone wipezone = precalc_wipe_zone(wipe, width, height);
-
-  threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) {
-    const float *rt1 = rect1 + y_range.first() * width * 4;
-    const float *rt2 = rect2 + y_range.first() * width * 4;
-    float *rt = out + y_range.first() * width * 4;
-    for (const int y : y_range) {
-      for (int x = 0; x < width; x++) {
-        float check = check_zone(&wipezone, x, y, fac);
-        if (check) {
-          if (rt1) {
-            rt[0] = rt1[0] * check + rt2[0] * (1 - check);
-            rt[1] = rt1[1] * check + rt2[1] * (1 - check);
-            rt[2] = rt1[2] * check + rt2[2] * (1 - check);
-            rt[3] = rt1[3] * check + rt2[3] * (1 - check);
-          }
-          else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 1.0;
-          }
-        }
-        else {
-          if (rt2) {
-            rt[0] = rt2[0];
-            rt[1] = rt2[1];
-            rt[2] = rt2[2];
-            rt[3] = rt2[3];
-          }
-          else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 1.0;
-          }
-        }
-
-        rt += 4;
-        if (rt1 != nullptr) {
-          rt1 += 4;
-        }
-        if (rt2 != nullptr) {
-          rt2 += 4;
-        }
-      }
-    }
-  });
-}
-
 static ImBuf *do_wipe_effect(const SeqRenderData *context,
                             Sequence *seq,
                             float /*timeline_frame*/,
@@ -1654,22 +1455,22 @@ static ImBuf *do_wipe_effect(const SeqRenderData *context,
  ImBuf *out = prepare_effect_imbufs(context, ibuf1, ibuf2, ibuf3);

  if (out->float_buffer.data) {
-    do_wipe_effect_float(seq,
-                         fac,
-                         context->rectx,
-                         context->recty,
-                         ibuf1->float_buffer.data,
-                         ibuf2->float_buffer.data,
-                         out->float_buffer.data);
+    do_wipe_effect(seq,
+                   fac,
+                   context->rectx,
+                   context->recty,
+                   ibuf1->float_buffer.data,
+                   ibuf2->float_buffer.data,
+                   out->float_buffer.data);
  }
  else {
-    do_wipe_effect_byte(seq,
-                        fac,
-                        context->rectx,
-                        context->recty,
-                        ibuf1->byte_buffer.data,
-                        ibuf2->byte_buffer.data,
-                        out->byte_buffer.data);
+    do_wipe_effect(seq,
+                   fac,
+                   context->rectx,
+                   context->recty,
+                   ibuf1->byte_buffer.data,
+                   ibuf2->byte_buffer.data,
+                   out->byte_buffer.data);
  }

  return out;
@@ -1837,12 +1638,8 @@ static void do_transform_effect(const SeqRenderData *context,
 /** \name Glow Effect
 * \{ */

-static void glow_blur_bitmap(const blender::float4 *src,
-                             blender::float4 *map,
-                             int width,
-                             int height,
-                             float blur,
-                             int quality)
+static void glow_blur_bitmap(
+    const float4 *src, float4 *map, int width, int height, float blur, int quality)
 {
  using namespace blender;

@@ -1913,8 +1710,8 @@ static void glow_blur_bitmap(const blender::float4 *src,
  });
 }

-static void blur_isolate_highlights(const blender::float4 *in,
-                                    blender::float4 *out,
+static void blur_isolate_highlights(const float4 *in,
+                                    float4 *out,
                                    int width,
                                    int height,
                                    float threshold,
@@ -2525,7 +2322,7 @@ static void do_overdrop_effect(const SeqRenderData *context,
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

    do_drop_effect_float(fac, x, y, rect1, rect2, rect_out);
-    do_alphaover_effect_float(fac, x, y, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, x, y, rect1, rect2, rect_out);
  }
  else {
    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -2534,7 +2331,7 @@ static void do_overdrop_effect(const SeqRenderData *context,
        context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);

    do_drop_effect_byte(fac, x, y, rect1, rect2, rect_out);
-    do_alphaover_effect_byte(fac, x, y, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, x, y, rect1, rect2, rect_out);
  }
 }

@@ -2544,14 +2341,6 @@ static void do_overdrop_effect(const SeqRenderData *context,
 /** \name Gaussian Blur
 * \{ */

-/* NOTE: This gaussian blur implementation accumulates values in the square
- * kernel rather that doing X direction and then Y direction because of the
- * lack of using multiple-staged filters.
- *
- * Once we can we'll implement a way to apply filter as multiple stages we
- * can optimize hell of a lot in here.
- */
-
 static void init_gaussian_blur_effect(Sequence *seq)
 {
  if (seq->effectdata) {
@@ -2585,346 +2374,92 @@ static int early_out_gaussian_blur(Sequence *seq, float /*fac*/)
  return EARLY_DO_EFFECT;
 }

-/* TODO(sergey): De-duplicate with compositor. */
-static float *make_gaussian_blur_kernel(float rad, int size)
+static blender::Array<float> make_gaussian_blur_kernel(float rad, int size)
 {
-  float *gausstab, sum, val;
-  float fac;
-  int i, n;
+  int n = 2 * size + 1; /* One weight per integer offset in [-size, size]. */
+  blender::Array<float> gausstab(n);

-  n = 2 * size + 1;
-
-  gausstab = (float *)MEM_mallocN(sizeof(float) * n, __func__);
-
-  sum = 0.0f;
-  fac = (rad > 0.0f ? 1.0f / rad : 0.0f);
-  for (i = -size; i <= size; i++) {
-    val = RE_filter_value(R_FILTER_GAUSS, float(i) * fac);
+  float sum = 0.0f; /* Running total of the raw weights, used to normalize below. */
+  float fac = (rad > 0.0f ? 1.0f / rad : 0.0f); /* Zero for a non-positive radius. */
+  for (int i = -size; i <= size; i++) {
+    float val = RE_filter_value(R_FILTER_GAUSS, float(i) * fac); /* Offset scaled by 1/rad. */
    sum += val;
    gausstab[i + size] = val;
  }

-  sum = 1.0f / sum;
-  for (i = 0; i < n; i++) {
-    gausstab[i] *= sum;
+  float inv_sum = 1.0f / sum; /* Scale the table so that the weights add up to one. */
+  for (int i = 0; i < n; i++) {
+    gausstab[i] *= inv_sum;
  }

  return gausstab;
 }

-static void do_gaussian_blur_effect_byte_x(Sequence *seq,
-                                           int start_line,
-                                           int x,
-                                           int y,
-                                           int frame_width,
-                                           int /*frame_height*/,
-                                           const uchar *rect,
-                                           uchar *out)
+template<typename T> /* T: channel type, 4 channels per pixel (callers pass uchar or float). */
+static void gaussian_blur_x(const blender::Array<float> &gausstab,
+                            int half_size, /* Kernel taps span [-half_size, half_size]. */
+                            int start_line, /* First row of the slice to process. */
+                            int width, /* Row length in pixels. */
+                            int height, /* Number of rows in the slice. */
+                            int /*frame_height*/,
+                            const T *rect, /* Source pixels, indexed from row 0. */
+                            T *dst) /* Destination pixels, indexed from row 0. */
 {
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_x = int(data->size_x + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_x;
-  gausstab_x = make_gaussian_blur_kernel(data->size_x, size_x);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  dst += int64_t(start_line) * width * 4; /* 64-bit so huge images cannot overflow `int`. */
+  for (int y = start_line; y < start_line + height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 accum(0.0f);
      float accum_weight = 0.0f;

-      for (int current_x = j - size_x; current_x <= j + size_x; current_x++) {
-        if (current_x < 0 || current_x >= frame_width) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(current_x, i + start_line);
-        float weight = gausstab_x[current_x - j + size_x];
-        accum[0] += rect[index] * weight;
-        accum[1] += rect[index + 1] * weight;
-        accum[2] += rect[index + 2] * weight;
-        accum[3] += rect[index + 3] * weight;
+      int xmin = blender::math::max(x - half_size, 0); /* Clamp the kernel to the row. */
+      int xmax = blender::math::min(x + half_size, width - 1);
+      for (int nx = xmin, index = (xmin - x) + half_size; nx <= xmax; nx++, index++) {
+        float weight = gausstab[index];
+        const int64_t offset = (int64_t(y) * width + nx) * 4; /* Widen before multiplying. */
+        accum += float4(rect + offset) * weight;
        accum_weight += weight;
      }
-
-      float inv_accum_weight = 1.0f / accum_weight;
-      out[out_index + 0] = accum[0] * inv_accum_weight;
-      out[out_index + 1] = accum[1] * inv_accum_weight;
-      out[out_index + 2] = accum[2] * inv_accum_weight;
-      out[out_index + 3] = accum[3] * inv_accum_weight;
+      accum *= (1.0f / accum_weight); /* Renormalize by the weights actually used. */
+      dst[0] = accum[0];
+      dst[1] = accum[1];
+      dst[2] = accum[2];
+      dst[3] = accum[3];
+      dst += 4;
    }
  }
-
-  MEM_freeN(gausstab_x);
-#undef INDEX
 }

-static void do_gaussian_blur_effect_byte_y(Sequence *seq,
-                                           int start_line,
-                                           int x,
-                                           int y,
-                                           int /*frame_width*/,
-                                           int frame_height,
-                                           const uchar *rect,
-                                           uchar *out)
+template<typename T> /* T: channel type, 4 channels per pixel (callers pass uchar or float). */
+static void gaussian_blur_y(const blender::Array<float> &gausstab,
+                            int half_size, /* Kernel taps span [-half_size, half_size]. */
+                            int start_line, /* First row of the slice to process. */
+                            int width, /* Row length in pixels. */
+                            int height, /* Number of rows in the slice. */
+                            int frame_height, /* Row count used to clamp the kernel. */
+                            const T *rect, /* Source pixels, indexed from row 0. */
+                            T *dst) /* Destination pixels, indexed from row 0. */
 {
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_y = int(data->size_y + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_y;
-  gausstab_y = make_gaussian_blur_kernel(data->size_y, size_y);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  dst += int64_t(start_line) * width * 4; /* 64-bit so huge images cannot overflow `int`. */
+  for (int y = start_line; y < start_line + height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 accum(0.0f);
      float accum_weight = 0.0f;
-      for (int current_y = i - size_y; current_y <= i + size_y; current_y++) {
-        if (current_y < -start_line || current_y + start_line >= frame_height) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(j, current_y + start_line);
-        float weight = gausstab_y[current_y - i + size_y];
-        accum[0] += rect[index] * weight;
-        accum[1] += rect[index + 1] * weight;
-        accum[2] += rect[index + 2] * weight;
-        accum[3] += rect[index + 3] * weight;
+      int ymin = blender::math::max(y - half_size, 0); /* Clamp the kernel to the frame. */
+      int ymax = blender::math::min(y + half_size, frame_height - 1);
+      for (int ny = ymin, index = (ymin - y) + half_size; ny <= ymax; ny++, index++) {
+        float weight = gausstab[index];
+        const int64_t offset = (int64_t(ny) * width + x) * 4; /* Widen before multiplying. */
+        accum += float4(rect + offset) * weight;
        accum_weight += weight;
      }
-      float inv_accum_weight = 1.0f / accum_weight;
-      out[out_index + 0] = accum[0] * inv_accum_weight;
-      out[out_index + 1] = accum[1] * inv_accum_weight;
-      out[out_index + 2] = accum[2] * inv_accum_weight;
-      out[out_index + 3] = accum[3] * inv_accum_weight;
+      accum *= (1.0f / accum_weight); /* Renormalize by the weights actually used. */
+      dst[0] = accum[0];
+      dst[1] = accum[1];
+      dst[2] = accum[2];
+      dst[3] = accum[3];
+      dst += 4;
    }
  }
-
-  MEM_freeN(gausstab_y);
-#undef INDEX
-}
-
-static void do_gaussian_blur_effect_float_x(Sequence *seq,
-                                            int start_line,
-                                            int x,
-                                            int y,
-                                            int frame_width,
-                                            int /*frame_height*/,
-                                            float *rect,
-                                            float *out)
-{
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_x = int(data->size_x + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_x;
-  gausstab_x = make_gaussian_blur_kernel(data->size_x, size_x);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-      float accum_weight = 0.0f;
-      for (int current_x = j - size_x; current_x <= j + size_x; current_x++) {
-        if (current_x < 0 || current_x >= frame_width) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(current_x, i + start_line);
-        float weight = gausstab_x[current_x - j + size_x];
-        madd_v4_v4fl(accum, &rect[index], weight);
-        accum_weight += weight;
-      }
-      mul_v4_v4fl(&out[out_index], accum, 1.0f / accum_weight);
-    }
-  }
-
-  MEM_freeN(gausstab_x);
-#undef INDEX
-}
-
-static void do_gaussian_blur_effect_float_y(Sequence *seq,
-                                            int start_line,
-                                            int x,
-                                            int y,
-                                            int /*frame_width*/,
-                                            int frame_height,
-                                            float *rect,
-                                            float *out)
-{
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_y = int(data->size_y + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_y;
-  gausstab_y = make_gaussian_blur_kernel(data->size_y, size_y);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-      float accum_weight = 0.0f;
-      for (int current_y = i - size_y; current_y <= i + size_y; current_y++) {
-        if (current_y < -start_line || current_y + start_line >= frame_height) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(j, current_y + start_line);
-        float weight = gausstab_y[current_y - i + size_y];
-        madd_v4_v4fl(accum, &rect[index], weight);
-        accum_weight += weight;
-      }
-      mul_v4_v4fl(&out[out_index], accum, 1.0f / accum_weight);
-    }
-  }
-
-  MEM_freeN(gausstab_y);
-#undef INDEX
-}
-
-static void do_gaussian_blur_effect_x_cb(const SeqRenderData *context,
-                                         Sequence *seq,
-                                         ImBuf *ibuf,
-                                         int start_line,
-                                         int total_lines,
-                                         ImBuf *out)
-{
-  if (out->float_buffer.data) {
-    float *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_float_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_float_x(seq,
-                                    start_line,
-                                    context->rectx,
-                                    total_lines,
-                                    context->rectx,
-                                    context->recty,
-                                    ibuf->float_buffer.data,
-                                    rect_out);
-  }
-  else {
-    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_byte_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_byte_x(seq,
-                                   start_line,
-                                   context->rectx,
-                                   total_lines,
-                                   context->rectx,
-                                   context->recty,
-                                   ibuf->byte_buffer.data,
-                                   rect_out);
-  }
-}
-
-static void do_gaussian_blur_effect_y_cb(const SeqRenderData *context,
-                                         Sequence *seq,
-                                         ImBuf *ibuf,
-                                         int start_line,
-                                         int total_lines,
-                                         ImBuf *out)
-{
-  if (out->float_buffer.data) {
-    float *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_float_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_float_y(seq,
-                                    start_line,
-                                    context->rectx,
-                                    total_lines,
-                                    context->rectx,
-                                    context->recty,
-                                    ibuf->float_buffer.data,
-                                    rect_out);
-  }
-  else {
-    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_byte_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_byte_y(seq,
-                                   start_line,
-                                   context->rectx,
-                                   total_lines,
-                                   context->rectx,
-                                   context->recty,
-                                   ibuf->byte_buffer.data,
-                                   rect_out);
-  }
-}
-
-struct RenderGaussianBlurEffectInitData {
-  const SeqRenderData *context;
-  Sequence *seq;
-  ImBuf *ibuf;
-  ImBuf *out;
-};
-
-struct RenderGaussianBlurEffectThread {
-  const SeqRenderData *context;
-  Sequence *seq;
-  ImBuf *ibuf;
-  ImBuf *out;
-  int start_line, tot_line;
-};
-
-static void render_effect_execute_init_handle(void *handle_v,
-                                              int start_line,
-                                              int tot_line,
-                                              void *init_data_v)
-{
-  RenderGaussianBlurEffectThread *handle = (RenderGaussianBlurEffectThread *)handle_v;
-  RenderGaussianBlurEffectInitData *init_data = (RenderGaussianBlurEffectInitData *)init_data_v;
-
-  handle->context = init_data->context;
-  handle->seq = init_data->seq;
-  handle->ibuf = init_data->ibuf;
-  handle->out = init_data->out;
-
-  handle->start_line = start_line;
-  handle->tot_line = tot_line;
-}
-
-static void *render_effect_execute_do_x_thread(void *thread_data_v)
-{
-  RenderGaussianBlurEffectThread *thread_data = (RenderGaussianBlurEffectThread *)thread_data_v;
-  do_gaussian_blur_effect_x_cb(thread_data->context,
-                               thread_data->seq,
-                               thread_data->ibuf,
-                               thread_data->start_line,
-                               thread_data->tot_line,
-                               thread_data->out);
-  return nullptr;
-}
-
-static void *render_effect_execute_do_y_thread(void *thread_data_v)
-{
-  RenderGaussianBlurEffectThread *thread_data = (RenderGaussianBlurEffectThread *)thread_data_v;
-  do_gaussian_blur_effect_y_cb(thread_data->context,
-                               thread_data->seq,
-                               thread_data->ibuf,
-                               thread_data->start_line,
-                               thread_data->tot_line,
-                               thread_data->out);
-
-  return nullptr;
 }

 static ImBuf *do_gaussian_blur_effect(const SeqRenderData *context,
@@ -2935,32 +2470,75 @@ static ImBuf *do_gaussian_blur_effect(const SeqRenderData *context,
                                      ImBuf * /*ibuf2*/,
                                      ImBuf * /*ibuf3*/)
 {
+  using namespace blender;
+
+  /* Create blur kernel weights. */
+  const GaussianBlurVars *data = static_cast<const GaussianBlurVars *>(seq->effectdata);
+  const int half_size_x = int(data->size_x + 0.5f);
+  const int half_size_y = int(data->size_y + 0.5f);
+  Array<float> gausstab_x = make_gaussian_blur_kernel(data->size_x, half_size_x);
+  Array<float> gausstab_y = make_gaussian_blur_kernel(data->size_y, half_size_y);
+
+  const int width = context->rectx;
+  const int height = context->recty;
+  const bool is_float = ibuf1->float_buffer.data;
+
+  /* Horizontal blur: create output, blur ibuf1 into it. */
  ImBuf *out = prepare_effect_imbufs(context, ibuf1, nullptr, nullptr);
+  threading::parallel_for(IndexRange(context->recty), 32, [&](const IndexRange y_range) {
+    const int y_first = y_range.first();
+    const int y_size = y_range.size();
+    if (is_float) {
+      gaussian_blur_x(gausstab_x,
+                      half_size_x,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->float_buffer.data,
+                      out->float_buffer.data);
+    }
+    else {
+      gaussian_blur_x(gausstab_x,
+                      half_size_x,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->byte_buffer.data,
+                      out->byte_buffer.data);
+    }
+  });

-  RenderGaussianBlurEffectInitData init_data;
-
-  init_data.context = context;
-  init_data.seq = seq;
-  init_data.ibuf = ibuf1;
-  init_data.out = out;
-
-  IMB_processor_apply_threaded(out->y,
-                               sizeof(RenderGaussianBlurEffectThread),
-                               &init_data,
-                               render_effect_execute_init_handle,
-                               render_effect_execute_do_x_thread);
-
+  /* Vertical blur: create output, blur previous output into it. */
  ibuf1 = out;
-  init_data.ibuf = ibuf1;
  out = prepare_effect_imbufs(context, ibuf1, nullptr, nullptr);
-  init_data.out = out;
-
-  IMB_processor_apply_threaded(out->y,
-                               sizeof(RenderGaussianBlurEffectThread),
-                               &init_data,
-                               render_effect_execute_init_handle,
-                               render_effect_execute_do_y_thread);
+  threading::parallel_for(IndexRange(context->recty), 32, [&](const IndexRange y_range) {
+    const int y_first = y_range.first();
+    const int y_size = y_range.size();
+    if (is_float) {
+      gaussian_blur_y(gausstab_y,
+                      half_size_y,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->float_buffer.data,
+                      out->float_buffer.data);
+    }
+    else {
+      gaussian_blur_y(gausstab_y,
+                      half_size_y,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->byte_buffer.data,
+                      out->byte_buffer.data);
+    }
+  });

+  /* Free the first output. */
  IMB_freeImBuf(ibuf1);

  return out;