From b85fecee673884c8ff660c302eec21537e91b9ff Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Wed, 17 Jan 2024 10:26:50 +0100
Subject: [PATCH] VSE: Speed up Subsampled 3x3 image filter

Make the Subsampling 3x3 filter almost twice as fast (at 4K UHD resolution,
Windows/VS2022/Ryzen5950X: 52.7ms -> 28.3ms) by reformulating how it works:

Conceptually, the Subsampling filter is a box filter: it sums up N source image
pixels, computes their average and outputs the result. The critical thing is
that this should be done in premultiplied space, so that colors from fully or
mostly transparent regions do not "override" opaque colors.

Previously, when operating on byte images, the code achieved this by always
working on byte values, doing a "progressively smaller" lerp into the byte
color result, taking care of premultiplication and storing the "straight"
alpha again for each sample being processed. This meant that for each sample,
there were 3 divisions involved! This also led to some precision loss, since
for all 9 samples the intermediate results were only stored at byte precision.

Reformulate that by simply accumulating the premultiplied color as a float.
This gets rid of all divisions, except the last step when said float needs to
be written back into a byte color.

The expected unit test values change slightly, since the new result is arguably
more accurate (as noted above, the previous code lost some precision).

Pull Request: https://projects.blender.org/blender/blender/pulls/117125
---
 source/blender/imbuf/intern/transform.cc      | 105 ++++++++----------
 source/blender/imbuf/intern/transform_test.cc |   6 +-
 2 files changed, 50 insertions(+), 61 deletions(-)
diff --git a/source/blender/imbuf/intern/transform.cc b/source/blender/imbuf/intern/transform.cc
index f531f527df4..3629e23e911 100644
--- a/source/blender/imbuf/intern/transform.cc
+++ b/source/blender/imbuf/intern/transform.cc
@@ -134,26 +134,6 @@ static float wrap_uv(float value, int size)
   return x;
 }
 
-template<typename T, int NumChannels>
-static void add_subsample(const T *src, T *dst, int sample_number)
-{
-  BLI_STATIC_ASSERT((is_same_any_v<T, uchar, float>), "Only uchar and float channels supported.");
-
-  float factor = 1.0 / (sample_number + 1);
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(NumChannels == 4, "Pixels using uchar requires to have 4 channels.");
-    blend_color_interpolate_byte(dst, dst, src, factor);
-  }
-  else if constexpr (std::is_same_v<T, float> && NumChannels == 4) {
-    blend_color_interpolate_float(dst, dst, src, factor);
-  }
-  else if constexpr (std::is_same_v<T, float>) {
-    for (int i : IndexRange(NumChannels)) {
-      dst[i] = dst[i] * (1.0f - factor) + src[i] * factor;
-    }
-  }
-}
-
 template<int NumChannels>
 static void sample_nearest_float(const ImBuf *source, float u, float v, float *r_sample)
 {
@@ -235,39 +215,48 @@ static void sample_image(const ImBuf *source, float u, float v, T *r_sample)
   }
 }
 
-template<typename T, int SrcChannels> static void store_sample(const T *sample, T *dst)
+static void add_subsample(const float src[4], float dst[4])
 {
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
-    copy_v4_v4_uchar(dst, sample);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
-    copy_v4_v4(dst, sample);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 3) {
-    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 2) {
-    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 1) {
-    /* Note: single channel sample is stored as grayscale. */
-    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
-  }
-  else {
-    BLI_assert_unreachable();
-  }
+  add_v4_v4(dst, src);
 }
 
-template<typename T, int SrcChannels>
-static void mix_and_store_sample(const T *sample, T *dst, const float mix_factor)
+static void add_subsample(const uchar src[4], float dst[4])
 {
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
-    blend_color_interpolate_byte(dst, dst, sample, mix_factor);
+  float premul[4];
+  straight_uchar_to_premul_float(premul, src);
+  add_v4_v4(dst, premul);
+}
+
+static void store_premul_float_sample(const float sample[4], float dst[4])
+{
+  copy_v4_v4(dst, sample);
+}
+
+static void store_premul_float_sample(const float sample[4], uchar dst[4])
+{
+  premul_float_to_straight_uchar(dst, sample);
+}
+
+template<int SrcChannels> static void store_sample(const uchar *sample, uchar *dst)
+{
+  BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
+  copy_v4_v4_uchar(dst, sample);
+}
+
+template<int SrcChannels> static void store_sample(const float *sample, float *dst)
+{
+  if constexpr (SrcChannels == 4) {
+    copy_v4_v4(dst, sample);
   }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
-    blend_color_interpolate_float(dst, dst, sample, mix_factor);
+  else if constexpr (SrcChannels == 3) {
+    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
+  }
+  else if constexpr (SrcChannels == 2) {
+    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
+  }
+  else if constexpr (SrcChannels == 1) {
+    /* Note: single channel sample is stored as grayscale. */
+    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
   }
   else {
     BLI_assert_unreachable();
@@ -286,29 +275,29 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
   float2 uv_start = ctx.start_uv + ctx.add_x * 0.5f + ctx.add_y * 0.5f;
 
   if (ctx.subsampling_deltas.size() > 1) {
-    /* Multiple samples per pixel. */
+    /* Multiple samples per pixel: accumulate them premultiplied,
+     * divide by sample count and write out (un-premultiplying if writing out
+     * to byte image). */
+    const float inv_count = 1.0f / ctx.subsampling_deltas.size();
     for (int yi : y_range) {
       T *output = init_pixel_pointer<T>(ctx.dst, ctx.dst_region_x_range.first(), yi);
       float2 uv_row = uv_start + yi * ctx.add_y;
       for (int xi : ctx.dst_region_x_range) {
         float2 uv = uv_row + xi * ctx.add_x;
-        T sample[4] = {};
-        int num_subsamples_added = 0;
+        float sample[4] = {};
 
         for (const float2 &delta_uv : ctx.subsampling_deltas) {
           const float2 sub_uv = uv + delta_uv;
           if (!CropSource || !should_discard(ctx, sub_uv)) {
             T sub_sample[4];
             sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, sub_uv.x, sub_uv.y, sub_sample);
-            add_subsample<T, SrcChannels>(sub_sample, sample, num_subsamples_added);
-            num_subsamples_added += 1;
+            add_subsample(sub_sample, sample);
           }
         }
 
-        if (num_subsamples_added != 0) {
-          const float mix_weight = float(num_subsamples_added) / ctx.subsampling_deltas.size();
-          mix_and_store_sample<T, SrcChannels>(sample, output, mix_weight);
-        }
+        mul_v4_v4fl(sample, sample, inv_count);
+        store_premul_float_sample(sample, output);
+
         output += 4;
       }
     }
@@ -323,7 +312,7 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
         if (!CropSource || !should_discard(ctx, uv)) {
           T sample[4];
           sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, uv.x, uv.y, sample);
-          store_sample<T, SrcChannels>(sample, output);
+          store_sample<SrcChannels>(sample, output);
         }
         output += 4;
       }
diff --git a/source/blender/imbuf/intern/transform_test.cc b/source/blender/imbuf/intern/transform_test.cc
index f513cff1285..816926b5d88 100644
--- a/source/blender/imbuf/intern/transform_test.cc
+++ b/source/blender/imbuf/intern/transform_test.cc
@@ -71,9 +71,9 @@ TEST(imbuf_transform, nearest_subsample3_2x_smaller)
 {
   ImBuf *res = transform_2x_smaller(IMB_FILTER_NEAREST, 3);
   const ColorTheme4b *got = reinterpret_cast<ColorTheme4b *>(res->byte_buffer.data);
-  EXPECT_EQ(got[0], ColorTheme4b(226, 168, 113, 255));
-  EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 16));
-  EXPECT_EQ(got[2], ColorTheme4b(55, 22, 64, 254));
+  EXPECT_EQ(got[0], ColorTheme4b(227, 170, 113, 255));
+  EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 17));
+  EXPECT_EQ(got[2], ColorTheme4b(56, 22, 64, 253));
   IMB_freeImBuf(res);
 }