VSE: Speed up Subsampled 3x3 image filter

Make the Subsampling 3x3 filter about twice as fast (at 4K UHD resolution on Windows/VS2022/Ryzen 5950X: 52.7ms -> 28.3ms) by reformulating how it works. Conceptually, the Subsampling filter is a box filter: it sums up N source image pixels, computes their average, and outputs the result. The critical point is that this must be done in premultiplied-alpha space, so that colors from fully or mostly transparent regions do not "override" opaque colors. Previously, when operating on byte images, the code achieved this by always working on byte values: it did a "progressively smaller" lerp into the byte color result, taking care of premultiplication and again storing the result with "straight" alpha for each sample being processed. This meant that each sample involved 3 divisions! It also led to some precision loss, since for all 9 samples the intermediate results were only stored at byte precision. Reformulate this by simply accumulating the premultiplied color as a float. This gets rid of all divisions except in the last step, when said float needs to be written back into a byte color. The unit test results differ slightly, since the output is now arguably more accurate (as described above, the previous code had some precision loss). Pull Request: https://projects.blender.org/blender/blender/pulls/117125
2024-01-17 10:26:50 +01:00
parent 11c2028795
commit b85fecee67
2 changed files with 50 additions and 61 deletions
--- a/source/blender/imbuf/intern/transform.cc
+++ b/source/blender/imbuf/intern/transform.cc
@@ -134,26 +134,6 @@ static float wrap_uv(float value, int size)
  return x;
 }

-template<typename T, int NumChannels>
-static void add_subsample(const T *src, T *dst, int sample_number)
-{
-  BLI_STATIC_ASSERT((is_same_any_v<T, uchar, float>), "Only uchar and float channels supported.");
-
-  float factor = 1.0 / (sample_number + 1);
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(NumChannels == 4, "Pixels using uchar requires to have 4 channels.");
-    blend_color_interpolate_byte(dst, dst, src, factor);
-  }
-  else if constexpr (std::is_same_v<T, float> && NumChannels == 4) {
-    blend_color_interpolate_float(dst, dst, src, factor);
-  }
-  else if constexpr (std::is_same_v<T, float>) {
-    for (int i : IndexRange(NumChannels)) {
-      dst[i] = dst[i] * (1.0f - factor) + src[i] * factor;
-    }
-  }
-}
-
 template<int NumChannels>
 static void sample_nearest_float(const ImBuf *source, float u, float v, float *r_sample)
 {
@@ -235,39 +215,48 @@ static void sample_image(const ImBuf *source, float u, float v, T *r_sample)
  }
 }

-template<typename T, int SrcChannels> static void store_sample(const T *sample, T *dst)
+static void add_subsample(const float src[4], float dst[4])
 {
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
-    copy_v4_v4_uchar(dst, sample);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
-    copy_v4_v4(dst, sample);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 3) {
-    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 2) {
-    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 1) {
-    /* Note: single channel sample is stored as grayscale. */
-    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
-  }
-  else {
-    BLI_assert_unreachable();
-  }
+  add_v4_v4(dst, src);
 }

-template<typename T, int SrcChannels>
-static void mix_and_store_sample(const T *sample, T *dst, const float mix_factor)
+static void add_subsample(const uchar src[4], float dst[4])
 {
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
-    blend_color_interpolate_byte(dst, dst, sample, mix_factor);
+  float premul[4];
+  straight_uchar_to_premul_float(premul, src);
+  add_v4_v4(dst, premul);
+}
+
+static void store_premul_float_sample(const float sample[4], float dst[4])
+{
+  copy_v4_v4(dst, sample);
+}
+
+static void store_premul_float_sample(const float sample[4], uchar dst[4])
+{
+  premul_float_to_straight_uchar(dst, sample);
+}
+
+template<int SrcChannels> static void store_sample(const uchar *sample, uchar *dst)
+{
+  BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
+  copy_v4_v4_uchar(dst, sample);
+}
+
+template<int SrcChannels> static void store_sample(const float *sample, float *dst)
+{
+  if constexpr (SrcChannels == 4) {
+    copy_v4_v4(dst, sample);
  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
-    blend_color_interpolate_float(dst, dst, sample, mix_factor);
+  else if constexpr (SrcChannels == 3) {
+    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
+  }
+  else if constexpr (SrcChannels == 2) {
+    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
+  }
+  else if constexpr (SrcChannels == 1) {
+    /* Note: single channel sample is stored as grayscale. */
+    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
  }
  else {
    BLI_assert_unreachable();
@@ -286,29 +275,29 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
  float2 uv_start = ctx.start_uv + ctx.add_x * 0.5f + ctx.add_y * 0.5f;

  if (ctx.subsampling_deltas.size() > 1) {
-    /* Multiple samples per pixel. */
+    /* Multiple samples per pixel: accumulate them premultiplied,
+     * divide by sample count and write out (un-premultiplying if writing out
+     * to byte image). */
+    const float inv_count = 1.0f / ctx.subsampling_deltas.size();
    for (int yi : y_range) {
      T *output = init_pixel_pointer<T>(ctx.dst, ctx.dst_region_x_range.first(), yi);
      float2 uv_row = uv_start + yi * ctx.add_y;
      for (int xi : ctx.dst_region_x_range) {
        float2 uv = uv_row + xi * ctx.add_x;
-        T sample[4] = {};
-        int num_subsamples_added = 0;
+        float sample[4] = {};

        for (const float2 &delta_uv : ctx.subsampling_deltas) {
          const float2 sub_uv = uv + delta_uv;
          if (!CropSource || !should_discard(ctx, sub_uv)) {
            T sub_sample[4];
            sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, sub_uv.x, sub_uv.y, sub_sample);
-            add_subsample<T, SrcChannels>(sub_sample, sample, num_subsamples_added);
-            num_subsamples_added += 1;
+            add_subsample(sub_sample, sample);
          }
        }

-        if (num_subsamples_added != 0) {
-          const float mix_weight = float(num_subsamples_added) / ctx.subsampling_deltas.size();
-          mix_and_store_sample<T, SrcChannels>(sample, output, mix_weight);
-        }
+        mul_v4_v4fl(sample, sample, inv_count);
+        store_premul_float_sample(sample, output);
+
        output += 4;
      }
    }
@@ -323,7 +312,7 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
        if (!CropSource || !should_discard(ctx, uv)) {
          T sample[4];
          sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, uv.x, uv.y, sample);
-          store_sample<T, SrcChannels>(sample, output);
+          store_sample<SrcChannels>(sample, output);
        }
        output += 4;
      }
--- a/source/blender/imbuf/intern/transform_test.cc
+++ b/source/blender/imbuf/intern/transform_test.cc
@@ -71,9 +71,9 @@ TEST(imbuf_transform, nearest_subsample3_2x_smaller)
 {
  ImBuf *res = transform_2x_smaller(IMB_FILTER_NEAREST, 3);
  const ColorTheme4b *got = reinterpret_cast<ColorTheme4b *>(res->byte_buffer.data);
-  EXPECT_EQ(got[0], ColorTheme4b(226, 168, 113, 255));
-  EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 16));
-  EXPECT_EQ(got[2], ColorTheme4b(55, 22, 64, 254));
+  EXPECT_EQ(got[0], ColorTheme4b(227, 170, 113, 255));
+  EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 17));
+  EXPECT_EQ(got[2], ColorTheme4b(56, 22, 64, 253));
  IMB_freeImBuf(res);
 }