ImBuf: optimize IMB_transform

IMB_transform is used by Sequencer (and other places) to do image translation/rotation/scale on the CPU. This PR speeds up parts of it, particularly when bilinear filtering is used. No behavior changes are expected.

- Don't use virtual function calls inside the inner loop. The code was using class hierarchies with virtual calls just to do the equivalent of "outside of image? ignore" and "wrap UV coordinates or not?" decisions. Make those use non-virtual function based code.
- Simplify pixel sampling functions to only do the work as needed by anything within Blender codebase. For example, bilinear sampling of uchar images always uses 4 RGBA channels and never does "UV wrap" logic.
- Bilinear interpolation uchar: completely branchless SIMD code now.
- Bilinear interpolation float: 2x floor() calls instead of 4x floor() + 2x ceil(), and final sample blending is done with SIMD.

Sequencer at 4K UHD resolution, with two image strips that need a transform, playback framerate:

- Windows Ryzen 5950X: 18.7fps -> 26.2fps (IMB_transform time per frame goes 26.3ms -> 11.2ms)
- Mac M1 Max: 27.3fps -> 31.4fps

At that point the IMB_transform is not the slowest part of where playback takes time (but rather sequencer effect application etc.).

Note: the amount of _actual code_ got a bit smaller. But I've added 100 lines of unit tests in BLI_math_interp_test.cc, the bilinear interpolation functions were only tested very indirectly by CPU compositor template image tests.

Pull Request: https://projects.blender.org/blender/blender/pulls/115653
2023-12-14 15:10:30 +01:00
parent a52a362527
commit 1e0bf33b00
8 changed files with 425 additions and 358 deletions
--- a/source/blender/blenlib/BLI_math_interp.h
+++ b/source/blender/blenlib/BLI_math_interp.h
@@ -15,24 +15,14 @@ extern "C" {
 void BLI_bicubic_interpolation_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v);

-void BLI_bicubic_interpolation_char(const unsigned char *buffer,
-                                    unsigned char *output,
-                                    int width,
-                                    int height,
-                                    int components,
-                                    float u,
-                                    float v);
+void BLI_bicubic_interpolation_char(
+    const unsigned char *buffer, unsigned char *output, int width, int height, float u, float v);

 void BLI_bilinear_interpolation_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v);

-void BLI_bilinear_interpolation_char(const unsigned char *buffer,
-                                     unsigned char *output,
-                                     int width,
-                                     int height,
-                                     int components,
-                                     float u,
-                                     float v);
+void BLI_bilinear_interpolation_char(
+    const unsigned char *buffer, unsigned char *output, int width, int height, float u, float v);

 void BLI_bilinear_interpolation_wrap_fl(const float *buffer,
                                        float *output,
@@ -44,16 +34,6 @@ void BLI_bilinear_interpolation_wrap_fl(const float *buffer,
                                        bool wrap_x,
                                        bool wrap_y);

-void BLI_bilinear_interpolation_wrap_char(const unsigned char *buffer,
-                                          unsigned char *output,
-                                          int width,
-                                          int height,
-                                          int components,
-                                          float u,
-                                          float v,
-                                          bool wrap_x,
-                                          bool wrap_y);
-
 #define EWA_MAXIDX 255
 extern const float EWA_WTS[EWA_MAXIDX + 1];

--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -524,6 +524,7 @@ if(WITH_GTESTS)
    tests/BLI_math_bits_test.cc
    tests/BLI_math_color_test.cc
    tests/BLI_math_geom_test.cc
+    tests/BLI_math_interp_test.cc
    tests/BLI_math_matrix_test.cc
    tests/BLI_math_matrix_types_test.cc
    tests/BLI_math_rotation_test.cc
--- a/source/blender/blenlib/intern/math_interp.c
+++ b/source/blender/blenlib/intern/math_interp.c
@@ -7,12 +7,18 @@
 */

 #include <math.h>
+#include <string.h>

 #include "BLI_math_base.h"
 #include "BLI_math_interp.h"
 #include "BLI_math_vector.h"
+#include "BLI_simd.h"
 #include "BLI_strict_flags.h"

+#if BLI_HAVE_SSE2 && defined(__SSE4_1__)
+#  include <smmintrin.h> /* _mm_floor_ps */
+#endif
+
 /**************************************************************************
 *                            INTERPOLATIONS
 *
@@ -236,221 +242,298 @@ void BLI_bicubic_interpolation_fl(
 }

 void BLI_bicubic_interpolation_char(
-    const uchar *buffer, uchar *output, int width, int height, int components, float u, float v)
+    const uchar *buffer, uchar *output, int width, int height, float u, float v)
 {
-  bicubic_interpolation(buffer, NULL, output, NULL, width, height, components, u, v);
+  bicubic_interpolation(buffer, NULL, output, NULL, width, height, 4, u, v);
 }

 /* BILINEAR INTERPOLATION */
-BLI_INLINE void bilinear_interpolation(const uchar *byte_buffer,
-                                       const float *float_buffer,
-                                       uchar *byte_output,
-                                       float *float_output,
-                                       int width,
-                                       int height,
-                                       int components,
-                                       float u,
-                                       float v,
-                                       bool wrap_x,
-                                       bool wrap_y)
+BLI_INLINE void bilinear_interpolation_fl(const float *float_buffer,
+                                          float *float_output,
+                                          int width,
+                                          int height,
+                                          int components,
+                                          float u,
+                                          float v,
+                                          bool wrap_x,
+                                          bool wrap_y)
 {
  float a, b;
  float a_b, ma_b, a_mb, ma_mb;
  int y1, y2, x1, x2;

-  /* ImBuf in must have a valid rect or rect_float, assume this is already checked */
+  float uf = floorf(u);
+  float vf = floorf(v);

-  x1 = (int)floor(u);
-  x2 = (int)ceil(u);
-  y1 = (int)floor(v);
-  y2 = (int)ceil(v);
+  x1 = (int)uf;
+  x2 = x1 + 1;
+  y1 = (int)vf;
+  y2 = y1 + 1;

-  if (float_output) {
-    const float *row1, *row2, *row3, *row4;
-    const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  const float *row1, *row2, *row3, *row4;
+  const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};

-    /* pixel value must be already wrapped, however values at boundaries may flip */
-    if (wrap_x) {
-      if (x1 < 0) {
-        x1 = width - 1;
-      }
-      if (x2 >= width) {
-        x2 = 0;
-      }
+  /* pixel value must be already wrapped, however values at boundaries may flip */
+  if (wrap_x) {
+    if (x1 < 0) {
+      x1 = width - 1;
    }
-    else if (x2 < 0 || x1 >= width) {
-      copy_vn_fl(float_output, components, 0.0f);
-      return;
+    if (x2 >= width) {
+      x2 = 0;
    }
+  }
+  else if (x2 < 0 || x1 >= width) {
+    copy_vn_fl(float_output, components, 0.0f);
+    return;
+  }

-    if (wrap_y) {
-      if (y1 < 0) {
-        y1 = height - 1;
-      }
-      if (y2 >= height) {
-        y2 = 0;
-      }
+  if (wrap_y) {
+    if (y1 < 0) {
+      y1 = height - 1;
    }
-    else if (y2 < 0 || y1 >= height) {
-      copy_vn_fl(float_output, components, 0.0f);
-      return;
+    if (y2 >= height) {
+      y2 = 0;
    }
+  }
+  else if (y2 < 0 || y1 >= height) {
+    copy_vn_fl(float_output, components, 0.0f);
+    return;
+  }

-    /* sample including outside of edges of image */
-    if (x1 < 0 || y1 < 0) {
-      row1 = empty;
-    }
-    else {
-      row1 = float_buffer + width * y1 * components + components * x1;
-    }
-
-    if (x1 < 0 || y2 > height - 1) {
-      row2 = empty;
-    }
-    else {
-      row2 = float_buffer + width * y2 * components + components * x1;
-    }
-
-    if (x2 > width - 1 || y1 < 0) {
-      row3 = empty;
-    }
-    else {
-      row3 = float_buffer + width * y1 * components + components * x2;
-    }
-
-    if (x2 > width - 1 || y2 > height - 1) {
-      row4 = empty;
-    }
-    else {
-      row4 = float_buffer + width * y2 * components + components * x2;
-    }
-
-    a = u - floorf(u);
-    b = v - floorf(v);
-    a_b = a * b;
-    ma_b = (1.0f - a) * b;
-    a_mb = a * (1.0f - b);
-    ma_mb = (1.0f - a) * (1.0f - b);
-
-    if (components == 1) {
-      float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
-    }
-    else if (components == 3) {
-      float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
-      float_output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
-      float_output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
-    }
-    else {
-      float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
-      float_output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
-      float_output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
-      float_output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
-    }
+  /* sample including outside of edges of image */
+  if (x1 < 0 || y1 < 0) {
+    row1 = empty;
  }
  else {
-    const uchar *row1, *row2, *row3, *row4;
-    uchar empty[4] = {0, 0, 0, 0};
-
-    /* pixel value must be already wrapped, however values at boundaries may flip */
-    if (wrap_x) {
-      if (x1 < 0) {
-        x1 = width - 1;
-      }
-      if (x2 >= width) {
-        x2 = 0;
-      }
-    }
-    else if (x2 < 0 || x1 >= width) {
-      copy_vn_uchar(byte_output, components, 0);
-      return;
-    }
-
-    if (wrap_y) {
-      if (y1 < 0) {
-        y1 = height - 1;
-      }
-      if (y2 >= height) {
-        y2 = 0;
-      }
-    }
-    else if (y2 < 0 || y1 >= height) {
-      copy_vn_uchar(byte_output, components, 0);
-      return;
-    }
-
-    /* sample including outside of edges of image */
-    if (x1 < 0 || y1 < 0) {
-      row1 = empty;
-    }
-    else {
-      row1 = byte_buffer + width * y1 * components + components * x1;
-    }
-
-    if (x1 < 0 || y2 > height - 1) {
-      row2 = empty;
-    }
-    else {
-      row2 = byte_buffer + width * y2 * components + components * x1;
-    }
-
-    if (x2 > width - 1 || y1 < 0) {
-      row3 = empty;
-    }
-    else {
-      row3 = byte_buffer + width * y1 * components + components * x2;
-    }
-
-    if (x2 > width - 1 || y2 > height - 1) {
-      row4 = empty;
-    }
-    else {
-      row4 = byte_buffer + width * y2 * components + components * x2;
-    }
-
-    a = u - floorf(u);
-    b = v - floorf(v);
-    a_b = a * b;
-    ma_b = (1.0f - a) * b;
-    a_mb = a * (1.0f - b);
-    ma_mb = (1.0f - a) * (1.0f - b);
-
-    if (components == 1) {
-      byte_output[0] = (uchar)(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] +
-                               0.5f);
-    }
-    else if (components == 3) {
-      byte_output[0] = (uchar)(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] +
-                               0.5f);
-      byte_output[1] = (uchar)(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] +
-                               0.5f);
-      byte_output[2] = (uchar)(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] +
-                               0.5f);
-    }
-    else {
-      byte_output[0] = (uchar)(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] +
-                               0.5f);
-      byte_output[1] = (uchar)(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] +
-                               0.5f);
-      byte_output[2] = (uchar)(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] +
-                               0.5f);
-      byte_output[3] = (uchar)(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] +
-                               0.5f);
-    }
+    row1 = float_buffer + width * y1 * components + components * x1;
  }
+
+  if (x1 < 0 || y2 > height - 1) {
+    row2 = empty;
+  }
+  else {
+    row2 = float_buffer + width * y2 * components + components * x1;
+  }
+
+  if (x2 > width - 1 || y1 < 0) {
+    row3 = empty;
+  }
+  else {
+    row3 = float_buffer + width * y1 * components + components * x2;
+  }
+
+  if (x2 > width - 1 || y2 > height - 1) {
+    row4 = empty;
+  }
+  else {
+    row4 = float_buffer + width * y2 * components + components * x2;
+  }
+
+  a = u - uf;
+  b = v - vf;
+  a_b = a * b;
+  ma_b = (1.0f - a) * b;
+  a_mb = a * (1.0f - b);
+  ma_mb = (1.0f - a) * (1.0f - b);
+
+  if (components == 1) {
+    float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
+  }
+  else if (components == 3) {
+    float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
+    float_output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
+    float_output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
+  }
+  else {
+#if BLI_HAVE_SSE2
+    __m128 rgba1 = _mm_loadu_ps(row1);
+    __m128 rgba2 = _mm_loadu_ps(row2);
+    __m128 rgba3 = _mm_loadu_ps(row3);
+    __m128 rgba4 = _mm_loadu_ps(row4);
+    rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
+    rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
+    rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
+    rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
+    __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
+    __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
+    __m128 rgba = _mm_add_ps(rgba13, rgba24);
+    _mm_storeu_ps(float_output, rgba);
+#else
+    float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
+    float_output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
+    float_output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
+    float_output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
+#endif
+  }
+}
+
+/**
+ * Bilinearly sample a 4-channel (RGBA) uchar image at the float pixel
+ * coordinate (u, v) and write the rounded result to the 4 bytes at `output`.
+ *
+ * `buffer` is read as `width * height` tightly packed 4-byte pixels, row by
+ * row. The four pixels around (floor(u), floor(v)) are blended; pixels that
+ * fall outside of the image contribute black (all zeroes), so a sample fully
+ * outside of the image produces zeroes. No UV wrapping is done.
+ */
+void BLI_bilinear_interpolation_char(
+    const uchar *buffer, uchar *output, int width, int height, float u, float v)
+{
+#if BLI_HAVE_SSE2
+  /* Bilinear interpolation needs to read and blend four image pixels, while
+   * also handling conditions of sample coordinate being outside of the
+   * image, in which case black (all zeroes) should be used as the sample
+   * contribution.
+   *
+   * Code below does all that without any branches, by making outside the
+   * image sample locations still read the first pixel of the image, but
+   * later making sure that the result is set to zero for that sample. */
+
+  __m128 uvuv = _mm_set_ps(v, u, v, u);
+
+#  if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(WITH_SSE2NEON))
+  /* If we're on SSE4 or ARM NEON, just use the simple floor() way. */
+  __m128 uvuv_floor = _mm_floor_ps(uvuv);
+#  else
+  /* The hard way: truncate, for negative inputs this will round towards zero.
+   * Then compare with input UV, and subtract 1 for the inputs that ended up
+   * below their truncated value (negative, non-integer inputs). */
+  __m128 uv_trunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(uvuv));
+  __m128 uv_neg = _mm_cmplt_ps(uvuv, uv_trunc);
+  __m128 uvuv_floor = _mm_sub_ps(uv_trunc, _mm_and_ps(uv_neg, _mm_set1_ps(1.0f)));
+#  endif
+
+  /* x1, y1, x2, y2 */
+  __m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
+  /* Check whether any of the coordinates are outside of the image. */
+  __m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
+                                       _mm_set1_epi32(1));
+  __m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
+  __m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
+  __m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);
+
+  /* Samples 1,2,3,4 are in this order: x1y1, x1y2, x2y1, x2y2 */
+  __m128i x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
+  __m128i y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
+  __m128i invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
+                                      _mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
+  /* Set x & y to zero for invalid samples. */
+  x1234 = _mm_andnot_si128(invalid_1234, x1234);
+  y1234 = _mm_andnot_si128(invalid_1234, y1234);
+
+  /* Read the four sample values. Do address calculations in C, since SSE
+   * before 4.1 makes it very cumbersome to do full integer multiplies. */
+  int xcoord[4];
+  int ycoord[4];
+  _mm_storeu_ps((float *)xcoord, _mm_castsi128_ps(x1234));
+  _mm_storeu_ps((float *)ycoord, _mm_castsi128_ps(y1234));
+  int sample1 = ((const int *)buffer)[ycoord[0] * (int64_t)width + xcoord[0]];
+  int sample2 = ((const int *)buffer)[ycoord[1] * (int64_t)width + xcoord[1]];
+  int sample3 = ((const int *)buffer)[ycoord[2] * (int64_t)width + xcoord[2]];
+  int sample4 = ((const int *)buffer)[ycoord[3] * (int64_t)width + xcoord[3]];
+  __m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
+  /* Set samples to black for the ones that were actually invalid. */
+  samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
+
+  /* Expand samples from packed 8-bit RGBA to full floats:
+   * spread to 16 bit values. */
+  __m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
+  __m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
+  /* Spread to 32 bit values and convert to float. */
+  __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
+  __m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
+  __m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
+  __m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));
+
+  /* Calculate interpolation factors: (1-a)*(1-b), (1-a)*b, a*(1-b), a*b */
+  __m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
+  __m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
+  __m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
+  __m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
+                              _mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));
+
+  /* Blend the samples. */
+  rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
+  rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
+  rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
+  rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
+  __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
+  __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
+  __m128 rgba = _mm_add_ps(rgba13, rgba24);
+  rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
+  /* Pack and write to destination: pack to 16 bit signed, then to 8 bit
+   * unsigned, then write resulting 32-bit value. */
+  __m128i rgba32 = _mm_cvttps_epi32(rgba);
+  __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
+  __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
+  _mm_store_ss((float *)output, _mm_castsi128_ps(rgba8));
+
+#else
+
+  const float uf = floorf(u);
+  const float vf = floorf(v);
+
+  const int x1 = (int)uf;
+  const int x2 = x1 + 1;
+  const int y1 = (int)vf;
+  const int y2 = y1 + 1;
+
+  /* Completely outside of the image? Then all four channels are zero. */
+  if (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height) {
+    copy_vn_uchar(output, 4, 0);
+    return;
+  }
+
+  /* Sample including outside of edges of image (those samples read as black).
+   * Offsets are computed in 64 bits, like in the SIMD path, to avoid `int` overflow. */
+  const uchar empty[4] = {0, 0, 0, 0};
+  const uchar *row1, *row2, *row3, *row4;
+  if (x1 < 0 || y1 < 0) {
+    row1 = empty;
+  }
+  else {
+    row1 = buffer + ((int64_t)width * y1 + x1) * 4;
+  }
+
+  if (x1 < 0 || y2 > height - 1) {
+    row2 = empty;
+  }
+  else {
+    row2 = buffer + ((int64_t)width * y2 + x1) * 4;
+  }
+
+  if (x2 > width - 1 || y1 < 0) {
+    row3 = empty;
+  }
+  else {
+    row3 = buffer + ((int64_t)width * y1 + x2) * 4;
+  }
+
+  if (x2 > width - 1 || y2 > height - 1) {
+    row4 = empty;
+  }
+  else {
+    row4 = buffer + ((int64_t)width * y2 + x2) * 4;
+  }
+
+  const float a = u - uf;
+  const float b = v - vf;
+  const float a_b = a * b;
+  const float ma_b = (1.0f - a) * b;
+  const float a_mb = a * (1.0f - b);
+  const float ma_mb = (1.0f - a) * (1.0f - b);
+
+  output[0] = (uchar)(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
+  output[1] = (uchar)(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
+  output[2] = (uchar)(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
+  output[3] = (uchar)(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
+#endif
 }

 void BLI_bilinear_interpolation_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v)
 {
-  bilinear_interpolation(
-      NULL, buffer, NULL, output, width, height, components, u, v, false, false);
-}
-
-void BLI_bilinear_interpolation_char(
-    const uchar *buffer, uchar *output, int width, int height, int components, float u, float v)
-{
-  bilinear_interpolation(
-      buffer, NULL, output, NULL, width, height, components, u, v, false, false);
+  bilinear_interpolation_fl(buffer, output, width, height, components, u, v, false, false);
 }

 void BLI_bilinear_interpolation_wrap_fl(const float *buffer,
@@ -463,22 +546,7 @@ void BLI_bilinear_interpolation_wrap_fl(const float *buffer,
                                        bool wrap_x,
                                        bool wrap_y)
 {
-  bilinear_interpolation(
-      NULL, buffer, NULL, output, width, height, components, u, v, wrap_x, wrap_y);
-}
-
-void BLI_bilinear_interpolation_wrap_char(const uchar *buffer,
-                                          uchar *output,
-                                          int width,
-                                          int height,
-                                          int components,
-                                          float u,
-                                          float v,
-                                          bool wrap_x,
-                                          bool wrap_y)
-{
-  bilinear_interpolation(
-      buffer, NULL, output, NULL, width, height, components, u, v, wrap_x, wrap_y);
+  bilinear_interpolation_fl(buffer, output, width, height, components, u, v, wrap_x, wrap_y);
 }

 /**************************************************************************
--- a/source/blender/blenlib/tests/BLI_math_interp_test.cc
+++ b/source/blender/blenlib/tests/BLI_math_interp_test.cc
@@ -0,0 +1,98 @@
+/* SPDX-FileCopyrightText: 2023 Blender Authors
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#include "testing/testing.h"
+
+#include "BLI_math_interp.h"
+
+static constexpr int image_width = 3;
+static constexpr int image_height = 3;
+static constexpr unsigned char image_char[image_height][image_width][4] = {
+    {{255, 254, 217, 216}, {230, 230, 230, 230}, {240, 160, 90, 20}},
+    {{0, 1, 2, 3}, {62, 72, 82, 92}, {126, 127, 128, 129}},
+    {{1, 2, 3, 4}, {73, 108, 153, 251}, {128, 129, 130, 131}},
+};
+
+TEST(math_interp, BilinearCharExactSamples)
+{
+  unsigned char res[4];
+  unsigned char exp1[4] = {73, 108, 153, 251};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 1.0f, 2.0f);
+  EXPECT_EQ_ARRAY(exp1, res, 4);
+  unsigned char exp2[4] = {240, 160, 90, 20};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 2.0f, 0.0f);
+  EXPECT_EQ_ARRAY(exp2, res, 4);
+}
+
+TEST(math_interp, BilinearCharHalfwayUSamples)
+{
+  unsigned char res[4];
+  unsigned char exp1[4] = {31, 37, 42, 48};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0.5f, 1.0f);
+  EXPECT_EQ_ARRAY(exp1, res, 4);
+  unsigned char exp2[4] = {243, 242, 224, 223};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0.5f, 0.0f);
+  EXPECT_EQ_ARRAY(exp2, res, 4);
+}
+
+TEST(math_interp, BilinearCharHalfwayVSamples)
+{
+  unsigned char res[4];
+  unsigned char exp1[4] = {1, 2, 3, 4};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0.0f, 1.5f);
+  EXPECT_EQ_ARRAY(exp1, res, 4);
+  unsigned char exp2[4] = {127, 128, 129, 130};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 2.0f, 1.5f);
+  EXPECT_EQ_ARRAY(exp2, res, 4);
+}
+
+TEST(math_interp, BilinearCharSamples)
+{
+  unsigned char res[4];
+  unsigned char exp1[4] = {136, 133, 132, 130};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 1.25f, 0.625f);
+  EXPECT_EQ_ARRAY(exp1, res, 4);
+  unsigned char exp2[4] = {219, 191, 167, 142};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 1.4f, 0.1f);
+  EXPECT_EQ_ARRAY(exp2, res, 4);
+}
+
+TEST(math_interp, BilinearCharPartiallyOutsideImage)
+{
+  unsigned char res[4];
+  unsigned char exp1[4] = {1, 1, 2, 2};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, -0.5f, 2.0f);
+  EXPECT_EQ_ARRAY(exp1, res, 4);
+  unsigned char exp2[4] = {9, 11, 15, 22};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 1.25f, 2.9f);
+  EXPECT_EQ_ARRAY(exp2, res, 4);
+  unsigned char exp3[4] = {173, 115, 65, 14};
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 2.2f, -0.1f);
+  EXPECT_EQ_ARRAY(exp3, res, 4);
+}
+
+TEST(math_interp, BilinearCharFullyOutsideImage)
+{
+  unsigned char res[4];
+  unsigned char exp[4] = {0, 0, 0, 0};
+  /* Out of range on U */
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, -1.5f, 0);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, -1.1f, 0);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 3, 0);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 5, 0);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+
+  /* Out of range on V */
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0, -3.2f);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0, -1.5f);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0, 3.1f);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+  BLI_bilinear_interpolation_char(image_char[0][0], res, image_width, image_height, 0, 500.0f);
+  EXPECT_EQ_ARRAY(exp, res, 4);
+}
--- a/source/blender/imbuf/IMB_imbuf.h
+++ b/source/blender/imbuf/IMB_imbuf.h
@@ -682,10 +682,11 @@ void nearest_interpolation_color_wrap(
    const struct ImBuf *in, unsigned char outI[4], float outF[4], float u, float v);
 void bilinear_interpolation_color(
    const struct ImBuf *in, unsigned char outI[4], float outF[4], float u, float v);
-void bilinear_interpolation_color_char(
-    const struct ImBuf *in, unsigned char outI[4], float outF[4], float u, float v);
-void bilinear_interpolation_color_fl(
-    const struct ImBuf *in, unsigned char outI[4], float outF[4], float u, float v);
+void bilinear_interpolation_color_char(const struct ImBuf *in,
+                                       unsigned char outI[4],
+                                       float u,
+                                       float v);
+void bilinear_interpolation_color_fl(const struct ImBuf *in, float outF[4], float u, float v);
 /**
 * Note about wrapping, the u/v still needs to be within the image bounds,
 * just the interpolation is wrapped.
--- a/source/blender/imbuf/intern/imageprocess.cc
+++ b/source/blender/imbuf/intern/imageprocess.cc
@@ -83,7 +83,7 @@ void bicubic_interpolation_color(const ImBuf *in, uchar outI[4], float outF[4],
    BLI_bicubic_interpolation_fl(in->float_buffer.data, outF, in->x, in->y, 4, u, v);
  }
  else {
-    BLI_bicubic_interpolation_char(in->byte_buffer.data, outI, in->x, in->y, 4, u, v);
+    BLI_bicubic_interpolation_char(in->byte_buffer.data, outI, in->x, in->y, u, v);
  }
 }

@@ -108,20 +108,18 @@ void bicubic_interpolation(const ImBuf *in, ImBuf *out, float u, float v, int xo
 /** \name Bi-Linear Interpolation
 * \{ */

-void bilinear_interpolation_color_fl(
-    const ImBuf *in, uchar /*outI*/[4], float outF[4], float u, float v)
+void bilinear_interpolation_color_fl(const ImBuf *in, float outF[4], float u, float v)
 {
  BLI_assert(outF);
  BLI_assert(in->float_buffer.data);
  BLI_bilinear_interpolation_fl(in->float_buffer.data, outF, in->x, in->y, 4, u, v);
 }

-void bilinear_interpolation_color_char(
-    const ImBuf *in, uchar outI[4], float /*outF*/[4], float u, float v)
+void bilinear_interpolation_color_char(const ImBuf *in, uchar outI[4], float u, float v)
 {
  BLI_assert(outI);
  BLI_assert(in->byte_buffer.data);
-  BLI_bilinear_interpolation_char(in->byte_buffer.data, outI, in->x, in->y, 4, u, v);
+  BLI_bilinear_interpolation_char(in->byte_buffer.data, outI, in->x, in->y, u, v);
 }

 void bilinear_interpolation_color(const ImBuf *in, uchar outI[4], float outF[4], float u, float v)
@@ -130,7 +128,7 @@ void bilinear_interpolation_color(const ImBuf *in, uchar outI[4], float outF[4],
    BLI_bilinear_interpolation_fl(in->float_buffer.data, outF, in->x, in->y, 4, u, v);
  }
  else {
-    BLI_bilinear_interpolation_char(in->byte_buffer.data, outI, in->x, in->y, 4, u, v);
+    BLI_bilinear_interpolation_char(in->byte_buffer.data, outI, in->x, in->y, u, v);
  }
 }

--- a/source/blender/imbuf/intern/scaling.cc
+++ b/source/blender/imbuf/intern/scaling.cc
@@ -1762,7 +1762,7 @@ static void *do_scale_thread(void *data_v)

      if (data->byte_buffer) {
        uchar *pixel = data->byte_buffer + 4 * offset;
-        BLI_bilinear_interpolation_char(ibuf->byte_buffer.data, pixel, ibuf->x, ibuf->y, 4, u, v);
+        BLI_bilinear_interpolation_char(ibuf->byte_buffer.data, pixel, ibuf->x, ibuf->y, u, v);
      }

      if (data->float_buffer) {
--- a/source/blender/imbuf/intern/transform.cc
+++ b/source/blender/imbuf/intern/transform.cc
@@ -145,34 +145,16 @@ struct TransformUserData {
  }
 };

-/**
- * \brief Base class for source discarding.
- *
- * The class decides if a specific uv coordinate from the source buffer should be ignored.
- * This is used to mix multiple images over a single output buffer. Discarded pixels will
- * not change the output buffer.
- */
-class BaseDiscard {
- public:
-  virtual ~BaseDiscard() = default;
-
-  /**
-   * \brief Should the source pixel at the given uv coordinate be discarded.
-   */
-  virtual bool should_discard(const TransformUserData &user_data, const double2 &uv) = 0;
-};
-
 /**
 * \brief Crop uv-coordinates that are outside the user data src_crop rect.
 */
-class CropSource : public BaseDiscard {
- public:
+struct CropSource {
  /**
   * \brief Should the source pixel at the given uv coordinate be discarded.
   *
   * Uses user_data.src_crop to determine if the uv coordinate should be skipped.
   */
-  bool should_discard(const TransformUserData &user_data, const double2 &uv) override
+  static bool should_discard(const TransformUserData &user_data, const double2 &uv)
  {
    return uv.x < user_data.src_crop.xmin || uv.x >= user_data.src_crop.xmax ||
           uv.y < user_data.src_crop.ymin || uv.y >= user_data.src_crop.ymax;
@@ -182,14 +164,13 @@ class CropSource : public BaseDiscard {
 /**
 * \brief Discard that does not discard anything.
 */
-class NoDiscard : public BaseDiscard {
- public:
+struct NoDiscard {
  /**
   * \brief Should the source pixel at the given uv coordinate be discarded.
   *
   * Will never discard any pixels.
   */
-  bool should_discard(const TransformUserData & /*user_data*/, const double2 & /*uv*/) override
+  static bool should_discard(const TransformUserData & /*user_data*/, const double2 & /*uv*/)
  {
    return false;
  }
@@ -250,73 +231,19 @@ class PixelPointer {
 };

 /**
- * \brief Wrapping mode for the uv coordinates.
- *
- * Subclasses have the ability to change the UV coordinates when sampling the source buffer.
+ * \brief Wrap (repeat) a UV coordinate into the `[0, size)` range, floored to a whole pixel.
 */
-class BaseUVWrapping {
- public:
-  /**
-   * \brief modify the given u coordinate.
-   */
-  virtual double modify_u(const ImBuf *source_buffer, double u) = 0;
-
-  /**
-   * \brief modify the given v coordinate.
-   */
-  virtual double modify_v(const ImBuf *source_buffer, double v) = 0;
-
-  /**
-   * \brief modify the given uv coordinate.
-   */
-  double2 modify_uv(const ImBuf *source_buffer, const double2 &uv)
-  {
-    return double2(modify_u(source_buffer, uv.x), modify_v(source_buffer, uv.y));
-  }
-};
-
-/**
- * \brief UVWrapping method that does not modify the UV coordinates.
- */
-class PassThroughUV : public BaseUVWrapping {
- public:
-  double modify_u(const ImBuf * /*source_buffer*/, double u) override
-  {
-    return u;
-  }
-
-  double modify_v(const ImBuf * /*source_buffer*/, double v) override
-  {
-    return v;
-  }
-};
-
-/**
- * \brief UVWrapping method that wrap repeats the UV coordinates.
- */
-class WrapRepeatUV : public BaseUVWrapping {
- public:
-  double modify_u(const ImBuf *source_buffer, double u) override
-
-  {
-    int x = int(floor(u));
-    x = x % source_buffer->x;
+static float wrap_uv(float value, int size)
+{
+  int x = int(floorf(value));
+  if (UNLIKELY(x < 0 || x >= size)) {
+    x %= size;
    if (x < 0) {
-      x += source_buffer->x;
+      x += size;
    }
-    return x;
  }
-
-  double modify_v(const ImBuf *source_buffer, double v) override
-  {
-    int y = int(floor(v));
-    y = y % source_buffer->y;
-    if (y < 0) {
-      y += source_buffer->y;
-    }
-    return y;
-  }
-};
+  return x;
+}

 /* TODO: should we use math_vectors for this. */
 template<typename StorageType, int NumChannels>
@@ -369,14 +296,10 @@ template<
     */
    int NumChannels,
    /**
-     * \brief Wrapping method to perform
-     *
-     * Should be a subclass of BaseUVWrapper
+     * \brief Whether out-of-range UV coordinates wrap around (repeat) when sampling.
     */
-    typename UVWrapping>
+    bool UVWrapping>
 class Sampler {
-  UVWrapping uv_wrapper;
-
 public:
  using ChannelType = StorageType;
  static const int ChannelLen = NumChannels;
@@ -384,26 +307,29 @@ class Sampler {

  void sample(const ImBuf *source, const double2 &uv, SampleType &r_sample)
  {
+    float u = float(uv.x);
+    float v = float(uv.y);
+    if constexpr (UVWrapping) {
+      u = wrap_uv(u, source->x);
+      v = wrap_uv(v, source->y);
+    }
    if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<StorageType, float> &&
                  NumChannels == 4)
    {
-      const double2 wrapped_uv = uv_wrapper.modify_uv(source, uv);
-      bilinear_interpolation_color_fl(source, nullptr, r_sample.data(), UNPACK2(wrapped_uv));
+      bilinear_interpolation_color_fl(source, r_sample.data(), u, v);
    }
    else if constexpr (Filter == IMB_FILTER_NEAREST && std::is_same_v<StorageType, uchar> &&
                       NumChannels == 4)
    {
-      const double2 wrapped_uv = uv_wrapper.modify_uv(source, uv);
-      nearest_interpolation_color_char(source, r_sample.data(), nullptr, UNPACK2(wrapped_uv));
+      nearest_interpolation_color_char(source, r_sample.data(), nullptr, u, v);
    }
    else if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<StorageType, uchar> &&
                       NumChannels == 4)
    {
-      const double2 wrapped_uv = uv_wrapper.modify_uv(source, uv);
-      bilinear_interpolation_color_char(source, r_sample.data(), nullptr, UNPACK2(wrapped_uv));
+      bilinear_interpolation_color_char(source, r_sample.data(), u, v);
    }
    else if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<StorageType, float>) {
-      if constexpr (std::is_same_v<UVWrapping, WrapRepeatUV>) {
+      if constexpr (UVWrapping) {
        BLI_bilinear_interpolation_wrap_fl(source->float_buffer.data,
                                           r_sample.data(),
                                           source->x,
@@ -414,18 +340,12 @@ class Sampler {
                                           true);
      }
      else {
-        const double2 wrapped_uv = uv_wrapper.modify_uv(source, uv);
-        BLI_bilinear_interpolation_fl(source->float_buffer.data,
-                                      r_sample.data(),
-                                      source->x,
-                                      source->y,
-                                      NumChannels,
-                                      UNPACK2(wrapped_uv));
+        BLI_bilinear_interpolation_fl(
+            source->float_buffer.data, r_sample.data(), source->x, source->y, NumChannels, u, v);
      }
    }
    else if constexpr (Filter == IMB_FILTER_NEAREST && std::is_same_v<StorageType, float>) {
-      const double2 wrapped_uv = uv_wrapper.modify_uv(source, uv);
-      sample_nearest_float(source, wrapped_uv, r_sample);
+      sample_nearest_float(source, u, v, r_sample);
    }
    else {
      /* Unsupported sampler. */
@@ -434,13 +354,16 @@ class Sampler {
  }

 private:
-  void sample_nearest_float(const ImBuf *source, const double2 &uv, SampleType &r_sample)
+  void sample_nearest_float(const ImBuf *source,
+                            const float u,
+                            const float v,
+                            SampleType &r_sample)
  {
    BLI_STATIC_ASSERT(std::is_same_v<StorageType, float>);

    /* ImBuf in must have a valid rect or rect_float, assume this is already checked */
-    int x1 = int(uv.x);
-    int y1 = int(uv.y);
+    int x1 = int(u);
+    int y1 = int(v);

    /* Break when sample outside image is requested. */
    if (x1 < 0 || x1 >= source->x || y1 < 0 || y1 >= source->y) {
@@ -537,9 +460,7 @@ class ChannelConverter {
 */
 template<
    /**
-     * \brief Discard function to use.
-     *
-     * \attention Should be a subclass of BaseDiscard.
+     * \brief Discard policy type that provides a `should_discard` function.
     */
    typename Discard,

@@ -659,17 +580,17 @@ ScanlineThreadFunc get_scanline_function(const eIMBTransformMode mode)
    case IMB_TRANSFORM_MODE_REGULAR:
      return transform_scanline_function<
          ScanlineProcessor<NoDiscard,
-                            Sampler<Filter, StorageType, SourceNumChannels, PassThroughUV>,
+                            Sampler<Filter, StorageType, SourceNumChannels, false>,
                            PixelPointer<StorageType, DestinationNumChannels>>>;
    case IMB_TRANSFORM_MODE_CROP_SRC:
      return transform_scanline_function<
          ScanlineProcessor<CropSource,
-                            Sampler<Filter, StorageType, SourceNumChannels, PassThroughUV>,
+                            Sampler<Filter, StorageType, SourceNumChannels, false>,
                            PixelPointer<StorageType, DestinationNumChannels>>>;
    case IMB_TRANSFORM_MODE_WRAP_REPEAT:
      return transform_scanline_function<
          ScanlineProcessor<NoDiscard,
-                            Sampler<Filter, StorageType, SourceNumChannels, WrapRepeatUV>,
+                            Sampler<Filter, StorageType, SourceNumChannels, true>,
                            PixelPointer<StorageType, DestinationNumChannels>>>;
  }