test/source/blender/blenlib/intern/math_interp.cc

/* SPDX-FileCopyrightText: 2024 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

/** \file
 * \ingroup bli
 */

#include <cmath>
#include <cstring>

#include "BLI_math_base.h"
#include "BLI_math_base.hh"
#include "BLI_math_interp.hh"
#include "BLI_math_vector.h"
#include "BLI_math_vector_types.hh"
#include "BLI_simd.hh"

#include "BLI_strict_flags.h" /* Keep last. */

namespace blender::math {

enum class eCubicFilter {
  BSpline,
  Mitchell,
};

/* Calculate cubic filter coefficients, for samples at -1,0,+1,+2.
 * f is 0..1 offset from texel center in pixel space. */
template<enum eCubicFilter filter> static float4 cubic_filter_coefficients(float f)
{
  float f2 = f * f;
  float f3 = f2 * f;

  if constexpr (filter == eCubicFilter::BSpline) {
    /* Cubic B-Spline (Mitchell-Netravali filter with B=1, C=0 parameters). */
    float w3 = f3 * (1.0f / 6.0f);
    float w0 = -w3 + f2 * 0.5f - f * 0.5f + 1.0f / 6.0f;
    float w1 = f3 * 0.5f - f2 * 1.0f + 2.0f / 3.0f;
    float w2 = 1.0f - w0 - w1 - w3;
    return float4(w0, w1, w2, w3);
  }
  else if constexpr (filter == eCubicFilter::Mitchell) {
    /* Cubic Mitchell-Netravali filter with B=1/3, C=1/3 parameters. */
    float w0 = -7.0f / 18.0f * f3 + 5.0f / 6.0f * f2 - 0.5f * f + 1.0f / 18.0f;
    float w1 = 7.0f / 6.0f * f3 - 2.0f * f2 + 8.0f / 9.0f;
    float w2 = -7.0f / 6.0f * f3 + 3.0f / 2.0f * f2 + 0.5f * f + 1.0f / 18.0f;
    float w3 = 7.0f / 18.0f * f3 - 1.0f / 3.0f * f2;
    return float4(w0, w1, w2, w3);
  }
}

#if BLI_HAVE_SSE4
template<eCubicFilter filter>
BLI_INLINE void bicubic_interpolation_uchar_simd(
    const uchar *src_buffer, uchar *output, int width, int height, float u, float v)
{
  __m128 uv = _mm_set_ps(0, 0, v, u);
  __m128 uv_floor = _mm_floor_ps(uv);
  __m128i i_uv = _mm_cvttps_epi32(uv_floor);

  /* Sample area entirely outside image?
   * We check if any of (iu+1, iv+1, width, height) < (0, 0, iu+1, iv+1). */
  __m128i i_uv_1 = _mm_add_epi32(i_uv, _mm_set_epi32(0, 0, 1, 1));
  __m128i cmp_a = _mm_or_si128(i_uv_1, _mm_set_epi32(height, width, 0, 0));
  __m128i cmp_b = _mm_shuffle_epi32(i_uv_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i invalid = _mm_cmplt_epi32(cmp_a, cmp_b);
  if (_mm_movemask_ps(_mm_castsi128_ps(invalid)) != 0) {
    memset(output, 0, 4);
    return;
  }

  __m128 frac_uv = _mm_sub_ps(uv, uv_floor);

  /* Calculate pixel weights. */
  float4 wx = cubic_filter_coefficients<filter>(_mm_cvtss_f32(frac_uv));
  float4 wy = cubic_filter_coefficients<filter>(
      _mm_cvtss_f32(_mm_shuffle_ps(frac_uv, frac_uv, 1)));

  /* Read 4x4 source pixels and blend them. */
  __m128 out = _mm_setzero_ps();
  int iu = _mm_cvtsi128_si32(i_uv);
  int iv = _mm_cvtsi128_si32(_mm_shuffle_epi32(i_uv, 1));

  for (int n = 0; n < 4; n++) {
    int y1 = iv + n - 1;
    CLAMP(y1, 0, height - 1);
    for (int m = 0; m < 4; m++) {

      int x1 = iu + m - 1;
      CLAMP(x1, 0, width - 1);
      float w = wx[m] * wy[n];

      const uchar *data = src_buffer + (width * y1 + x1) * 4;
      /* Load 4 bytes and expand into 4-lane SIMD. */
      __m128i sample_i = _mm_castps_si128(_mm_load_ss((const float *)data));
      sample_i = _mm_unpacklo_epi8(sample_i, _mm_setzero_si128());
      sample_i = _mm_unpacklo_epi16(sample_i, _mm_setzero_si128());

      /* Accumulate into out with weight. */
      out = _mm_add_ps(out, _mm_mul_ps(_mm_cvtepi32_ps(sample_i), _mm_set1_ps(w)));
    }
  }

  /* Pack and write to destination: pack to 16 bit signed, then to 8 bit
   * unsigned, then write resulting 32-bit value. This will clamp
   * out of range values too. */
  out = _mm_add_ps(out, _mm_set1_ps(0.5f));
  __m128i rgba32 = _mm_cvttps_epi32(out);
  __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
  __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
  _mm_store_ss((float *)output, _mm_castsi128_ps(rgba8));
}
#endif /* BLI_HAVE_SSE4 */

template<typename T, eCubicFilter filter>
static void bicubic_interpolation(
    const T *src_buffer, T *output, int width, int height, int components, float u, float v)
{
  BLI_assert(src_buffer && output);

#if BLI_HAVE_SSE4
  if constexpr (std::is_same_v<T, uchar>) {
    if (components == 4) {
      bicubic_interpolation_uchar_simd<filter>(src_buffer, output, width, height, u, v);
      return;
    }
  }
#endif

  int iu = int(floor(u));
  int iv = int(floor(v));

  /* Sample area entirely outside image? */
  if (iu + 1 < 0 || iu > width - 1 || iv + 1 < 0 || iv > height - 1) {
    memset(output, 0, size_t(components) * sizeof(T));
    return;
  }

  float frac_u = u - float(iu);
  float frac_v = v - float(iv);

  float4 out{0.0f};

  /* Calculate pixel weights. */
  float4 wx = cubic_filter_coefficients<filter>(frac_u);
  float4 wy = cubic_filter_coefficients<filter>(frac_v);

  /* Read 4x4 source pixels and blend them. */
  for (int n = 0; n < 4; n++) {
    int y1 = iv + n - 1;
    CLAMP(y1, 0, height - 1);
    for (int m = 0; m < 4; m++) {

      int x1 = iu + m - 1;
      CLAMP(x1, 0, width - 1);
      float w = wx[m] * wy[n];

      const T *data = src_buffer + (width * y1 + x1) * components;

      if (components == 1) {
        out[0] += data[0] * w;
      }
      else if (components == 3) {
        out[0] += data[0] * w;
        out[1] += data[1] * w;
        out[2] += data[2] * w;
      }
      else {
        out[0] += data[0] * w;
        out[1] += data[1] * w;
        out[2] += data[2] * w;
        out[3] += data[3] * w;
      }
    }
  }

  /* Mitchell filter has negative lobes; prevent output from going out of range. */
  if constexpr (filter == eCubicFilter::Mitchell) {
    for (int i = 0; i < components; i++) {
      out[i] = math::max(out[i], 0.0f);
      if constexpr (std::is_same_v<T, uchar>) {
        out[i] = math::min(out[i], 255.0f);
      }
    }
  }

  /* Write result. */
  if constexpr (std::is_same_v<T, float>) {
    if (components == 1) {
      output[0] = out[0];
    }
    else if (components == 3) {
      copy_v3_v3(output, out);
    }
    else {
      copy_v4_v4(output, out);
    }
  }
  else {
    if (components == 1) {
      output[0] = uchar(out[0] + 0.5f);
    }
    else if (components == 3) {
      output[0] = uchar(out[0] + 0.5f);
      output[1] = uchar(out[1] + 0.5f);
      output[2] = uchar(out[2] + 0.5f);
    }
    else {
      output[0] = uchar(out[0] + 0.5f);
      output[1] = uchar(out[1] + 0.5f);
      output[2] = uchar(out[2] + 0.5f);
      output[3] = uchar(out[3] + 0.5f);
    }
  }
}

template<bool border>
BLI_INLINE void bilinear_fl_impl(const float *buffer,
                                 float *output,
                                 int width,
                                 int height,
                                 int components,
                                 float u,
                                 float v,
                                 bool wrap_x = false,
                                 bool wrap_y = false)
{
  BLI_assert(buffer && output);
  float a, b;
  float a_b, ma_b, a_mb, ma_mb;
  int y1, y2, x1, x2;

  if (wrap_x) {
    u = floored_fmod(u, float(width));
  }
  if (wrap_y) {
    v = floored_fmod(v, float(height));
  }

  float uf = floorf(u);
  float vf = floorf(v);

  x1 = int(uf);
  x2 = x1 + 1;
  y1 = int(vf);
  y2 = y1 + 1;

  const float *row1, *row2, *row3, *row4;
  const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  /* Check if +1 samples need wrapping, or we don't do wrapping then if
   * we are sampling completely outside the image. */
  if (wrap_x) {
    if (x2 >= width) {
      x2 = 0;
    }
  }
  else if (border && (x2 < 0 || x1 >= width)) {
    copy_vn_fl(output, components, 0.0f);
    return;
  }
  if (wrap_y) {
    if (y2 >= height) {
      y2 = 0;
    }
  }
  else if (border && (y2 < 0 || y1 >= height)) {
    copy_vn_fl(output, components, 0.0f);
    return;
  }

  /* Sample locations. */
  if constexpr (border) {
    row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x1) * components;
    row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x1) * components;
    row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x2) * components;
    row4 = (x2 > width - 1 || y2 > height - 1) ? empty :
                                                 buffer + (int64_t(width) * y2 + x2) * components;
  }
  else {
    x1 = blender::math::clamp(x1, 0, width - 1);
    x2 = blender::math::clamp(x2, 0, width - 1);
    y1 = blender::math::clamp(y1, 0, height - 1);
    y2 = blender::math::clamp(y2, 0, height - 1);
    row1 = buffer + (int64_t(width) * y1 + x1) * components;
    row2 = buffer + (int64_t(width) * y2 + x1) * components;
    row3 = buffer + (int64_t(width) * y1 + x2) * components;
    row4 = buffer + (int64_t(width) * y2 + x2) * components;
  }

  a = u - uf;
  b = v - vf;
  a_b = a * b;
  ma_b = (1.0f - a) * b;
  a_mb = a * (1.0f - b);
  ma_mb = (1.0f - a) * (1.0f - b);

  if (components == 1) {
    output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
  }
  else if (components == 3) {
    output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
    output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
    output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
  }
  else {
#if BLI_HAVE_SSE2
    __m128 rgba1 = _mm_loadu_ps(row1);
    __m128 rgba2 = _mm_loadu_ps(row2);
    __m128 rgba3 = _mm_loadu_ps(row3);
    __m128 rgba4 = _mm_loadu_ps(row4);
    rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
    rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
    rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
    rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
    __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
    __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
    __m128 rgba = _mm_add_ps(rgba13, rgba24);
    _mm_storeu_ps(output, rgba);
#else
    output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
    output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
    output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
    output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
#endif
  }
}

template<bool border>
BLI_INLINE uchar4 bilinear_byte_impl(const uchar *buffer, int width, int height, float u, float v)
{
  BLI_assert(buffer);
  uchar4 res;

#if BLI_HAVE_SSE4
  __m128 uvuv = _mm_set_ps(v, u, v, u);
  __m128 uvuv_floor = _mm_floor_ps(uvuv);

  /* x1, y1, x2, y2 */
  __m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
  /* Check whether any of the coordinates are outside of the image. */
  __m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
                                       _mm_set1_epi32(1));

  /* Samples 1,2,3,4 will be in this order: x1y1, x1y2, x2y1, x2y2. */
  __m128i x1234, y1234, invalid_1234;

  if constexpr (border) {
    /* Blend black colors for samples right outside the image: figure out
     * which of the 4 samples were outside, set their coordinates to zero
     * and later on put black color into their place. */
    __m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
    __m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
    __m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);

    /* Samples 1,2,3,4 are in this order: x1y1, x1y2, x2y1, x2y2 */
    x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
    y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
    invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
                                _mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
    /* Set x & y to zero for invalid samples. */
    x1234 = _mm_andnot_si128(invalid_1234, x1234);
    y1234 = _mm_andnot_si128(invalid_1234, y1234);
  }
  else {
    /* Clamp samples to image edges. */
    __m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
    xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
    x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
    y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
  }

  /* Read the four sample values. Do address calculations in C, since SSE
   * before 4.1 makes it very cumbersome to do full integer multiplies. */
  int xcoord[4];
  int ycoord[4];
  _mm_storeu_ps((float *)xcoord, _mm_castsi128_ps(x1234));
  _mm_storeu_ps((float *)ycoord, _mm_castsi128_ps(y1234));
  int sample1 = ((const int *)buffer)[ycoord[0] * int64_t(width) + xcoord[0]];
  int sample2 = ((const int *)buffer)[ycoord[1] * int64_t(width) + xcoord[1]];
  int sample3 = ((const int *)buffer)[ycoord[2] * int64_t(width) + xcoord[2]];
  int sample4 = ((const int *)buffer)[ycoord[3] * int64_t(width) + xcoord[3]];
  __m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
  if constexpr (border) {
    /* Set samples to black for the ones that were actually invalid. */
    samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
  }

  /* Expand samples from packed 8-bit RGBA to full floats:
   * spread to 16 bit values. */
  __m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
  __m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
  /* Spread to 32 bit values and convert to float. */
  __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
  __m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
  __m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
  __m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));

  /* Calculate interpolation factors: (1-a)*(1-b), (1-a)*b, a*(1-b), a*b */
  __m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
  __m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
  __m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
  __m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
                              _mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));

  /* Blend the samples. */
  rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
  rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
  rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
  rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
  __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
  __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
  __m128 rgba = _mm_add_ps(rgba13, rgba24);
  rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
  /* Pack and write to destination: pack to 16 bit signed, then to 8 bit
   * unsigned, then write resulting 32-bit value. */
  __m128i rgba32 = _mm_cvttps_epi32(rgba);
  __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
  __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
  _mm_store_ss((float *)&res, _mm_castsi128_ps(rgba8));

#else

  float uf = floorf(u);
  float vf = floorf(v);

  int x1 = int(uf);
  int x2 = x1 + 1;
  int y1 = int(vf);
  int y2 = y1 + 1;

  /* Completely outside of the image in bordered mode? */
  if (border && (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height)) {
    return uchar4(0);
  }

  /* Sample locations. */
  const uchar *row1, *row2, *row3, *row4;
  uchar empty[4] = {0, 0, 0, 0};
  if constexpr (border) {
    row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x1) * 4;
    row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x1) * 4;
    row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x2) * 4;
    row4 = (x2 > width - 1 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x2) * 4;
  }
  else {
    x1 = blender::math::clamp(x1, 0, width - 1);
    x2 = blender::math::clamp(x2, 0, width - 1);
    y1 = blender::math::clamp(y1, 0, height - 1);
    y2 = blender::math::clamp(y2, 0, height - 1);
    row1 = buffer + (int64_t(width) * y1 + x1) * 4;
    row2 = buffer + (int64_t(width) * y2 + x1) * 4;
    row3 = buffer + (int64_t(width) * y1 + x2) * 4;
    row4 = buffer + (int64_t(width) * y2 + x2) * 4;
  }

  float a = u - uf;
  float b = v - vf;
  float a_b = a * b;
  float ma_b = (1.0f - a) * b;
  float a_mb = a * (1.0f - b);
  float ma_mb = (1.0f - a) * (1.0f - b);

  res.x = uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
  res.y = uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
  res.z = uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
  res.w = uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
#endif

  return res;
}

uchar4 interpolate_bilinear_border_byte(
    const uchar *buffer, int width, int height, float u, float v)
{
  return bilinear_byte_impl<true>(buffer, width, height, u, v);
}

uchar4 interpolate_bilinear_byte(const uchar *buffer, int width, int height, float u, float v)
{
  return bilinear_byte_impl<false>(buffer, width, height, u, v);
}

float4 interpolate_bilinear_border_fl(const float *buffer, int width, int height, float u, float v)
{
  float4 res;
  bilinear_fl_impl<true>(buffer, res, width, height, 4, u, v);
  return res;
}

void interpolate_bilinear_border_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v)
{
  bilinear_fl_impl<true>(buffer, output, width, height, components, u, v);
}

float4 interpolate_bilinear_fl(const float *buffer, int width, int height, float u, float v)
{
  float4 res;
  bilinear_fl_impl<false>(buffer, res, width, height, 4, u, v);
  return res;
}

void interpolate_bilinear_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v)
{
  bilinear_fl_impl<false>(buffer, output, width, height, components, u, v);
}

void interpolate_bilinear_wrap_fl(const float *buffer,
                                  float *output,
                                  int width,
                                  int height,
                                  int components,
                                  float u,
                                  float v,
                                  bool wrap_x,
                                  bool wrap_y)
{
  bilinear_fl_impl<false>(buffer, output, width, height, components, u, v, wrap_x, wrap_y);
}

uchar4 interpolate_bilinear_wrap_byte(const uchar *buffer, int width, int height, float u, float v)
{
  u = floored_fmod(u, float(width));
  v = floored_fmod(v, float(height));
  float uf = floorf(u);
  float vf = floorf(v);

  int x1 = int(uf);
  int x2 = x1 + 1;
  int y1 = int(vf);
  int y2 = y1 + 1;

  /* Wrap interpolation pixels if needed. */
  BLI_assert(x1 >= 0 && x1 < width && y1 >= 0 && y1 < height);
  if (x2 >= width) {
    x2 = 0;
  }
  if (y2 >= height) {
    y2 = 0;
  }

  float a = u - uf;
  float b = v - vf;
  float a_b = a * b;
  float ma_b = (1.0f - a) * b;
  float a_mb = a * (1.0f - b);
  float ma_mb = (1.0f - a) * (1.0f - b);

  /* Blend samples. */
  const uchar *row1 = buffer + (int64_t(width) * y1 + x1) * 4;
  const uchar *row2 = buffer + (int64_t(width) * y2 + x1) * 4;
  const uchar *row3 = buffer + (int64_t(width) * y1 + x2) * 4;
  const uchar *row4 = buffer + (int64_t(width) * y2 + x2) * 4;

  uchar4 res;
  res.x = uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
  res.y = uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
  res.z = uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
  res.w = uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
  return res;
}

float4 interpolate_bilinear_wrap_fl(const float *buffer, int width, int height, float u, float v)
{
  float4 res;
  bilinear_fl_impl<false>(buffer, res, width, height, 4, u, v, true, true);
  return res;
}

uchar4 interpolate_cubic_bspline_byte(const uchar *buffer, int width, int height, float u, float v)
{
  uchar4 res;
  bicubic_interpolation<uchar, eCubicFilter::BSpline>(buffer, res, width, height, 4, u, v);
  return res;
}

float4 interpolate_cubic_bspline_fl(const float *buffer, int width, int height, float u, float v)
{
  float4 res;
  bicubic_interpolation<float, eCubicFilter::BSpline>(buffer, res, width, height, 4, u, v);
  return res;
}

void interpolate_cubic_bspline_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v)
{
  bicubic_interpolation<float, eCubicFilter::BSpline>(
      buffer, output, width, height, components, u, v);
}

uchar4 interpolate_cubic_mitchell_byte(
    const uchar *buffer, int width, int height, float u, float v)
{
  uchar4 res;
  bicubic_interpolation<uchar, eCubicFilter::Mitchell>(buffer, res, width, height, 4, u, v);
  return res;
}

float4 interpolate_cubic_mitchell_fl(const float *buffer, int width, int height, float u, float v)
{
  float4 res;
  bicubic_interpolation<float, eCubicFilter::Mitchell>(buffer, res, width, height, 4, u, v);
  return res;
}

void interpolate_cubic_mitchell_fl(
    const float *buffer, float *output, int width, int height, int components, float u, float v)
{
  bicubic_interpolation<float, eCubicFilter::Mitchell>(
      buffer, output, width, height, components, u, v);
}

}  // namespace blender::math

/**************************************************************************
 * Filtering method based on
 * "Creating raster omnimax images from multiple perspective views
 * using the elliptical weighted average filter"
 * by Ned Greene and Paul S. Heckbert (1986)
 ***************************************************************************/

/* Table of `(exp(ar) - exp(a)) / (1 - exp(a))` for `r` in range [0, 1] and `a = -2`.
 * used instead of actual gaussian,
 * otherwise at high texture magnifications circular artifacts are visible. */
#define EWA_MAXIDX 255
const float EWA_WTS[EWA_MAXIDX + 1] = {
    1.0f,        0.990965f,   0.982f,      0.973105f,   0.96428f,    0.955524f,   0.946836f,
    0.938216f,   0.929664f,   0.921178f,   0.912759f,   0.904405f,   0.896117f,   0.887893f,
    0.879734f,   0.871638f,   0.863605f,   0.855636f,   0.847728f,   0.839883f,   0.832098f,
    0.824375f,   0.816712f,   0.809108f,   0.801564f,   0.794079f,   0.786653f,   0.779284f,
    0.771974f,   0.76472f,    0.757523f,   0.750382f,   0.743297f,   0.736267f,   0.729292f,
    0.722372f,   0.715505f,   0.708693f,   0.701933f,   0.695227f,   0.688572f,   0.68197f,
    0.67542f,    0.66892f,    0.662471f,   0.656073f,   0.649725f,   0.643426f,   0.637176f,
    0.630976f,   0.624824f,   0.618719f,   0.612663f,   0.606654f,   0.600691f,   0.594776f,
    0.588906f,   0.583083f,   0.577305f,   0.571572f,   0.565883f,   0.56024f,    0.55464f,
    0.549084f,   0.543572f,   0.538102f,   0.532676f,   0.527291f,   0.521949f,   0.516649f,
    0.511389f,   0.506171f,   0.500994f,   0.495857f,   0.490761f,   0.485704f,   0.480687f,
    0.475709f,   0.470769f,   0.465869f,   0.461006f,   0.456182f,   0.451395f,   0.446646f,
    0.441934f,   0.437258f,   0.432619f,   0.428017f,   0.42345f,    0.418919f,   0.414424f,
    0.409963f,   0.405538f,   0.401147f,   0.39679f,    0.392467f,   0.388178f,   0.383923f,
    0.379701f,   0.375511f,   0.371355f,   0.367231f,   0.363139f,   0.359079f,   0.355051f,
    0.351055f,   0.347089f,   0.343155f,   0.339251f,   0.335378f,   0.331535f,   0.327722f,
    0.323939f,   0.320186f,   0.316461f,   0.312766f,   0.3091f,     0.305462f,   0.301853f,
    0.298272f,   0.294719f,   0.291194f,   0.287696f,   0.284226f,   0.280782f,   0.277366f,
    0.273976f,   0.270613f,   0.267276f,   0.263965f,   0.26068f,    0.257421f,   0.254187f,
    0.250979f,   0.247795f,   0.244636f,   0.241502f,   0.238393f,   0.235308f,   0.232246f,
    0.229209f,   0.226196f,   0.223206f,   0.220239f,   0.217296f,   0.214375f,   0.211478f,
    0.208603f,   0.20575f,    0.20292f,    0.200112f,   0.197326f,   0.194562f,   0.191819f,
    0.189097f,   0.186397f,   0.183718f,   0.18106f,    0.178423f,   0.175806f,   0.17321f,
    0.170634f,   0.168078f,   0.165542f,   0.163026f,   0.16053f,    0.158053f,   0.155595f,
    0.153157f,   0.150738f,   0.148337f,   0.145955f,   0.143592f,   0.141248f,   0.138921f,
    0.136613f,   0.134323f,   0.132051f,   0.129797f,   0.12756f,    0.125341f,   0.123139f,
    0.120954f,   0.118786f,   0.116635f,   0.114501f,   0.112384f,   0.110283f,   0.108199f,
    0.106131f,   0.104079f,   0.102043f,   0.100023f,   0.0980186f,  0.09603f,    0.094057f,
    0.0920994f,  0.0901571f,  0.08823f,    0.0863179f,  0.0844208f,  0.0825384f,  0.0806708f,
    0.0788178f,  0.0769792f,  0.0751551f,  0.0733451f,  0.0715493f,  0.0697676f,  0.0679997f,
    0.0662457f,  0.0645054f,  0.0627786f,  0.0610654f,  0.0593655f,  0.0576789f,  0.0560055f,
    0.0543452f,  0.0526979f,  0.0510634f,  0.0494416f,  0.0478326f,  0.0462361f,  0.0446521f,
    0.0430805f,  0.0415211f,  0.039974f,   0.0384389f,  0.0369158f,  0.0354046f,  0.0339052f,
    0.0324175f,  0.0309415f,  0.029477f,   0.0280239f,  0.0265822f,  0.0251517f,  0.0237324f,
    0.0223242f,  0.020927f,   0.0195408f,  0.0181653f,  0.0168006f,  0.0154466f,  0.0141031f,
    0.0127701f,  0.0114476f,  0.0101354f,  0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
    0.00372644f, 0.00247454f, 0.00123242f, 0.0f,
};

static void radangle2imp(float a2, float b2, float th, float *A, float *B, float *C, float *F)
{
  float ct2 = cosf(th);
  const float st2 = 1.0f - ct2 * ct2; /* <- sin(th)^2 */
  ct2 *= ct2;
  *A = a2 * st2 + b2 * ct2;
  *B = (b2 - a2) * sinf(2.0f * th);
  *C = a2 * ct2 + b2 * st2;
  *F = a2 * b2;
}

void BLI_ewa_imp2radangle(
    float A, float B, float C, float F, float *a, float *b, float *th, float *ecc)
{
  /* NOTE: all tests here are done to make sure possible overflows are hopefully minimized. */

  if (F <= 1e-5f) { /* use arbitrary major radius, zero minor, infinite eccentricity */
    *a = sqrtf(A > C ? A : C);
    *b = 0.0f;
    *ecc = 1e10f;
    *th = 0.5f * (atan2f(B, A - C) + float(M_PI));
  }
  else {
    const float AmC = A - C, ApC = A + C, F2 = F * 2.0f;
    const float r = sqrtf(AmC * AmC + B * B);
    float d = ApC - r;
    *a = (d <= 0.0f) ? sqrtf(A > C ? A : C) : sqrtf(F2 / d);
    d = ApC + r;
    if (d <= 0.0f) {
      *b = 0.0f;
      *ecc = 1e10f;
    }
    else {
      *b = sqrtf(F2 / d);
      *ecc = *a / *b;
    }
    /* Increase theta by `0.5 * pi` (angle of major axis). */
    *th = 0.5f * (atan2f(B, AmC) + float(M_PI));
  }
}

void BLI_ewa_filter(const int width,
                    const int height,
                    const bool intpol,
                    const bool use_alpha,
                    const float uv[2],
                    const float du[2],
                    const float dv[2],
                    ewa_filter_read_pixel_cb read_pixel_cb,
                    void *userdata,
                    float result[4])
{
  /* Scaling `dxt` / `dyt` by full resolution can cause overflow because of huge A/B/C and esp.
   * F values, scaling by aspect ratio alone does the opposite, so try something in between
   * instead. */
  const float ff2 = float(width), ff = sqrtf(ff2), q = float(height) / ff;
  const float Ux = du[0] * ff, Vx = du[1] * q, Uy = dv[0] * ff, Vy = dv[1] * q;
  float A = Vx * Vx + Vy * Vy;
  float B = -2.0f * (Ux * Vx + Uy * Vy);
  float C = Ux * Ux + Uy * Uy;
  float F = A * C - B * B * 0.25f;
  float a, b, th, ecc, a2, b2, ue, ve, U0, V0, DDQ, U, ac1, ac2, BU, d;
  int u, v, u1, u2, v1, v2;

  /* The so-called 'high' quality EWA method simply adds a constant of 1 to both A & C,
   * so the ellipse always covers at least some texels. But since the filter is now always larger,
   * it also means that everywhere else it's also more blurry then ideally should be the case.
   * So instead here the ellipse radii are modified instead whenever either is too low.
   * Use a different radius based on interpolation switch,
   * just enough to anti-alias when interpolation is off,
   * and slightly larger to make result a bit smoother than bilinear interpolation when
   * interpolation is on (minimum values: `const float rmin = intpol ? 1.0f : 0.5f;`) */
  const float rmin = (intpol ? 1.5625f : 0.765625f) / ff2;
  BLI_ewa_imp2radangle(A, B, C, F, &a, &b, &th, &ecc);
  if ((b2 = b * b) < rmin) {
    if ((a2 = a * a) < rmin) {
      UNUSED_VARS(a2, b2);
      B = 0.0f;
      A = C = rmin;
      F = A * C;
    }
    else {
      b2 = rmin;
      radangle2imp(a2, b2, th, &A, &B, &C, &F);
    }
  }

  ue = ff * sqrtf(C);
  ve = ff * sqrtf(A);
  d = float(EWA_MAXIDX + 1) / (F * ff2);
  A *= d;
  B *= d;
  C *= d;

  U0 = uv[0] * float(width);
  V0 = uv[1] * float(height);
  u1 = int(floorf(U0 - ue));
  u2 = int(ceilf(U0 + ue));
  v1 = int(floorf(V0 - ve));
  v2 = int(ceilf(V0 + ve));

  /* sane clamping to avoid unnecessarily huge loops */
  /* NOTE: if eccentricity gets clamped (see above),
   * the ue/ve limits can also be lowered accordingly
   */
  if (U0 - float(u1) > EWA_MAXIDX) {
    u1 = int(U0) - EWA_MAXIDX;
  }
  if (float(u2) - U0 > EWA_MAXIDX) {
    u2 = int(U0) + EWA_MAXIDX;
  }
  if (V0 - float(v1) > EWA_MAXIDX) {
    v1 = int(V0) - EWA_MAXIDX;
  }
  if (float(v2) - V0 > EWA_MAXIDX) {
    v2 = int(V0) + EWA_MAXIDX;
  }

  /* Early output check for cases the whole region is outside of the buffer. */
  if ((u2 < 0 || u1 >= width) || (v2 < 0 || v1 >= height)) {
    zero_v4(result);
    return;
  }

  U0 -= 0.5f;
  V0 -= 0.5f;
  DDQ = 2.0f * A;
  U = float(u1) - U0;
  ac1 = A * (2.0f * U + 1.0f);
  ac2 = A * U * U;
  BU = B * U;

  d = 0.0f;
  zero_v4(result);
  for (v = v1; v <= v2; v++) {
    const float V = float(v) - V0;
    float DQ = ac1 + B * V;
    float Q = (C * V + BU) * V + ac2;
    for (u = u1; u <= u2; u++) {
      if (Q < float(EWA_MAXIDX + 1)) {
        float tc[4];
        const float wt = EWA_WTS[(Q < 0.0f) ? 0 : uint(Q)];
        read_pixel_cb(userdata, u, v, tc);
        madd_v3_v3fl(result, tc, wt);
        result[3] += use_alpha ? tc[3] * wt : 0.0f;
        d += wt;
      }
      Q += DQ;
      DQ += DDQ;
    }
  }

  /* `d` should hopefully never be zero anymore. */
  d = 1.0f / d;
  mul_v3_fl(result, d);
  /* Clipping can be ignored if alpha used, `texr->trgba[3]` already includes filtered edge. */
  result[3] = use_alpha ? result[3] * d : 1.0f;
}