source/blender/blenlib/intern/math_interp.c
ImBuf: optimize IMB_transform (commit 1e0bf33b00 by Aras Pranckevicius)
IMB_transform is used by Sequencer (and other places) to do image
translation/rotation/scale on the CPU. This PR speeds up parts of it,
particularly when bilinear filtering is used. No behavior changes are
expected.

- Don't use virtual function calls inside the inner loop. The code was using
  class hierarchies with virtual calls just to do the equivalent of "outside
  of image? ignore" and "wrap UV coordinates or not?" decisions. Make those
  decisions with non-virtual function based code.
- Simplify the pixel sampling functions to only do the work actually needed
  within the Blender codebase. For example, bilinear sampling of uchar
  images always uses 4 RGBA channels and never does the "UV wrap" logic.
- Bilinear interpolation, uchar: completely branchless SIMD code now.
- Bilinear interpolation, float: 2x floor() calls instead of 4x floor() +
  2x ceil() (sketched below), and final sample blending is done with SIMD.
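
The gist of the float-path change, as a minimal sketch of what
bilinear_interpolation_fl below does: the two sample columns/rows are always
adjacent, so the ceilings follow from the floors and two floorf() calls
suffice:

    float uf = floorf(u), vf = floorf(v);
    int x1 = (int)uf, x2 = x1 + 1; /* ceil(u), except at integer u where its weight is 0 */
    int y1 = (int)vf, y2 = y1 + 1;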

Playback framerate in the Sequencer at 4K UHD resolution, with two image
strips that need a transform:

- Windows Ryzen 5950X: 18.7fps -> 26.2fps (IMB_transform time per frame goes
  26.3ms -> 11.2ms)
- Mac M1 Max: 27.3fps -> 31.4fps

At that point IMB_transform is no longer the slowest part of playback; most
of the time goes to sequencer effect application and similar steps.

Note: the amount of _actual code_ got a bit smaller, but I've added 100 lines
of unit tests in BLI_math_interp_test.cc; previously the bilinear interpolation
functions were only tested very indirectly, by the CPU compositor template
image tests.

Pull Request: https://projects.blender.org/blender/blender/pulls/115653
2023-12-14 15:10:30 +01:00


/* SPDX-FileCopyrightText: 2012 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup bli
*/
#include <math.h>
#include <string.h>
#include "BLI_math_base.h"
#include "BLI_math_interp.h"
#include "BLI_math_vector.h"
#include "BLI_simd.h"
#include "BLI_strict_flags.h"
#if BLI_HAVE_SSE2 && defined(__SSE4_1__)
# include <smmintrin.h> /* _mm_floor_ps */
#endif
/**************************************************************************
* INTERPOLATIONS
*
* Reference and docs:
* http://wiki.blender.org/index.php/User:Damiles#Interpolations_Algorithms
***************************************************************************/
/* BICUBIC Interpolation functions
* More info: http://wiki.blender.org/index.php/User:Damiles#Bicubic_pixel_interpolation
* function assumes out to be zero'ed, only does RGBA */
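/* P() below is the uniform cubic B-spline kernel (support [-2, 2], value 2/3
 * at 0 and 1/6 at +/-1), written with one-sided cubes clamped at zero. */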
static float P(float k)
{
float p1, p2, p3, p4;
p1 = max_ff(k + 2.0f, 0.0f);
p2 = max_ff(k + 1.0f, 0.0f);
p3 = max_ff(k, 0.0f);
p4 = max_ff(k - 1.0f, 0.0f);
return (float)(1.0f / 6.0f) *
(p1 * p1 * p1 - 4.0f * p2 * p2 * p2 + 6.0f * p3 * p3 * p3 - 4.0f * p4 * p4 * p4);
}
#if 0
/* older, slower function, works the same as above */
static float P(float k)
{
return (float)(1.0f / 6.0f) *
(pow(MAX2(k + 2.0f, 0), 3.0f) - 4.0f * pow(MAX2(k + 1.0f, 0), 3.0f) +
6.0f * pow(MAX2(k, 0), 3.0f) - 4.0f * pow(MAX2(k - 1.0f, 0), 3.0f));
}
#endif
static void vector_from_float(const float *data, float vector[4], int components)
{
if (components == 1) {
vector[0] = data[0];
}
else if (components == 3) {
copy_v3_v3(vector, data);
}
else {
copy_v4_v4(vector, data);
}
}
static void vector_from_byte(const uchar *data, float vector[4], int components)
{
if (components == 1) {
vector[0] = data[0];
}
else if (components == 3) {
vector[0] = data[0];
vector[1] = data[1];
vector[2] = data[2];
}
else {
vector[0] = data[0];
vector[1] = data[1];
vector[2] = data[2];
vector[3] = data[3];
}
}
/* BICUBIC INTERPOLATION */
BLI_INLINE void bicubic_interpolation(const uchar *byte_buffer,
const float *float_buffer,
uchar *byte_output,
float *float_output,
int width,
int height,
int components,
float u,
float v)
{
int i, j, n, m, x1, y1;
float a, b, w, wx, wy[4], out[4];
/* sample area entirely outside image? */
if (ceil(u) < 0 || floor(u) > width - 1 || ceil(v) < 0 || floor(v) > height - 1) {
if (float_output) {
copy_vn_fl(float_output, components, 0.0f);
}
if (byte_output) {
copy_vn_uchar(byte_output, components, 0);
}
return;
}
i = (int)floor(u);
j = (int)floor(v);
a = u - (float)i;
b = v - (float)j;
zero_v4(out);
/* Optimized and not so easy to read: pre-calculate the y weights
 * to avoid calling P() multiple times per pixel. */
wy[0] = P(b - (-1));
wy[1] = P(b - 0);
wy[2] = P(b - 1);
wy[3] = P(b - 2);
for (n = -1; n <= 2; n++) {
x1 = i + n;
CLAMP(x1, 0, width - 1);
wx = P((float)n - a);
for (m = -1; m <= 2; m++) {
float data[4];
y1 = j + m;
CLAMP(y1, 0, height - 1);
/* Normally we could do this:
 * `w = P(n-a) * P(b-m);`
 * except that would call `P()` 16 times per pixel, and therefore `pow()` 64 times;
 * better to pre-calculate these. */
w = wx * wy[m + 1];
if (float_output) {
const float *float_data = float_buffer + width * y1 * components + components * x1;
vector_from_float(float_data, data, components);
}
else {
const uchar *byte_data = byte_buffer + width * y1 * components + components * x1;
vector_from_byte(byte_data, data, components);
}
if (components == 1) {
out[0] += data[0] * w;
}
else if (components == 3) {
out[0] += data[0] * w;
out[1] += data[1] * w;
out[2] += data[2] * w;
}
else {
out[0] += data[0] * w;
out[1] += data[1] * w;
out[2] += data[2] * w;
out[3] += data[3] * w;
}
}
}
/* Done with optimized part */
#if 0
/* older, slower function, works the same as above */
for (n = -1; n <= 2; n++) {
for (m = -1; m <= 2; m++) {
x1 = i + n;
y1 = j + m;
if (x1 > 0 && x1 < width && y1 > 0 && y1 < height) {
float data[4];
if (float_output) {
const float *float_data = float_buffer + width * y1 * components + components * x1;
vector_from_float(float_data, data, components);
}
else {
const uchar *byte_data = byte_buffer + width * y1 * components + components * x1;
vector_from_byte(byte_data, data, components);
}
if (components == 1) {
out[0] += data[0] * P(n - a) * P(b - m);
}
else if (components == 3) {
out[0] += data[0] * P(n - a) * P(b - m);
out[1] += data[1] * P(n - a) * P(b - m);
out[2] += data[2] * P(n - a) * P(b - m);
}
else {
out[0] += data[0] * P(n - a) * P(b - m);
out[1] += data[1] * P(n - a) * P(b - m);
out[2] += data[2] * P(n - a) * P(b - m);
out[3] += data[3] * P(n - a) * P(b - m);
}
}
}
}
#endif
if (float_output) {
if (components == 1) {
float_output[0] = out[0];
}
else if (components == 3) {
copy_v3_v3(float_output, out);
}
else {
copy_v4_v4(float_output, out);
}
}
else {
if (components == 1) {
byte_output[0] = (uchar)(out[0] + 0.5f);
}
else if (components == 3) {
byte_output[0] = (uchar)(out[0] + 0.5f);
byte_output[1] = (uchar)(out[1] + 0.5f);
byte_output[2] = (uchar)(out[2] + 0.5f);
}
else {
byte_output[0] = (uchar)(out[0] + 0.5f);
byte_output[1] = (uchar)(out[1] + 0.5f);
byte_output[2] = (uchar)(out[2] + 0.5f);
byte_output[3] = (uchar)(out[3] + 0.5f);
}
}
}
void BLI_bicubic_interpolation_fl(
const float *buffer, float *output, int width, int height, int components, float u, float v)
{
bicubic_interpolation(NULL, buffer, NULL, output, width, height, components, u, v);
}
void BLI_bicubic_interpolation_char(
const uchar *buffer, uchar *output, int width, int height, float u, float v)
{
bicubic_interpolation(buffer, NULL, output, NULL, width, height, 4, u, v);
}
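/* A minimal usage sketch (not part of the original file): bicubic-sample a
 * hypothetical 2x2 RGBA byte image at its center. Coordinates are in pixel
 * space, so (0.5f, 0.5f) lies between all four texels. */
#if 0
static void bicubic_usage_sketch(void)
{
  const uchar image[2 * 2 * 4] = {
      255, 0,   0,   255, /* red */
      0,   255, 0,   255, /* green */
      0,   0,   255, 255, /* blue */
      255, 255, 255, 255, /* white */
  };
  uchar sample[4];
  BLI_bicubic_interpolation_char(image, sample, 2, 2, 0.5f, 0.5f);
}
#endif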
/* BILINEAR INTERPOLATION */
BLI_INLINE void bilinear_interpolation_fl(const float *float_buffer,
float *float_output,
int width,
int height,
int components,
float u,
float v,
bool wrap_x,
bool wrap_y)
{
float a, b;
float a_b, ma_b, a_mb, ma_mb;
int y1, y2, x1, x2;
float uf = floorf(u);
float vf = floorf(v);
x1 = (int)uf;
x2 = x1 + 1;
y1 = (int)vf;
y2 = y1 + 1;
const float *row1, *row2, *row3, *row4;
const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};
/* Pixel coordinates must already be wrapped; only values right at the boundaries may still flip to the other side. */
if (wrap_x) {
if (x1 < 0) {
x1 = width - 1;
}
if (x2 >= width) {
x2 = 0;
}
}
else if (x2 < 0 || x1 >= width) {
copy_vn_fl(float_output, components, 0.0f);
return;
}
if (wrap_y) {
if (y1 < 0) {
y1 = height - 1;
}
if (y2 >= height) {
y2 = 0;
}
}
else if (y2 < 0 || y1 >= height) {
copy_vn_fl(float_output, components, 0.0f);
return;
}
/* Sample, including texels outside the edges of the image. */
if (x1 < 0 || y1 < 0) {
row1 = empty;
}
else {
row1 = float_buffer + width * y1 * components + components * x1;
}
if (x1 < 0 || y2 > height - 1) {
row2 = empty;
}
else {
row2 = float_buffer + width * y2 * components + components * x1;
}
if (x2 > width - 1 || y1 < 0) {
row3 = empty;
}
else {
row3 = float_buffer + width * y1 * components + components * x2;
}
if (x2 > width - 1 || y2 > height - 1) {
row4 = empty;
}
else {
row4 = float_buffer + width * y2 * components + components * x2;
}
a = u - uf;
b = v - vf;
a_b = a * b;
ma_b = (1.0f - a) * b;
a_mb = a * (1.0f - b);
ma_mb = (1.0f - a) * (1.0f - b);
if (components == 1) {
float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
}
else if (components == 3) {
float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
float_output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
float_output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
}
else {
#if BLI_HAVE_SSE2
__m128 rgba1 = _mm_loadu_ps(row1);
__m128 rgba2 = _mm_loadu_ps(row2);
__m128 rgba3 = _mm_loadu_ps(row3);
__m128 rgba4 = _mm_loadu_ps(row4);
rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
__m128 rgba13 = _mm_add_ps(rgba1, rgba3);
__m128 rgba24 = _mm_add_ps(rgba2, rgba4);
__m128 rgba = _mm_add_ps(rgba13, rgba24);
_mm_storeu_ps(float_output, rgba);
#else
float_output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
float_output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
float_output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
float_output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
#endif
}
}
void BLI_bilinear_interpolation_char(
const uchar *buffer, uchar *output, int width, int height, float u, float v)
{
#if BLI_HAVE_SSE2
/* Bilinear interpolation needs to read and blend four image pixels, while
* also handling conditions of sample coordinate being outside of the
* image, in which case black (all zeroes) should be used as the sample
* contribution.
*
* Code below does all that without any branches, by making outside the
* image sample locations still read the first pixel of the image, but
* later making sure that the result is set to zero for that sample. */
__m128 uvuv = _mm_set_ps(v, u, v, u);
# if defined(__SSE4_1__) || defined(__ARM_NEON) && defined(WITH_SSE2NEON)
/* If we're on SSE4 or ARM NEON, just use the simple floor() way. */
__m128 uvuv_floor = _mm_floor_ps(uvuv);
# else
/* The hard way: truncate, for negative inputs this will round towards zero.
* Then compare with input UV, and subtract 1 for the inputs that were
* negative. */
__m128 uv_trunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(uvuv));
__m128 uv_neg = _mm_cmplt_ps(uvuv, uv_trunc);
__m128 uvuv_floor = _mm_sub_ps(uv_trunc, _mm_and_ps(uv_neg, _mm_set1_ps(1.0f)));
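/* Example: u = -1.3f truncates to -1.0f; the comparison notes -1.3f < -1.0f,
 * so 1.0f is subtracted, yielding the true floor of -2.0f. */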
# endif
/* x1, y1, x2, y2 */
__m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
/* Check whether any of the coordinates are outside of the image. */
__m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
_mm_set1_epi32(1));
__m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
__m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
__m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);
/* Samples 1,2,3,4 are in this order: x1y1, x1y2, x2y1, x2y2 */
__m128i x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
__m128i y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
__m128i invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
/* Set x & y to zero for invalid samples. */
x1234 = _mm_andnot_si128(invalid_1234, x1234);
y1234 = _mm_andnot_si128(invalid_1234, y1234);
/* Read the four sample values. Do address calculations in C, since SSE
* before 4.1 makes it very cumbersome to do full integer multiplies. */
int xcoord[4];
int ycoord[4];
_mm_storeu_ps((float *)xcoord, _mm_castsi128_ps(x1234));
_mm_storeu_ps((float *)ycoord, _mm_castsi128_ps(y1234));
int sample1 = ((const int *)buffer)[ycoord[0] * (int64_t)width + xcoord[0]];
int sample2 = ((const int *)buffer)[ycoord[1] * (int64_t)width + xcoord[1]];
int sample3 = ((const int *)buffer)[ycoord[2] * (int64_t)width + xcoord[2]];
int sample4 = ((const int *)buffer)[ycoord[3] * (int64_t)width + xcoord[3]];
__m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
/* Set samples to black for the ones that were actually invalid. */
samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
/* Expand samples from packed 8-bit RGBA to full floats:
* spread to 16 bit values. */
__m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
__m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
/* Spread to 32 bit values and convert to float. */
__m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
__m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
__m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
__m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));
/* Calculate interpolation factors: (1-a)*(1-b), (1-a)*b, a*(1-b), a*b */
__m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
__m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
__m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
__m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));
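/* Lane check: ab_mab = (a, b, 1-a, 1-b), so the two shuffles multiply
 * (1-a, 1-a, a, a) by (1-b, b, 1-b, b), giving factors = (ma_mb, ma_b,
 * a_mb, a_b) in the same order as the scalar path below. */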
/* Blend the samples. */
rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
__m128 rgba13 = _mm_add_ps(rgba1, rgba3);
__m128 rgba24 = _mm_add_ps(rgba2, rgba4);
__m128 rgba = _mm_add_ps(rgba13, rgba24);
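/* Add 0.5f so the truncating float-to-int conversion below rounds to nearest. */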
rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
/* Pack and write to destination: pack to 16 bit signed, then to 8 bit
* unsigned, then write resulting 32-bit value. */
__m128i rgba32 = _mm_cvttps_epi32(rgba);
__m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
__m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
_mm_store_ss((float *)output, _mm_castsi128_ps(rgba8));
#else
float a, b;
float a_b, ma_b, a_mb, ma_mb;
int y1, y2, x1, x2;
float uf = floorf(u);
float vf = floorf(v);
x1 = (int)uf;
x2 = x1 + 1;
y1 = (int)vf;
y2 = y1 + 1;
const uchar *row1, *row2, *row3, *row4;
uchar empty[4] = {0, 0, 0, 0};
/* completely outside of the image? */
if (x2 < 0 || x1 >= width) {
copy_vn_uchar(output, 4, 0);
return;
}
if (y2 < 0 || y1 >= height) {
copy_vn_uchar(output, 4, 0);
return;
}
/* Sample, including texels outside the edges of the image. */
if (x1 < 0 || y1 < 0) {
row1 = empty;
}
else {
row1 = buffer + width * y1 * 4 + 4 * x1;
}
if (x1 < 0 || y2 > height - 1) {
row2 = empty;
}
else {
row2 = buffer + width * y2 * 4 + 4 * x1;
}
if (x2 > width - 1 || y1 < 0) {
row3 = empty;
}
else {
row3 = buffer + width * y1 * 4 + 4 * x2;
}
if (x2 > width - 1 || y2 > height - 1) {
row4 = empty;
}
else {
row4 = buffer + width * y2 * 4 + 4 * x2;
}
a = u - uf;
b = v - vf;
a_b = a * b;
ma_b = (1.0f - a) * b;
a_mb = a * (1.0f - b);
ma_mb = (1.0f - a) * (1.0f - b);
output[0] = (uchar)(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
output[1] = (uchar)(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
output[2] = (uchar)(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
output[3] = (uchar)(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
#endif
}
void BLI_bilinear_interpolation_fl(
const float *buffer, float *output, int width, int height, int components, float u, float v)
{
bilinear_interpolation_fl(buffer, output, width, height, components, u, v, false, false);
}
void BLI_bilinear_interpolation_wrap_fl(const float *buffer,
float *output,
int width,
int height,
int components,
float u,
float v,
bool wrap_x,
bool wrap_y)
{
bilinear_interpolation_fl(buffer, output, width, height, components, u, v, wrap_x, wrap_y);
}
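/* A minimal usage sketch (not part of the original file): sample a float RGBA
 * buffer with horizontal wrapping, as one might for a 360-degree panorama.
 * With wrap_x enabled, u = -0.25f blends the last column (weight 0.25) with
 * the first column (weight 0.75). */
#if 0
static void bilinear_wrap_usage_sketch(const float *rgba, int width, int height)
{
  float sample[4];
  BLI_bilinear_interpolation_wrap_fl(rgba, sample, width, height, 4, -0.25f, 0.5f, true, false);
}
#endif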
/**************************************************************************
* Filtering method based on
* "Creating raster omnimax images from multiple perspective views
* using the elliptical weighted average filter"
* by Ned Greene and Paul S. Heckbert (1986)
***************************************************************************/
/* Table of `(exp(ar) - exp(a)) / (1 - exp(a))` for `r` in range [0, 1] and `a = -2`,
 * used instead of an actual gaussian; otherwise at high texture magnifications
 * circular artifacts are visible. (A generator sketch follows the table.) */
#define EWA_MAXIDX 255
const float EWA_WTS[EWA_MAXIDX + 1] = {
1.0f, 0.990965f, 0.982f, 0.973105f, 0.96428f, 0.955524f, 0.946836f,
0.938216f, 0.929664f, 0.921178f, 0.912759f, 0.904405f, 0.896117f, 0.887893f,
0.879734f, 0.871638f, 0.863605f, 0.855636f, 0.847728f, 0.839883f, 0.832098f,
0.824375f, 0.816712f, 0.809108f, 0.801564f, 0.794079f, 0.786653f, 0.779284f,
0.771974f, 0.76472f, 0.757523f, 0.750382f, 0.743297f, 0.736267f, 0.729292f,
0.722372f, 0.715505f, 0.708693f, 0.701933f, 0.695227f, 0.688572f, 0.68197f,
0.67542f, 0.66892f, 0.662471f, 0.656073f, 0.649725f, 0.643426f, 0.637176f,
0.630976f, 0.624824f, 0.618719f, 0.612663f, 0.606654f, 0.600691f, 0.594776f,
0.588906f, 0.583083f, 0.577305f, 0.571572f, 0.565883f, 0.56024f, 0.55464f,
0.549084f, 0.543572f, 0.538102f, 0.532676f, 0.527291f, 0.521949f, 0.516649f,
0.511389f, 0.506171f, 0.500994f, 0.495857f, 0.490761f, 0.485704f, 0.480687f,
0.475709f, 0.470769f, 0.465869f, 0.461006f, 0.456182f, 0.451395f, 0.446646f,
0.441934f, 0.437258f, 0.432619f, 0.428017f, 0.42345f, 0.418919f, 0.414424f,
0.409963f, 0.405538f, 0.401147f, 0.39679f, 0.392467f, 0.388178f, 0.383923f,
0.379701f, 0.375511f, 0.371355f, 0.367231f, 0.363139f, 0.359079f, 0.355051f,
0.351055f, 0.347089f, 0.343155f, 0.339251f, 0.335378f, 0.331535f, 0.327722f,
0.323939f, 0.320186f, 0.316461f, 0.312766f, 0.3091f, 0.305462f, 0.301853f,
0.298272f, 0.294719f, 0.291194f, 0.287696f, 0.284226f, 0.280782f, 0.277366f,
0.273976f, 0.270613f, 0.267276f, 0.263965f, 0.26068f, 0.257421f, 0.254187f,
0.250979f, 0.247795f, 0.244636f, 0.241502f, 0.238393f, 0.235308f, 0.232246f,
0.229209f, 0.226196f, 0.223206f, 0.220239f, 0.217296f, 0.214375f, 0.211478f,
0.208603f, 0.20575f, 0.20292f, 0.200112f, 0.197326f, 0.194562f, 0.191819f,
0.189097f, 0.186397f, 0.183718f, 0.18106f, 0.178423f, 0.175806f, 0.17321f,
0.170634f, 0.168078f, 0.165542f, 0.163026f, 0.16053f, 0.158053f, 0.155595f,
0.153157f, 0.150738f, 0.148337f, 0.145955f, 0.143592f, 0.141248f, 0.138921f,
0.136613f, 0.134323f, 0.132051f, 0.129797f, 0.12756f, 0.125341f, 0.123139f,
0.120954f, 0.118786f, 0.116635f, 0.114501f, 0.112384f, 0.110283f, 0.108199f,
0.106131f, 0.104079f, 0.102043f, 0.100023f, 0.0980186f, 0.09603f, 0.094057f,
0.0920994f, 0.0901571f, 0.08823f, 0.0863179f, 0.0844208f, 0.0825384f, 0.0806708f,
0.0788178f, 0.0769792f, 0.0751551f, 0.0733451f, 0.0715493f, 0.0697676f, 0.0679997f,
0.0662457f, 0.0645054f, 0.0627786f, 0.0610654f, 0.0593655f, 0.0576789f, 0.0560055f,
0.0543452f, 0.0526979f, 0.0510634f, 0.0494416f, 0.0478326f, 0.0462361f, 0.0446521f,
0.0430805f, 0.0415211f, 0.039974f, 0.0384389f, 0.0369158f, 0.0354046f, 0.0339052f,
0.0324175f, 0.0309415f, 0.029477f, 0.0280239f, 0.0265822f, 0.0251517f, 0.0237324f,
0.0223242f, 0.020927f, 0.0195408f, 0.0181653f, 0.0168006f, 0.0154466f, 0.0141031f,
0.0127701f, 0.0114476f, 0.0101354f, 0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
0.00372644f, 0.00247454f, 0.00123242f, 0.0f,
};
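/* A minimal sketch (not part of the original file) of how a table like
 * EWA_WTS can be generated from the formula documented above, assuming
 * `a = -2` and `r = i / EWA_MAXIDX`: */
#if 0
static void ewa_wts_generate(float table[EWA_MAXIDX + 1])
{
  const float a = -2.0f;
  for (int i = 0; i <= EWA_MAXIDX; i++) {
    const float r = (float)i / (float)EWA_MAXIDX;
    table[i] = (expf(a * r) - expf(a)) / (1.0f - expf(a));
  }
}
#endif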
static void radangle2imp(float a2, float b2, float th, float *A, float *B, float *C, float *F)
{
float ct2 = cosf(th);
const float st2 = 1.0f - ct2 * ct2; /* <- sin(th)^2 */
ct2 *= ct2;
*A = a2 * st2 + b2 * ct2;
*B = (b2 - a2) * sinf(2.0f * th);
*C = a2 * ct2 + b2 * st2;
*F = a2 * b2;
}
void BLI_ewa_imp2radangle(
float A, float B, float C, float F, float *a, float *b, float *th, float *ecc)
{
/* NOTE: all tests here are done to make sure possible overflows are hopefully minimized. */
if (F <= 1e-5f) { /* use arbitrary major radius, zero minor, infinite eccentricity */
*a = sqrtf(A > C ? A : C);
*b = 0.0f;
*ecc = 1e10f;
*th = 0.5f * (atan2f(B, A - C) + (float)M_PI);
}
else {
const float AmC = A - C, ApC = A + C, F2 = F * 2.0f;
const float r = sqrtf(AmC * AmC + B * B);
float d = ApC - r;
*a = (d <= 0.0f) ? sqrtf(A > C ? A : C) : sqrtf(F2 / d);
d = ApC + r;
if (d <= 0.0f) {
*b = 0.0f;
*ecc = 1e10f;
}
else {
*b = sqrtf(F2 / d);
*ecc = *a / *b;
}
/* Increase theta by `0.5 * pi` (angle of major axis). */
*th = 0.5f * (atan2f(B, AmC) + (float)M_PI);
}
}
void BLI_ewa_filter(const int width,
const int height,
const bool intpol,
const bool use_alpha,
const float uv[2],
const float du[2],
const float dv[2],
ewa_filter_read_pixel_cb read_pixel_cb,
void *userdata,
float result[4])
{
/* scaling dxt/dyt by full resolution can cause overflow because of huge A/B/C and esp. F values,
* scaling by aspect ratio alone does the opposite, so try something in between instead... */
const float ff2 = (float)width, ff = sqrtf(ff2), q = (float)height / ff;
const float Ux = du[0] * ff, Vx = du[1] * q, Uy = dv[0] * ff, Vy = dv[1] * q;
float A = Vx * Vx + Vy * Vy;
float B = -2.0f * (Ux * Vx + Uy * Vy);
float C = Ux * Ux + Uy * Uy;
float F = A * C - B * B * 0.25f;
float a, b, th, ecc, a2, b2, ue, ve, U0, V0, DDQ, U, ac1, ac2, BU, d;
int u, v, u1, u2, v1, v2;
/* The so-called 'high' quality EWA method simply adds a constant of 1 to both A & C,
 * so the ellipse always covers at least some texels. But since the filter is then always larger,
 * everywhere else the result is also more blurry than it ideally should be.
 * So here the ellipse radii are modified instead, whenever either is too low.
 * Use a different radius based on the interpolation switch:
 * just enough to anti-alias when interpolation is off,
 * and slightly larger to make the result a bit smoother than bilinear interpolation when
 * interpolation is on (minimum values: `const float rmin = intpol ? 1.0f : 0.5f;`). */
const float rmin = (intpol ? 1.5625f : 0.765625f) / ff2;
BLI_ewa_imp2radangle(A, B, C, F, &a, &b, &th, &ecc);
if ((b2 = b * b) < rmin) {
if ((a2 = a * a) < rmin) {
B = 0.0f;
A = C = rmin;
F = A * C;
}
else {
b2 = rmin;
radangle2imp(a2, b2, th, &A, &B, &C, &F);
}
}
ue = ff * sqrtf(C);
ve = ff * sqrtf(A);
d = (float)(EWA_MAXIDX + 1) / (F * ff2);
A *= d;
B *= d;
C *= d;
U0 = uv[0] * (float)width;
V0 = uv[1] * (float)height;
u1 = (int)floorf(U0 - ue);
u2 = (int)ceilf(U0 + ue);
v1 = (int)floorf(V0 - ve);
v2 = (int)ceilf(V0 + ve);
/* sane clamping to avoid unnecessarily huge loops */
/* NOTE: if eccentricity gets clamped (see above),
* the ue/ve limits can also be lowered accordingly
*/
if (U0 - (float)u1 > EWA_MAXIDX) {
u1 = (int)U0 - EWA_MAXIDX;
}
if ((float)u2 - U0 > EWA_MAXIDX) {
u2 = (int)U0 + EWA_MAXIDX;
}
if (V0 - (float)v1 > EWA_MAXIDX) {
v1 = (int)V0 - EWA_MAXIDX;
}
if ((float)v2 - V0 > EWA_MAXIDX) {
v2 = (int)V0 + EWA_MAXIDX;
}
/* Early output check for cases the whole region is outside of the buffer. */
if ((u2 < 0 || u1 >= width) || (v2 < 0 || v1 >= height)) {
zero_v4(result);
return;
}
U0 -= 0.5f;
V0 -= 0.5f;
DDQ = 2.0f * A;
U = (float)u1 - U0;
ac1 = A * (2.0f * U + 1.0f);
ac2 = A * U * U;
BU = B * U;
d = 0.0f;
zero_v4(result);
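/* Walk the bounding box, evaluating the ellipse function
 * Q = A*U^2 + B*U*V + C*V^2 incrementally by forward differencing:
 * along a row Q grows by DQ, and DQ grows by the constant DDQ = 2*A. */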
for (v = v1; v <= v2; v++) {
const float V = (float)v - V0;
float DQ = ac1 + B * V;
float Q = (C * V + BU) * V + ac2;
for (u = u1; u <= u2; u++) {
if (Q < (float)(EWA_MAXIDX + 1)) {
float tc[4];
const float wt = EWA_WTS[(Q < 0.0f) ? 0 : (uint)Q];
read_pixel_cb(userdata, u, v, tc);
madd_v3_v3fl(result, tc, wt);
result[3] += use_alpha ? tc[3] * wt : 0.0f;
d += wt;
}
Q += DQ;
DQ += DDQ;
}
}
/* `d` should hopefully never be zero anymore. */
d = 1.0f / d;
mul_v3_fl(result, d);
/* Clipping can be ignored if alpha is used: `result[3]` already includes the filtered edge. */
result[3] = use_alpha ? result[3] * d : 1.0f;
}
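/* A minimal usage sketch (not part of the original file): filtering a float
 * RGBA buffer through a clamping read_pixel callback. The ExampleImage struct
 * and both functions are hypothetical; the callback signature is the one
 * implied by the call above. */
#if 0
typedef struct ExampleImage {
  const float *rgba; /* width * height * 4 floats */
  int width, height;
} ExampleImage;

static void example_read_pixel(void *userdata, int x, int y, float result[4])
{
  const ExampleImage *img = userdata;
  /* The filter may request coordinates outside the buffer; clamp them. */
  CLAMP(x, 0, img->width - 1);
  CLAMP(y, 0, img->height - 1);
  copy_v4_v4(result, img->rgba + ((size_t)y * (size_t)img->width + (size_t)x) * 4);
}

static void ewa_usage_sketch(const ExampleImage *img, float out[4])
{
  const float uv[2] = {0.5f, 0.5f};                     /* image center, normalized */
  const float du[2] = {1.0f / (float)img->width, 0.0f}; /* sample footprint derivatives */
  const float dv[2] = {0.0f, 1.0f / (float)img->height};
  BLI_ewa_filter(
      img->width, img->height, true, true, uv, du, dv, example_read_pixel, (void *)img, out);
}
#endif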