Files
test2/source/blender/blenlib/intern/math_interp.cc
Omar Emara 462b81efca Fix: Border bicubic interpolation is wrong around edges
The BLI bicubic interpolation function with border boundary is wrong
around edges. It returns zero, where interpolated values should exist.
This is due to a wrong early exit condition, where only a 2x2 window is
assumed instead of the 4x4 window considered by bicubic interpolation.
To fix this, assume a 2 pixel region in the early exit condition for
both directions.

Pull Request: https://projects.blender.org/blender/blender/pulls/131320
2024-12-04 08:39:58 +01:00

942 lines
34 KiB
C++

/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup bli
*/
#include <cmath>
#include <cstring>
#include "BLI_math_base.h"
#include "BLI_math_base.hh"
#include "BLI_math_interp.hh"
#include "BLI_math_vector.h"
#include "BLI_math_vector_types.hh"
#include "BLI_simd.hh"
#include "BLI_strict_flags.h" /* Keep last. */
namespace blender::math {
BLI_INLINE int wrap_coord(float u, int size, InterpWrapMode wrap)
{
int x = 0;
switch (wrap) {
case InterpWrapMode::Extend:
x = math::clamp(int(u), 0, size - 1);
break;
case InterpWrapMode::Repeat:
x = int(floored_fmod(u, float(size)));
break;
case InterpWrapMode::Border:
x = int(u);
if (u < 0.0f || x >= size) {
x = -1;
}
break;
}
return x;
}
void interpolate_nearest_wrapmode_fl(const float *buffer,
float *output,
int width,
int height,
int components,
float u,
float v,
InterpWrapMode wrap_u,
InterpWrapMode wrap_v)
{
BLI_assert(buffer);
int x = wrap_coord(u, width, wrap_u);
int y = wrap_coord(v, height, wrap_v);
if (x < 0 || y < 0) {
for (int i = 0; i < components; i++) {
output[i] = 0.0f;
}
return;
}
const float *data = buffer + (int64_t(width) * y + x) * components;
for (int i = 0; i < components; i++) {
output[i] = data[i];
}
}
enum class eCubicFilter {
BSpline,
Mitchell,
};
/* Calculate cubic filter coefficients, for samples at -1,0,+1,+2.
* f is 0..1 offset from texel center in pixel space. */
template<enum eCubicFilter filter> static float4 cubic_filter_coefficients(float f)
{
float f2 = f * f;
float f3 = f2 * f;
if constexpr (filter == eCubicFilter::BSpline) {
/* Cubic B-Spline (Mitchell-Netravali filter with B=1, C=0 parameters). */
float w3 = f3 * (1.0f / 6.0f);
float w0 = -w3 + f2 * 0.5f - f * 0.5f + 1.0f / 6.0f;
float w1 = f3 * 0.5f - f2 * 1.0f + 2.0f / 3.0f;
float w2 = 1.0f - w0 - w1 - w3;
return float4(w0, w1, w2, w3);
}
else if constexpr (filter == eCubicFilter::Mitchell) {
/* Cubic Mitchell-Netravali filter with B=1/3, C=1/3 parameters. */
float w0 = -7.0f / 18.0f * f3 + 5.0f / 6.0f * f2 - 0.5f * f + 1.0f / 18.0f;
float w1 = 7.0f / 6.0f * f3 - 2.0f * f2 + 8.0f / 9.0f;
float w2 = -7.0f / 6.0f * f3 + 3.0f / 2.0f * f2 + 0.5f * f + 1.0f / 18.0f;
float w3 = 7.0f / 18.0f * f3 - 1.0f / 3.0f * f2;
return float4(w0, w1, w2, w3);
}
}
#if BLI_HAVE_SSE4
template<eCubicFilter filter>
BLI_INLINE void bicubic_interpolation_uchar_simd(
const uchar *src_buffer, uchar *output, int width, int height, float u, float v)
{
__m128 uv = _mm_set_ps(0, 0, v, u);
__m128 uv_floor = _mm_floor_ps(uv);
__m128i i_uv = _mm_cvttps_epi32(uv_floor);
__m128 frac_uv = _mm_sub_ps(uv, uv_floor);
/* Calculate pixel weights. */
float4 wx = cubic_filter_coefficients<filter>(_mm_cvtss_f32(frac_uv));
float4 wy = cubic_filter_coefficients<filter>(
_mm_cvtss_f32(_mm_shuffle_ps(frac_uv, frac_uv, 1)));
/* Read 4x4 source pixels and blend them. */
__m128 out = _mm_setzero_ps();
int iu = _mm_cvtsi128_si32(i_uv);
int iv = _mm_cvtsi128_si32(_mm_shuffle_epi32(i_uv, 1));
for (int n = 0; n < 4; n++) {
int y1 = iv + n - 1;
CLAMP(y1, 0, height - 1);
for (int m = 0; m < 4; m++) {
int x1 = iu + m - 1;
CLAMP(x1, 0, width - 1);
float w = wx[m] * wy[n];
const uchar *data = src_buffer + (width * y1 + x1) * 4;
/* Load 4 bytes and expand into 4-lane SIMD. */
__m128i sample_i = _mm_castps_si128(_mm_load_ss((const float *)data));
sample_i = _mm_unpacklo_epi8(sample_i, _mm_setzero_si128());
sample_i = _mm_unpacklo_epi16(sample_i, _mm_setzero_si128());
/* Accumulate into out with weight. */
out = _mm_add_ps(out, _mm_mul_ps(_mm_cvtepi32_ps(sample_i), _mm_set1_ps(w)));
}
}
/* Pack and write to destination: pack to 16 bit signed, then to 8 bit
* unsigned, then write resulting 32-bit value. This will clamp
* out of range values too. */
out = _mm_add_ps(out, _mm_set1_ps(0.5f));
__m128i rgba32 = _mm_cvttps_epi32(out);
__m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
__m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
_mm_store_ss((float *)output, _mm_castsi128_ps(rgba8));
}
#endif /* BLI_HAVE_SSE4 */
template<typename T, eCubicFilter filter>
BLI_INLINE void bicubic_interpolation(const T *src_buffer,
T *output,
int width,
int height,
int components,
float u,
float v,
InterpWrapMode wrap_u,
InterpWrapMode wrap_v)
{
BLI_assert(src_buffer && output);
#if BLI_HAVE_SSE4
if constexpr (std::is_same_v<T, uchar>) {
if (components == 4 && wrap_u == InterpWrapMode::Extend && wrap_v == InterpWrapMode::Extend) {
bicubic_interpolation_uchar_simd<filter>(src_buffer, output, width, height, u, v);
return;
}
}
#endif
int iu = int(floor(u));
int iv = int(floor(v));
/* Sample area entirely outside image in border mode? */
if (wrap_u == InterpWrapMode::Border && (iu + 2 < 0 || iu > width)) {
memset(output, 0, size_t(components) * sizeof(T));
return;
}
if (wrap_v == InterpWrapMode::Border && (iv + 2 < 0 || iv > height)) {
memset(output, 0, size_t(components) * sizeof(T));
return;
}
float frac_u = u - float(iu);
float frac_v = v - float(iv);
float4 out{0.0f};
/* Calculate pixel weights. */
float4 wx = cubic_filter_coefficients<filter>(frac_u);
float4 wy = cubic_filter_coefficients<filter>(frac_v);
/* Read 4x4 source pixels and blend them. */
for (int n = 0; n < 4; n++) {
int y1 = iv + n - 1;
y1 = wrap_coord(float(y1), height, wrap_v);
if (wrap_v == InterpWrapMode::Border && y1 < 0) {
continue;
}
for (int m = 0; m < 4; m++) {
int x1 = iu + m - 1;
x1 = wrap_coord(float(x1), width, wrap_u);
if (wrap_u == InterpWrapMode::Border && x1 < 0) {
continue;
}
float w = wx[m] * wy[n];
const T *data = src_buffer + (width * y1 + x1) * components;
if (components == 1) {
out[0] += data[0] * w;
}
else if (components == 3) {
out[0] += data[0] * w;
out[1] += data[1] * w;
out[2] += data[2] * w;
}
else {
out[0] += data[0] * w;
out[1] += data[1] * w;
out[2] += data[2] * w;
out[3] += data[3] * w;
}
}
}
/* Mitchell filter has negative lobes; prevent output from going out of range. */
if constexpr (filter == eCubicFilter::Mitchell) {
for (int i = 0; i < components; i++) {
out[i] = math::max(out[i], 0.0f);
if constexpr (std::is_same_v<T, uchar>) {
out[i] = math::min(out[i], 255.0f);
}
}
}
/* Write result. */
if constexpr (std::is_same_v<T, float>) {
if (components == 1) {
output[0] = out[0];
}
else if (components == 3) {
copy_v3_v3(output, out);
}
else {
copy_v4_v4(output, out);
}
}
else {
if (components == 1) {
output[0] = uchar(out[0] + 0.5f);
}
else if (components == 3) {
output[0] = uchar(out[0] + 0.5f);
output[1] = uchar(out[1] + 0.5f);
output[2] = uchar(out[2] + 0.5f);
}
else {
output[0] = uchar(out[0] + 0.5f);
output[1] = uchar(out[1] + 0.5f);
output[2] = uchar(out[2] + 0.5f);
output[3] = uchar(out[3] + 0.5f);
}
}
}
BLI_INLINE void bilinear_fl_impl(const float *buffer,
float *output,
int width,
int height,
int components,
float u,
float v,
InterpWrapMode wrap_x,
InterpWrapMode wrap_y)
{
BLI_assert(buffer && output);
float a, b;
float a_b, ma_b, a_mb, ma_mb;
int y1, y2, x1, x2;
if (wrap_x == InterpWrapMode::Repeat) {
u = floored_fmod(u, float(width));
}
if (wrap_y == InterpWrapMode::Repeat) {
v = floored_fmod(v, float(height));
}
float uf = floorf(u);
float vf = floorf(v);
x1 = int(uf);
x2 = x1 + 1;
y1 = int(vf);
y2 = y1 + 1;
const float *row1, *row2, *row3, *row4;
const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};
/* Check if +1 samples need wrapping, or we don't do wrapping then if
* we are sampling completely outside the image. */
if (wrap_x == InterpWrapMode::Repeat) {
if (x2 >= width) {
x2 = 0;
}
}
else if (wrap_x == InterpWrapMode::Border && (x2 < 0 || x1 >= width)) {
copy_vn_fl(output, components, 0.0f);
return;
}
if (wrap_y == InterpWrapMode::Repeat) {
if (y2 >= height) {
y2 = 0;
}
}
else if (wrap_y == InterpWrapMode::Border && (y2 < 0 || y1 >= height)) {
copy_vn_fl(output, components, 0.0f);
return;
}
/* Sample locations. */
int x1c = blender::math::clamp(x1, 0, width - 1);
int x2c = blender::math::clamp(x2, 0, width - 1);
int y1c = blender::math::clamp(y1, 0, height - 1);
int y2c = blender::math::clamp(y2, 0, height - 1);
row1 = buffer + (int64_t(width) * y1c + x1c) * components;
row2 = buffer + (int64_t(width) * y2c + x1c) * components;
row3 = buffer + (int64_t(width) * y1c + x2c) * components;
row4 = buffer + (int64_t(width) * y2c + x2c) * components;
if (wrap_x == InterpWrapMode::Border) {
if (x1 < 0) {
row1 = empty;
row2 = empty;
}
if (x2 > width - 1) {
row3 = empty;
row4 = empty;
}
}
if (wrap_y == InterpWrapMode::Border) {
if (y1 < 0) {
row1 = empty;
row3 = empty;
}
if (y2 > height - 1) {
row2 = empty;
row4 = empty;
}
}
/* Finally, do interpolation. */
a = u - uf;
b = v - vf;
a_b = a * b;
ma_b = (1.0f - a) * b;
a_mb = a * (1.0f - b);
ma_mb = (1.0f - a) * (1.0f - b);
if (components == 1) {
output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
}
else if (components == 3) {
output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
}
else {
#if BLI_HAVE_SSE2
__m128 rgba1 = _mm_loadu_ps(row1);
__m128 rgba2 = _mm_loadu_ps(row2);
__m128 rgba3 = _mm_loadu_ps(row3);
__m128 rgba4 = _mm_loadu_ps(row4);
rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
__m128 rgba13 = _mm_add_ps(rgba1, rgba3);
__m128 rgba24 = _mm_add_ps(rgba2, rgba4);
__m128 rgba = _mm_add_ps(rgba13, rgba24);
_mm_storeu_ps(output, rgba);
#else
output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
#endif
}
}
template<bool border>
BLI_INLINE uchar4 bilinear_byte_impl(const uchar *buffer, int width, int height, float u, float v)
{
BLI_assert(buffer);
uchar4 res;
#if BLI_HAVE_SSE4
__m128 uvuv = _mm_set_ps(v, u, v, u);
__m128 uvuv_floor = _mm_floor_ps(uvuv);
/* x1, y1, x2, y2 */
__m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
/* Check whether any of the coordinates are outside of the image. */
__m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
_mm_set1_epi32(1));
/* Samples 1,2,3,4 will be in this order: x1y1, x1y2, x2y1, x2y2. */
__m128i x1234, y1234, invalid_1234;
if constexpr (border) {
/* Blend black colors for samples right outside the image: figure out
* which of the 4 samples were outside, set their coordinates to zero
* and later on put black color into their place. */
__m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
__m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
__m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);
/* Samples 1,2,3,4 are in this order: x1y1, x1y2, x2y1, x2y2 */
x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
/* Set x & y to zero for invalid samples. */
x1234 = _mm_andnot_si128(invalid_1234, x1234);
y1234 = _mm_andnot_si128(invalid_1234, y1234);
}
else {
/* Clamp samples to image edges. */
__m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
}
/* Read the four sample values. Do address calculations in C, since SSE
* before 4.1 makes it very cumbersome to do full integer multiplies. */
int xcoord[4];
int ycoord[4];
_mm_storeu_ps((float *)xcoord, _mm_castsi128_ps(x1234));
_mm_storeu_ps((float *)ycoord, _mm_castsi128_ps(y1234));
int sample1 = ((const int *)buffer)[ycoord[0] * int64_t(width) + xcoord[0]];
int sample2 = ((const int *)buffer)[ycoord[1] * int64_t(width) + xcoord[1]];
int sample3 = ((const int *)buffer)[ycoord[2] * int64_t(width) + xcoord[2]];
int sample4 = ((const int *)buffer)[ycoord[3] * int64_t(width) + xcoord[3]];
__m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
if constexpr (border) {
/* Set samples to black for the ones that were actually invalid. */
samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
}
/* Expand samples from packed 8-bit RGBA to full floats:
* spread to 16 bit values. */
__m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
__m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
/* Spread to 32 bit values and convert to float. */
__m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
__m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
__m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
__m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));
/* Calculate interpolation factors: (1-a)*(1-b), (1-a)*b, a*(1-b), a*b */
__m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
__m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
__m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
__m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));
/* Blend the samples. */
rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
__m128 rgba13 = _mm_add_ps(rgba1, rgba3);
__m128 rgba24 = _mm_add_ps(rgba2, rgba4);
__m128 rgba = _mm_add_ps(rgba13, rgba24);
rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
/* Pack and write to destination: pack to 16 bit signed, then to 8 bit
* unsigned, then write resulting 32-bit value. */
__m128i rgba32 = _mm_cvttps_epi32(rgba);
__m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
__m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
_mm_store_ss((float *)&res, _mm_castsi128_ps(rgba8));
#else
float uf = floorf(u);
float vf = floorf(v);
int x1 = int(uf);
int x2 = x1 + 1;
int y1 = int(vf);
int y2 = y1 + 1;
/* Completely outside of the image in bordered mode? */
if (border && (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height)) {
return uchar4(0);
}
/* Sample locations. */
const uchar *row1, *row2, *row3, *row4;
uchar empty[4] = {0, 0, 0, 0};
if constexpr (border) {
row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x1) * 4;
row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x1) * 4;
row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (int64_t(width) * y1 + x2) * 4;
row4 = (x2 > width - 1 || y2 > height - 1) ? empty : buffer + (int64_t(width) * y2 + x2) * 4;
}
else {
x1 = blender::math::clamp(x1, 0, width - 1);
x2 = blender::math::clamp(x2, 0, width - 1);
y1 = blender::math::clamp(y1, 0, height - 1);
y2 = blender::math::clamp(y2, 0, height - 1);
row1 = buffer + (int64_t(width) * y1 + x1) * 4;
row2 = buffer + (int64_t(width) * y2 + x1) * 4;
row3 = buffer + (int64_t(width) * y1 + x2) * 4;
row4 = buffer + (int64_t(width) * y2 + x2) * 4;
}
float a = u - uf;
float b = v - vf;
float a_b = a * b;
float ma_b = (1.0f - a) * b;
float a_mb = a * (1.0f - b);
float ma_mb = (1.0f - a) * (1.0f - b);
res.x = uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
res.y = uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
res.z = uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
res.w = uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
#endif
return res;
}
uchar4 interpolate_bilinear_border_byte(
const uchar *buffer, int width, int height, float u, float v)
{
return bilinear_byte_impl<true>(buffer, width, height, u, v);
}
uchar4 interpolate_bilinear_byte(const uchar *buffer, int width, int height, float u, float v)
{
return bilinear_byte_impl<false>(buffer, width, height, u, v);
}
float4 interpolate_bilinear_border_fl(const float *buffer, int width, int height, float u, float v)
{
float4 res;
bilinear_fl_impl(
buffer, res, width, height, 4, u, v, InterpWrapMode::Border, InterpWrapMode::Border);
return res;
}
void interpolate_bilinear_border_fl(
const float *buffer, float *output, int width, int height, int components, float u, float v)
{
bilinear_fl_impl(buffer,
output,
width,
height,
components,
u,
v,
InterpWrapMode::Border,
InterpWrapMode::Border);
}
float4 interpolate_bilinear_fl(const float *buffer, int width, int height, float u, float v)
{
float4 res;
bilinear_fl_impl(
buffer, res, width, height, 4, u, v, InterpWrapMode::Extend, InterpWrapMode::Extend);
return res;
}
void interpolate_bilinear_fl(
const float *buffer, float *output, int width, int height, int components, float u, float v)
{
bilinear_fl_impl(buffer,
output,
width,
height,
components,
u,
v,
InterpWrapMode::Extend,
InterpWrapMode::Extend);
}
void interpolate_bilinear_wrapmode_fl(const float *buffer,
float *output,
int width,
int height,
int components,
float u,
float v,
InterpWrapMode wrap_u,
InterpWrapMode wrap_v)
{
bilinear_fl_impl(buffer, output, width, height, components, u, v, wrap_u, wrap_v);
}
uchar4 interpolate_bilinear_wrap_byte(const uchar *buffer, int width, int height, float u, float v)
{
u = floored_fmod(u, float(width));
v = floored_fmod(v, float(height));
float uf = floorf(u);
float vf = floorf(v);
int x1 = int(uf);
int x2 = x1 + 1;
int y1 = int(vf);
int y2 = y1 + 1;
/* Wrap interpolation pixels if needed. */
BLI_assert(x1 >= 0 && x1 < width && y1 >= 0 && y1 < height);
if (x2 >= width) {
x2 = 0;
}
if (y2 >= height) {
y2 = 0;
}
float a = u - uf;
float b = v - vf;
float a_b = a * b;
float ma_b = (1.0f - a) * b;
float a_mb = a * (1.0f - b);
float ma_mb = (1.0f - a) * (1.0f - b);
/* Blend samples. */
const uchar *row1 = buffer + (int64_t(width) * y1 + x1) * 4;
const uchar *row2 = buffer + (int64_t(width) * y2 + x1) * 4;
const uchar *row3 = buffer + (int64_t(width) * y1 + x2) * 4;
const uchar *row4 = buffer + (int64_t(width) * y2 + x2) * 4;
uchar4 res;
res.x = uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
res.y = uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
res.z = uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
res.w = uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
return res;
}
float4 interpolate_bilinear_wrap_fl(const float *buffer, int width, int height, float u, float v)
{
float4 res;
bilinear_fl_impl(
buffer, res, width, height, 4, u, v, InterpWrapMode::Repeat, InterpWrapMode::Repeat);
return res;
}
uchar4 interpolate_cubic_bspline_byte(const uchar *buffer, int width, int height, float u, float v)
{
uchar4 res;
bicubic_interpolation<uchar, eCubicFilter::BSpline>(
buffer, res, width, height, 4, u, v, InterpWrapMode::Extend, InterpWrapMode::Extend);
return res;
}
float4 interpolate_cubic_bspline_fl(const float *buffer, int width, int height, float u, float v)
{
float4 res;
bicubic_interpolation<float, eCubicFilter::BSpline>(
buffer, res, width, height, 4, u, v, InterpWrapMode::Extend, InterpWrapMode::Extend);
return res;
}
void interpolate_cubic_bspline_fl(
const float *buffer, float *output, int width, int height, int components, float u, float v)
{
bicubic_interpolation<float, eCubicFilter::BSpline>(buffer,
output,
width,
height,
components,
u,
v,
InterpWrapMode::Extend,
InterpWrapMode::Extend);
}
void interpolate_cubic_bspline_wrapmode_fl(const float *buffer,
float *output,
int width,
int height,
int components,
float u,
float v,
math::InterpWrapMode wrap_u,
math::InterpWrapMode wrap_v)
{
bicubic_interpolation<float, eCubicFilter::BSpline>(
buffer, output, width, height, components, u, v, wrap_u, wrap_v);
}
uchar4 interpolate_cubic_mitchell_byte(
const uchar *buffer, int width, int height, float u, float v)
{
uchar4 res;
bicubic_interpolation<uchar, eCubicFilter::Mitchell>(
buffer, res, width, height, 4, u, v, InterpWrapMode::Extend, InterpWrapMode::Extend);
return res;
}
float4 interpolate_cubic_mitchell_fl(const float *buffer, int width, int height, float u, float v)
{
float4 res;
bicubic_interpolation<float, eCubicFilter::Mitchell>(
buffer, res, width, height, 4, u, v, InterpWrapMode::Extend, InterpWrapMode::Extend);
return res;
}
void interpolate_cubic_mitchell_fl(
const float *buffer, float *output, int width, int height, int components, float u, float v)
{
bicubic_interpolation<float, eCubicFilter::Mitchell>(buffer,
output,
width,
height,
components,
u,
v,
InterpWrapMode::Extend,
InterpWrapMode::Extend);
}
} // namespace blender::math
/**************************************************************************
* Filtering method based on
* "Creating raster omnimax images from multiple perspective views
* using the elliptical weighted average filter"
* by Ned Greene and Paul S. Heckbert (1986)
***************************************************************************/
/* Table of `(exp(ar) - exp(a)) / (1 - exp(a))` for `r` in range [0, 1] and `a = -2`.
* used instead of actual gaussian,
* otherwise at high texture magnifications circular artifacts are visible. */
#define EWA_MAXIDX 255
const float EWA_WTS[EWA_MAXIDX + 1] = {
1.0f, 0.990965f, 0.982f, 0.973105f, 0.96428f, 0.955524f, 0.946836f,
0.938216f, 0.929664f, 0.921178f, 0.912759f, 0.904405f, 0.896117f, 0.887893f,
0.879734f, 0.871638f, 0.863605f, 0.855636f, 0.847728f, 0.839883f, 0.832098f,
0.824375f, 0.816712f, 0.809108f, 0.801564f, 0.794079f, 0.786653f, 0.779284f,
0.771974f, 0.76472f, 0.757523f, 0.750382f, 0.743297f, 0.736267f, 0.729292f,
0.722372f, 0.715505f, 0.708693f, 0.701933f, 0.695227f, 0.688572f, 0.68197f,
0.67542f, 0.66892f, 0.662471f, 0.656073f, 0.649725f, 0.643426f, 0.637176f,
0.630976f, 0.624824f, 0.618719f, 0.612663f, 0.606654f, 0.600691f, 0.594776f,
0.588906f, 0.583083f, 0.577305f, 0.571572f, 0.565883f, 0.56024f, 0.55464f,
0.549084f, 0.543572f, 0.538102f, 0.532676f, 0.527291f, 0.521949f, 0.516649f,
0.511389f, 0.506171f, 0.500994f, 0.495857f, 0.490761f, 0.485704f, 0.480687f,
0.475709f, 0.470769f, 0.465869f, 0.461006f, 0.456182f, 0.451395f, 0.446646f,
0.441934f, 0.437258f, 0.432619f, 0.428017f, 0.42345f, 0.418919f, 0.414424f,
0.409963f, 0.405538f, 0.401147f, 0.39679f, 0.392467f, 0.388178f, 0.383923f,
0.379701f, 0.375511f, 0.371355f, 0.367231f, 0.363139f, 0.359079f, 0.355051f,
0.351055f, 0.347089f, 0.343155f, 0.339251f, 0.335378f, 0.331535f, 0.327722f,
0.323939f, 0.320186f, 0.316461f, 0.312766f, 0.3091f, 0.305462f, 0.301853f,
0.298272f, 0.294719f, 0.291194f, 0.287696f, 0.284226f, 0.280782f, 0.277366f,
0.273976f, 0.270613f, 0.267276f, 0.263965f, 0.26068f, 0.257421f, 0.254187f,
0.250979f, 0.247795f, 0.244636f, 0.241502f, 0.238393f, 0.235308f, 0.232246f,
0.229209f, 0.226196f, 0.223206f, 0.220239f, 0.217296f, 0.214375f, 0.211478f,
0.208603f, 0.20575f, 0.20292f, 0.200112f, 0.197326f, 0.194562f, 0.191819f,
0.189097f, 0.186397f, 0.183718f, 0.18106f, 0.178423f, 0.175806f, 0.17321f,
0.170634f, 0.168078f, 0.165542f, 0.163026f, 0.16053f, 0.158053f, 0.155595f,
0.153157f, 0.150738f, 0.148337f, 0.145955f, 0.143592f, 0.141248f, 0.138921f,
0.136613f, 0.134323f, 0.132051f, 0.129797f, 0.12756f, 0.125341f, 0.123139f,
0.120954f, 0.118786f, 0.116635f, 0.114501f, 0.112384f, 0.110283f, 0.108199f,
0.106131f, 0.104079f, 0.102043f, 0.100023f, 0.0980186f, 0.09603f, 0.094057f,
0.0920994f, 0.0901571f, 0.08823f, 0.0863179f, 0.0844208f, 0.0825384f, 0.0806708f,
0.0788178f, 0.0769792f, 0.0751551f, 0.0733451f, 0.0715493f, 0.0697676f, 0.0679997f,
0.0662457f, 0.0645054f, 0.0627786f, 0.0610654f, 0.0593655f, 0.0576789f, 0.0560055f,
0.0543452f, 0.0526979f, 0.0510634f, 0.0494416f, 0.0478326f, 0.0462361f, 0.0446521f,
0.0430805f, 0.0415211f, 0.039974f, 0.0384389f, 0.0369158f, 0.0354046f, 0.0339052f,
0.0324175f, 0.0309415f, 0.029477f, 0.0280239f, 0.0265822f, 0.0251517f, 0.0237324f,
0.0223242f, 0.020927f, 0.0195408f, 0.0181653f, 0.0168006f, 0.0154466f, 0.0141031f,
0.0127701f, 0.0114476f, 0.0101354f, 0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
0.00372644f, 0.00247454f, 0.00123242f, 0.0f,
};
static void radangle2imp(float a2, float b2, float th, float *A, float *B, float *C, float *F)
{
float ct2 = cosf(th);
const float st2 = 1.0f - ct2 * ct2; /* <- sin(th)^2 */
ct2 *= ct2;
*A = a2 * st2 + b2 * ct2;
*B = (b2 - a2) * sinf(2.0f * th);
*C = a2 * ct2 + b2 * st2;
*F = a2 * b2;
}
void BLI_ewa_imp2radangle(
float A, float B, float C, float F, float *a, float *b, float *th, float *ecc)
{
/* NOTE: all tests here are done to make sure possible overflows are hopefully minimized. */
if (F <= 1e-5f) { /* use arbitrary major radius, zero minor, infinite eccentricity */
*a = sqrtf(A > C ? A : C);
*b = 0.0f;
*ecc = 1e10f;
*th = 0.5f * (atan2f(B, A - C) + float(M_PI));
}
else {
const float AmC = A - C, ApC = A + C, F2 = F * 2.0f;
const float r = sqrtf(AmC * AmC + B * B);
float d = ApC - r;
*a = (d <= 0.0f) ? sqrtf(A > C ? A : C) : sqrtf(F2 / d);
d = ApC + r;
if (d <= 0.0f) {
*b = 0.0f;
*ecc = 1e10f;
}
else {
*b = sqrtf(F2 / d);
*ecc = *a / *b;
}
/* Increase theta by `0.5 * pi` (angle of major axis). */
*th = 0.5f * (atan2f(B, AmC) + float(M_PI));
}
}
void BLI_ewa_filter(const int width,
const int height,
const bool intpol,
const bool use_alpha,
const float uv[2],
const float du[2],
const float dv[2],
ewa_filter_read_pixel_cb read_pixel_cb,
void *userdata,
float result[4])
{
/* Scaling `dxt` / `dyt` by full resolution can cause overflow because of huge A/B/C and esp.
* F values, scaling by aspect ratio alone does the opposite, so try something in between
* instead. */
const float ff2 = float(width), ff = sqrtf(ff2), q = float(height) / ff;
const float Ux = du[0] * ff, Vx = du[1] * q, Uy = dv[0] * ff, Vy = dv[1] * q;
float A = Vx * Vx + Vy * Vy;
float B = -2.0f * (Ux * Vx + Uy * Vy);
float C = Ux * Ux + Uy * Uy;
float F = A * C - B * B * 0.25f;
float a, b, th, ecc, a2, b2, ue, ve, U0, V0, DDQ, U, ac1, ac2, BU, d;
int u, v, u1, u2, v1, v2;
/* The so-called 'high' quality EWA method simply adds a constant of 1 to both A & C,
* so the ellipse always covers at least some texels. But since the filter is now always larger,
* it also means that everywhere else it's also more blurry then ideally should be the case.
* So instead here the ellipse radii are modified instead whenever either is too low.
* Use a different radius based on interpolation switch,
* just enough to anti-alias when interpolation is off,
* and slightly larger to make result a bit smoother than bilinear interpolation when
* interpolation is on (minimum values: `const float rmin = intpol ? 1.0f : 0.5f;`) */
const float rmin = (intpol ? 1.5625f : 0.765625f) / ff2;
BLI_ewa_imp2radangle(A, B, C, F, &a, &b, &th, &ecc);
if ((b2 = b * b) < rmin) {
if ((a2 = a * a) < rmin) {
UNUSED_VARS(a2, b2);
B = 0.0f;
A = C = rmin;
F = A * C;
}
else {
b2 = rmin;
radangle2imp(a2, b2, th, &A, &B, &C, &F);
}
}
ue = ff * sqrtf(C);
ve = ff * sqrtf(A);
d = float(EWA_MAXIDX + 1) / (F * ff2);
A *= d;
B *= d;
C *= d;
U0 = uv[0] * float(width);
V0 = uv[1] * float(height);
u1 = int(floorf(U0 - ue));
u2 = int(ceilf(U0 + ue));
v1 = int(floorf(V0 - ve));
v2 = int(ceilf(V0 + ve));
/* sane clamping to avoid unnecessarily huge loops */
/* NOTE: if eccentricity gets clamped (see above),
* the ue/ve limits can also be lowered accordingly
*/
if (U0 - float(u1) > EWA_MAXIDX) {
u1 = int(U0) - EWA_MAXIDX;
}
if (float(u2) - U0 > EWA_MAXIDX) {
u2 = int(U0) + EWA_MAXIDX;
}
if (V0 - float(v1) > EWA_MAXIDX) {
v1 = int(V0) - EWA_MAXIDX;
}
if (float(v2) - V0 > EWA_MAXIDX) {
v2 = int(V0) + EWA_MAXIDX;
}
/* Early output check for cases the whole region is outside of the buffer. */
if ((u2 < 0 || u1 >= width) || (v2 < 0 || v1 >= height)) {
zero_v4(result);
return;
}
U0 -= 0.5f;
V0 -= 0.5f;
DDQ = 2.0f * A;
U = float(u1) - U0;
ac1 = A * (2.0f * U + 1.0f);
ac2 = A * U * U;
BU = B * U;
d = 0.0f;
zero_v4(result);
for (v = v1; v <= v2; v++) {
const float V = float(v) - V0;
float DQ = ac1 + B * V;
float Q = (C * V + BU) * V + ac2;
for (u = u1; u <= u2; u++) {
if (Q < float(EWA_MAXIDX + 1)) {
float tc[4];
const float wt = EWA_WTS[(Q < 0.0f) ? 0 : uint(Q)];
read_pixel_cb(userdata, u, v, tc);
madd_v3_v3fl(result, tc, wt);
result[3] += use_alpha ? tc[3] * wt : 0.0f;
d += wt;
}
Q += DQ;
DQ += DDQ;
}
}
/* `d` should hopefully never be zero anymore. */
d = 1.0f / d;
mul_v3_fl(result, d);
/* Clipping can be ignored if alpha used, `texr->trgba[3]` already includes filtered edge. */
result[3] = use_alpha ? result[3] * d : 1.0f;
}