Cleanup: Remove pre-SSE4 fallbacks in BLI pixel interpolation functions
Commit 8b9743eb40 already made Blender be compiled with SSE4.2 flags
on x64 architecture, which kicked in the SSE4 code paths in
BLI_math_interp functions.
Which made them faster, e.g. in VSE on Windows/Ryzen5950X, scaling
up an image to 4K resolution:
- Bilinear 5.8ms -> 5.3ms
- Cubic Mitchell 16.3ms -> 15.7ms
This change removes the now-unneeded SSE pre-SSE4 code paths for
_mm_floor_ps, _mm_min_epi32 and _mm_max_epi32 emulation.
Additionally, including BLI_simd.h on SSE4 platform now includes
the necessary SSE4 intrinsics header.
Pull Request: https://projects.blender.org/blender/blender/pulls/120583
This commit is contained in:
committed by
Aras Pranckevicius
parent
904d51d6cb
commit
e2e6b977a6
@@ -33,7 +33,12 @@
|
||||
# define BLI_HAVE_SSE2 0
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(WITH_SSE2NEON))
|
||||
#if defined(__ARM_NEON) && defined(WITH_SSE2NEON)
|
||||
/* SSE4 is emulated via sse2neon. */
|
||||
# define BLI_HAVE_SSE4 1
|
||||
#elif defined(__SSE4_2__)
|
||||
/* Native SSE4.2. */
|
||||
# include <nmmintrin.h>
|
||||
# define BLI_HAVE_SSE4 1
|
||||
#else
|
||||
# define BLI_HAVE_SSE4 0
|
||||
|
||||
@@ -50,55 +50,13 @@ template<enum eCubicFilter filter> static float4 cubic_filter_coefficients(float
|
||||
}
|
||||
}
|
||||
|
||||
#if BLI_HAVE_SSE2
|
||||
# if defined(__SSE4_1__)
|
||||
# include <smmintrin.h> /* _mm_floor_ps */
|
||||
# endif
|
||||
|
||||
BLI_INLINE __m128 floor_simd(__m128 v)
|
||||
{
|
||||
# if BLI_HAVE_SSE4
|
||||
__m128 v_floor = _mm_floor_ps(v);
|
||||
# else
|
||||
/* Truncate, for negative inputs this will round towards zero. Then compare
|
||||
* with input, and subtract 1 for the inputs that were negative. */
|
||||
__m128 v_trunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
|
||||
__m128 v_neg = _mm_cmplt_ps(v, v_trunc);
|
||||
__m128 v_floor = _mm_sub_ps(v_trunc, _mm_and_ps(v_neg, _mm_set1_ps(1.0f)));
|
||||
# endif
|
||||
return v_floor;
|
||||
}
|
||||
|
||||
BLI_INLINE __m128i min_i_simd(__m128i a, __m128i b)
|
||||
{
|
||||
# if BLI_HAVE_SSE4
|
||||
return _mm_min_epi32(a, b);
|
||||
# else
|
||||
__m128i cmp = _mm_cmplt_epi32(a, b);
|
||||
a = _mm_and_si128(cmp, a);
|
||||
b = _mm_andnot_si128(cmp, b);
|
||||
return _mm_or_si128(a, b);
|
||||
# endif
|
||||
}
|
||||
|
||||
BLI_INLINE __m128i max_i_simd(__m128i a, __m128i b)
|
||||
{
|
||||
# if BLI_HAVE_SSE4
|
||||
return _mm_max_epi32(a, b);
|
||||
# else
|
||||
__m128i cmp = _mm_cmplt_epi32(b, a);
|
||||
a = _mm_and_si128(cmp, a);
|
||||
b = _mm_andnot_si128(cmp, b);
|
||||
return _mm_or_si128(a, b);
|
||||
# endif
|
||||
}
|
||||
|
||||
#if BLI_HAVE_SSE4
|
||||
template<eCubicFilter filter>
|
||||
BLI_INLINE void bicubic_interpolation_uchar_simd(
|
||||
const uchar *src_buffer, uchar *output, int width, int height, float u, float v)
|
||||
{
|
||||
__m128 uv = _mm_set_ps(0, 0, v, u);
|
||||
__m128 uv_floor = floor_simd(uv);
|
||||
__m128 uv_floor = _mm_floor_ps(uv);
|
||||
__m128i i_uv = _mm_cvttps_epi32(uv_floor);
|
||||
|
||||
/* Sample area entirely outside image?
|
||||
@@ -153,7 +111,7 @@ BLI_INLINE void bicubic_interpolation_uchar_simd(
|
||||
__m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
|
||||
_mm_store_ss((float *)output, _mm_castsi128_ps(rgba8));
|
||||
}
|
||||
#endif /* BLI_HAVE_SSE2 */
|
||||
#endif /* BLI_HAVE_SSE4 */
|
||||
|
||||
template<typename T, eCubicFilter filter>
|
||||
static void bicubic_interpolation(
|
||||
@@ -161,7 +119,7 @@ static void bicubic_interpolation(
|
||||
{
|
||||
BLI_assert(src_buffer && output);
|
||||
|
||||
#if BLI_HAVE_SSE2
|
||||
#if BLI_HAVE_SSE4
|
||||
if constexpr (std::is_same_v<T, uchar>) {
|
||||
if (components == 4) {
|
||||
bicubic_interpolation_uchar_simd<filter>(src_buffer, output, width, height, u, v);
|
||||
@@ -375,9 +333,9 @@ BLI_INLINE uchar4 bilinear_byte_impl(const uchar *buffer, int width, int height,
|
||||
BLI_assert(buffer);
|
||||
uchar4 res;
|
||||
|
||||
#if BLI_HAVE_SSE2
|
||||
#if BLI_HAVE_SSE4
|
||||
__m128 uvuv = _mm_set_ps(v, u, v, u);
|
||||
__m128 uvuv_floor = floor_simd(uvuv);
|
||||
__m128 uvuv_floor = _mm_floor_ps(uvuv);
|
||||
|
||||
/* x1, y1, x2, y2 */
|
||||
__m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
|
||||
@@ -407,8 +365,8 @@ BLI_INLINE uchar4 bilinear_byte_impl(const uchar *buffer, int width, int height,
|
||||
}
|
||||
else {
|
||||
/* Clamp samples to image edges. */
|
||||
__m128i xy12_clamped = max_i_simd(xy12, _mm_setzero_si128());
|
||||
xy12_clamped = min_i_simd(xy12_clamped, size_minus_1);
|
||||
__m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
|
||||
xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
|
||||
x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user