Commit 8b9743eb40 already made Blender compile with SSE4.2 flags
on the x64 architecture, which enabled the SSE4 code paths in the
BLI_math_interp functions.
This made them faster, e.g. in the VSE on Windows/Ryzen 5950X, when
scaling up an image to 4K resolution:
- Bilinear 5.8ms -> 5.3ms
- Cubic Mitchell 16.3ms -> 15.7ms
This change removes the now-unneeded pre-SSE4 code paths that emulated
_mm_floor_ps, _mm_min_epi32 and _mm_max_epi32.
Additionally, including BLI_simd.h on an SSE4 platform now also includes
the necessary SSE4 intrinsics header.
Pull Request: https://projects.blender.org/blender/blender/pulls/120583
46 lines
1.1 KiB
C
46 lines
1.1 KiB
C
/* SPDX-FileCopyrightText: 2023 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

/** \file
 * \ingroup bli
 *
 * SIMD instruction support.
 */

/* SSE2 availability.
 *
 * Defines `BLI_HAVE_SSE2` as 1 when SSE2 intrinsics can be used, either through the
 * `sse2neon.h` emulation header (ARM Neon builds with `WITH_SSE2NEON` defined) or natively
 * (`__SSE2__` defined), and as 0 otherwise. In both supported cases the matching intrinsics
 * header is included here, so users of this header do not need to include it themselves.
 *
 * TODO: Re-enable the commented-out MSVC ARM64 check below once blenlib is converted to C++. */
#if (defined(__ARM_NEON) /* || (defined(_M_ARM64) && defined(_MSC_VER))*/) && \
    defined(WITH_SSE2NEON)
/* SSE/SSE2 emulation on ARM Neon. Match SSE precision.
 * Each precision option below defaults to 1, but is left alone when it has already been
 * defined before this header is included. They must be set before `sse2neon.h` is included. */
#  if !defined(SSE2NEON_PRECISE_MINMAX)
#    define SSE2NEON_PRECISE_MINMAX 1
#  endif
#  if !defined(SSE2NEON_PRECISE_DIV)
#    define SSE2NEON_PRECISE_DIV 1
#  endif
#  if !defined(SSE2NEON_PRECISE_SQRT)
#    define SSE2NEON_PRECISE_SQRT 1
#  endif
#  include <sse2neon.h>
#  define BLI_HAVE_SSE2 1
#elif defined(__SSE2__)
/* Native SSE2 on Intel/AMD. */
#  include <emmintrin.h>
#  define BLI_HAVE_SSE2 1
#else
/* No SSE2, neither native nor emulated. */
#  define BLI_HAVE_SSE2 0
#endif

/* SSE4 availability.
 *
 * Defines `BLI_HAVE_SSE4` as 1 when SSE4 intrinsics can be used, and as 0 otherwise.
 * On native SSE4.2 builds the SSE4 intrinsics header is included here. On ARM Neon builds
 * with `WITH_SSE2NEON` no extra include is done in this section: the `sse2neon.h` include
 * from the SSE2 detection earlier in this header is relied upon. */
#if defined(__ARM_NEON) && defined(WITH_SSE2NEON)
/* SSE4 is emulated via sse2neon. */
#  define BLI_HAVE_SSE4 1
#elif defined(__SSE4_2__)
/* Native SSE4.2. */
#  include <nmmintrin.h>
#  define BLI_HAVE_SSE4 1
#else
/* No SSE4, neither native nor emulated. */
#  define BLI_HAVE_SSE4 0
#endif