/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
 * SPDX-FileCopyrightText: 2014-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */
|
2013-12-27 21:30:03 +01:00
|
|
|
|
2024-12-26 17:53:56 +01:00
|
|
|
#pragma once
|
2014-06-13 21:13:18 +02:00
|
|
|
|
2024-12-26 17:53:59 +01:00
|
|
|
#include <cstdint>
|
2021-02-14 15:34:23 +01:00
|
|
|
#include <limits>
|
2017-08-02 02:09:08 +02:00
|
|
|
|
2021-10-24 14:19:19 +02:00
|
|
|
#include "util/defines.h"
|
2017-08-02 02:09:08 +02:00
|
|
|
|
|
|
|
|
/* SSE Intrinsics includes
 *
 * We assume __KERNEL_SSEX__ flags to have been defined at this point.
 *
 * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, better only include that */

#if defined(FREE_WINDOWS64)
#  include "util/windows.h"
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
#  include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
#  include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
/* Request SSE-compatible NaN handling from sse2neon's min/max emulation. */
#  define SSE2NEON_PRECISE_MINMAX 1
#  include <sse2neon.h>
#endif
|
|
|
|
|
|
|
|
|
|
/* Floating Point Control, for Embree.
 *
 * SIMD_SET_FLUSH_TO_ZERO enables flush-to-zero and denormals-are-zero on the
 * current thread's FPU. On platforms without the relevant intrinsics it
 * expands to nothing. */
#if defined(__x86_64__) || defined(_M_X64)
#  define SIMD_SET_FLUSH_TO_ZERO \
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#elif defined(__aarch64__) || defined(_M_ARM64)
/* The get/set denormals to zero was implemented in sse2neon v1.5.0.
 * Keep the compatibility code until the minimum library version is increased. */
#  if defined(_MM_SET_FLUSH_ZERO_MODE)
#    define SIMD_SET_FLUSH_TO_ZERO \
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#  elif !defined(_M_ARM64)
/* GCC/Clang on AArch64: access the FPCR system register via inline assembly.
 * Bit 24 of FPCR is the flush-to-zero control. */
#    define _MM_FLUSH_ZERO_ON 24
#    define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
#    define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
#  else
/* MSVC ARM64: use the status-register intrinsics. 0x5A20 is the system
 * register encoding for FPCR (matching __set_fpcr below).
 * FIX: __get_fpcr previously expanded to `_ReadStatusReg(__fpcr)`, which
 * passed the destination variable as the register number and discarded the
 * read value, leaving the output uninitialized. It now assigns the value. */
#    define _MM_FLUSH_ZERO_ON 24
#    define __get_fpcr(__fpcr) ((__fpcr) = _ReadStatusReg(0x5A20))
#    define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
#  endif
#else
#  define SIMD_SET_FLUSH_TO_ZERO
#endif
|
2019-03-06 13:27:29 +01:00
|
|
|
|
2013-12-27 21:30:03 +01:00
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
/* Data structures used by SSE classes. */
#ifdef __KERNEL_SSE2__

/* Table of 16 lane masks, indexed by a 4-bit lane selection; defined in a
 * separate translation unit. */
extern const __m128 _mm_lookupmask_ps[16];
|
|
|
|
|
|
|
|
|
|
static struct TrueTy {
|
|
|
|
|
__forceinline operator bool() const
|
|
|
|
|
{
|
|
|
|
|
return true;
|
|
|
|
|
}
|
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
new shadow catcher, revamped sampling settings, subsurface scattering anisotropy,
new GPU volume sampling, improved PMJ sampling pattern, and more.
Some features have also been removed or changed, breaking backwards compatibility.
Including the removal of the OpenCL backend, for which alternatives are under
development.
Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles
Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)
For the full commit history, see the cycles-x branch. This squashes together
all the changes since intermediate changes would often fail building or tests.
Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
2021-09-20 17:59:20 +02:00
|
|
|
} True ccl_attr_maybe_unused;
|
2013-12-27 21:30:03 +01:00
|
|
|
|
2014-06-13 21:13:18 +02:00
|
|
|
static struct FalseTy {
|
|
|
|
|
__forceinline operator bool() const
|
|
|
|
|
{
|
|
|
|
|
return false;
|
|
|
|
|
}
|
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
new shadow catcher, revamped sampling settings, subsurface scattering anisotropy,
new GPU volume sampling, improved PMJ sampling pattern, and more.
Some features have also been removed or changed, breaking backwards compatibility.
Including the removal of the OpenCL backend, for which alternatives are under
development.
Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles
Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)
For the full commit history, see the cycles-x branch. This squashes together
all the changes since intermediate changes would often fail building or tests.
Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
2021-09-20 17:59:20 +02:00
|
|
|
} False ccl_attr_maybe_unused;
|
2013-12-27 21:30:03 +01:00
|
|
|
|
2020-04-01 14:48:01 +02:00
|
|
|
static struct ZeroTy {
|
|
|
|
|
__forceinline operator float() const
|
|
|
|
|
{
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
__forceinline operator int() const
|
|
|
|
|
{
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
new shadow catcher, revamped sampling settings, subsurface scattering anisotropy,
new GPU volume sampling, improved PMJ sampling pattern, and more.
Some features have also been removed or changed, breaking backwards compatibility.
Including the removal of the OpenCL backend, for which alternatives are under
development.
Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles
Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)
For the full commit history, see the cycles-x branch. This squashes together
all the changes since intermediate changes would often fail building or tests.
Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
2021-09-20 17:59:20 +02:00
|
|
|
} zero ccl_attr_maybe_unused;
|
2020-04-01 14:48:01 +02:00
|
|
|
|
|
|
|
|
static struct OneTy {
|
|
|
|
|
__forceinline operator float() const
|
|
|
|
|
{
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
__forceinline operator int() const
|
|
|
|
|
{
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
new shadow catcher, revamped sampling settings, subsurface scattering anisotropy,
new GPU volume sampling, improved PMJ sampling pattern, and more.
Some features have also been removed or changed, breaking backwards compatibility.
Including the removal of the OpenCL backend, for which alternatives are under
development.
Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles
Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)
For the full commit history, see the cycles-x branch. This squashes together
all the changes since intermediate changes would often fail building or tests.
Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
2021-09-20 17:59:20 +02:00
|
|
|
} one ccl_attr_maybe_unused;
|
2020-04-01 14:48:01 +02:00
|
|
|
|
2014-06-13 21:13:18 +02:00
|
|
|
static struct NegInfTy {
|
|
|
|
|
__forceinline operator float() const
|
|
|
|
|
{
|
|
|
|
|
return -std::numeric_limits<float>::infinity();
|
|
|
|
|
}
|
|
|
|
|
__forceinline operator int() const
|
|
|
|
|
{
|
|
|
|
|
return std::numeric_limits<int>::min();
|
|
|
|
|
}
|
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
new shadow catcher, revamped sampling settings, subsurface scattering anisotropy,
new GPU volume sampling, improved PMJ sampling pattern, and more.
Some features have also been removed or changed, breaking backwards compatibility.
Including the removal of the OpenCL backend, for which alternatives are under
development.
Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles
Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)
For the full commit history, see the cycles-x branch. This squashes together
all the changes since intermediate changes would often fail building or tests.
Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
2021-09-20 17:59:20 +02:00
|
|
|
} neg_inf ccl_attr_maybe_unused;
|
2013-12-27 21:30:03 +01:00
|
|
|
|
2014-06-13 21:13:18 +02:00
|
|
|
static struct PosInfTy {
|
|
|
|
|
__forceinline operator float() const
|
|
|
|
|
{
|
|
|
|
|
return std::numeric_limits<float>::infinity();
|
|
|
|
|
}
|
|
|
|
|
__forceinline operator int() const
|
|
|
|
|
{
|
|
|
|
|
return std::numeric_limits<int>::max();
|
|
|
|
|
}
|
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
new shadow catcher, revamped sampling settings, subsurface scattering anisotropy,
new GPU volume sampling, improved PMJ sampling pattern, and more.
Some features have also been removed or changed, breaking backwards compatibility.
Including the removal of the OpenCL backend, for which alternatives are under
development.
Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles
Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)
For the full commit history, see the cycles-x branch. This squashes together
all the changes since intermediate changes would often fail building or tests.
Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
2021-09-20 17:59:20 +02:00
|
|
|
} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused;
|
2014-06-13 21:13:18 +02:00
|
|
|
|
2020-04-01 14:48:01 +02:00
|
|
|
/* Empty tag type for the named constant `step`; carries no value itself and
 * is only used for overload selection. */
static struct StepTy {
} step ccl_attr_maybe_unused;
|
2020-04-01 14:48:01 +02:00
|
|
|
|
2022-04-12 19:36:55 +01:00
|
|
|
#endif
|
2023-04-27 11:05:22 +02:00
|
|
|
#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(_MM_SET_FLUSH_ZERO_MODE)
/* Set bit `flag` in the AArch64 FPCR register (e.g. _MM_FLUSH_ZERO_ON).
 * Returns 1 when re-reading the register confirms the bit took effect,
 * 0 otherwise. */
__forceinline int set_fz(const uint32_t flag)
{
  uint64_t old_fpcr;
  uint64_t new_fpcr;
  __get_fpcr(old_fpcr);
  new_fpcr = old_fpcr | (1ULL << flag);
  __set_fpcr(new_fpcr);
  /* Read back to verify the hardware accepted the new control value. */
  __get_fpcr(old_fpcr);
  return old_fpcr == new_fpcr;
}

/* Return 1 when bit `flag` of the FPCR register is currently set, else 0. */
__forceinline int get_fz(const uint32_t flag)
{
  uint64_t cur_fpcr;
  __get_fpcr(cur_fpcr);
  return (cur_fpcr & (1ULL << flag)) > 0 ? 1 : 0;
}
#endif
|
2014-06-13 21:13:18 +02:00
|
|
|
|
2021-02-14 15:01:26 +01:00
|
|
|
/* Utilities used by Neon */
|
|
|
|
|
#if defined(__KERNEL_NEON__)
|
2025-01-01 18:15:54 +01:00
|
|
|
template<class type, const int i0, const int i1, const int i2, const int i3>
|
|
|
|
|
type shuffle_neon(const type &a)
|
2021-02-14 15:01:26 +01:00
|
|
|
{
|
|
|
|
|
if (i0 == i1 && i0 == i2 && i0 == i3) {
|
2021-04-20 14:00:05 +02:00
|
|
|
return type(vdupq_laneq_s32(int32x4_t(a), i0));
|
2021-02-14 15:01:26 +01:00
|
|
|
}
|
|
|
|
|
static const uint8_t tbl[16] = {(i0 * 4) + 0,
|
|
|
|
|
(i0 * 4) + 1,
|
|
|
|
|
(i0 * 4) + 2,
|
|
|
|
|
(i0 * 4) + 3,
|
|
|
|
|
(i1 * 4) + 0,
|
|
|
|
|
(i1 * 4) + 1,
|
|
|
|
|
(i1 * 4) + 2,
|
|
|
|
|
(i1 * 4) + 3,
|
|
|
|
|
(i2 * 4) + 0,
|
|
|
|
|
(i2 * 4) + 1,
|
|
|
|
|
(i2 * 4) + 2,
|
|
|
|
|
(i2 * 4) + 3,
|
|
|
|
|
(i3 * 4) + 0,
|
|
|
|
|
(i3 * 4) + 1,
|
|
|
|
|
(i3 * 4) + 2,
|
|
|
|
|
(i3 * 4) + 3};
|
|
|
|
|
|
2021-04-20 14:00:05 +02:00
|
|
|
return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));
|
2021-02-14 15:01:26 +01:00
|
|
|
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
template<class type, const int i0, const int i1, const int i2, const int i3>
|
2021-02-14 15:01:26 +01:00
|
|
|
type shuffle_neon(const type &a, const type &b)
|
|
|
|
|
{
|
|
|
|
|
if (&a == &b) {
|
|
|
|
|
static const uint8_t tbl[16] = {(i0 * 4) + 0,
|
|
|
|
|
(i0 * 4) + 1,
|
|
|
|
|
(i0 * 4) + 2,
|
|
|
|
|
(i0 * 4) + 3,
|
|
|
|
|
(i1 * 4) + 0,
|
|
|
|
|
(i1 * 4) + 1,
|
|
|
|
|
(i1 * 4) + 2,
|
|
|
|
|
(i1 * 4) + 3,
|
|
|
|
|
(i2 * 4) + 0,
|
|
|
|
|
(i2 * 4) + 1,
|
|
|
|
|
(i2 * 4) + 2,
|
|
|
|
|
(i2 * 4) + 3,
|
|
|
|
|
(i3 * 4) + 0,
|
|
|
|
|
(i3 * 4) + 1,
|
|
|
|
|
(i3 * 4) + 2,
|
|
|
|
|
(i3 * 4) + 3};
|
|
|
|
|
|
2021-04-20 14:00:05 +02:00
|
|
|
return type(vqtbl1q_s8(int8x16_t(b), *(uint8x16_t *)tbl));
|
2021-02-14 15:01:26 +01:00
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
|
|
|
|
|
static const uint8_t tbl[16] = {(i0 * 4) + 0,
|
|
|
|
|
(i0 * 4) + 1,
|
|
|
|
|
(i0 * 4) + 2,
|
|
|
|
|
(i0 * 4) + 3,
|
|
|
|
|
(i1 * 4) + 0,
|
|
|
|
|
(i1 * 4) + 1,
|
|
|
|
|
(i1 * 4) + 2,
|
|
|
|
|
(i1 * 4) + 3,
|
|
|
|
|
(i2 * 4) + 0 + 16,
|
|
|
|
|
(i2 * 4) + 1 + 16,
|
|
|
|
|
(i2 * 4) + 2 + 16,
|
|
|
|
|
(i2 * 4) + 3 + 16,
|
|
|
|
|
(i3 * 4) + 0 + 16,
|
|
|
|
|
(i3 * 4) + 1 + 16,
|
|
|
|
|
(i3 * 4) + 2 + 16,
|
|
|
|
|
(i3 * 4) + 3 + 16};
|
|
|
|
|
|
2024-06-06 09:43:48 +10:00
|
|
|
/* NOTE: This cannot all be put in a single line due to how MSVC ARM64
|
|
|
|
|
* implements the function calls as several layers of macros. */
|
2024-03-06 15:44:46 +01:00
|
|
|
int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
|
|
|
|
|
uint8x16_t idx = *(uint8x16_t *)tbl;
|
|
|
|
|
return type(vqtbl2q_s8(t, idx));
|
2021-02-14 15:01:26 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif /* __KERNEL_NEON */
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
/* Intrinsics Functions
 *
 * For fast bit operations. */

/* Some GCC versions only provide the double-underscore spellings of the BMI
 * trailing-zero-count intrinsics; map the standard names onto them. */
#if defined(__BMI__) && defined(__GNUC__)
#  ifndef _tzcnt_u32
#    define _tzcnt_u32 __tzcnt_u32
#  endif
#  ifndef _tzcnt_u64
#    define _tzcnt_u64 __tzcnt_u64
#  endif
#endif

#if defined(__LZCNT__)
#  define _lzcnt_u32 __lzcnt32
#  define _lzcnt_u64 __lzcnt64
#endif
|
2014-06-13 21:13:18 +02:00
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
/* Intrinsic functions on Windows. */

/* Index of the lowest set bit of `v` (result is undefined when `v` is 0). */
__forceinline uint32_t __bsf(const uint32_t v)
{
#  if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#  else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#  endif
}

/* Index of the highest set bit of `v` (result is undefined when `v` is 0). */
__forceinline uint32_t __bsr(const uint32_t v)
{
  unsigned long r = 0;
  _BitScanReverse(&r, v);
  return r;
}

/* Return `v` with bit `i` complemented (toggled). */
__forceinline uint32_t __btc(const uint32_t v, const uint32_t i)
{
  long r = v;
  _bittestandcomplement(&r, i);
  return r;
}

/* Count of trailing zero bits, i.e. the index of the lowest set bit. */
__forceinline uint32_t bitscan(const uint32_t v)
{
#  if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#  else
  return __bsf(v);
#  endif
}

#  if defined(__KERNEL_64_BIT__)

__forceinline uint64_t __bsf(const uint64_t v)
{
#    if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
#    else
  unsigned long r = 0;
  _BitScanForward64(&r, v);
  return r;
#    endif
}

__forceinline uint64_t __bsr(const uint64_t v)
{
  unsigned long r = 0;
  _BitScanReverse64(&r, v);
  return r;
}

__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
  uint64_t r = v;
  _bittestandcomplement64((__int64 *)&r, i);
  return r;
}

__forceinline uint64_t bitscan(const uint64_t v)
{
#    if defined(__KERNEL_AVX2__)
#      if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#      else
  return _tzcnt_u32(v);
#      endif
#    else
  return __bsf(v);
#    endif
}

#  endif /* __KERNEL_64_BIT__ */
2021-02-14 15:34:23 +01:00
|
|
|
#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
|
2021-02-18 13:26:39 +11:00
|
|
|
/* Intrinsic functions with x86 SSE. */
|
2014-02-04 23:38:53 +04:00
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint32_t __bsf(const uint32_t v)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
uint32_t r = 0;
|
2014-06-13 21:13:18 +02:00
|
|
|
asm("bsf %1,%0" : "=r"(r) : "r"(v));
|
|
|
|
|
return r;
|
2014-03-23 00:45:48 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint32_t __bsr(const uint32_t v)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
uint32_t r = 0;
|
2014-06-13 21:13:18 +02:00
|
|
|
asm("bsr %1,%0" : "=r"(r) : "r"(v));
|
|
|
|
|
return r;
|
2014-01-11 22:20:03 +04:00
|
|
|
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
__forceinline uint32_t __btc(const uint32_t v, const uint32_t i)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
uint32_t r = 0;
|
2014-06-13 21:13:18 +02:00
|
|
|
asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
|
|
|
|
|
return r;
|
2014-01-11 22:20:03 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
|
|
|
|
|
!(defined(__ILP32__) && defined(__x86_64__))
|
|
|
|
|
__forceinline uint64_t __bsf(const uint64_t v)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
uint64_t r = 0;
|
2014-06-13 21:13:18 +02:00
|
|
|
asm("bsf %1,%0" : "=r"(r) : "r"(v));
|
|
|
|
|
return r;
|
2014-02-27 14:49:21 +04:00
|
|
|
}
|
2021-02-14 15:34:23 +01:00
|
|
|
# endif
|
2014-02-27 14:49:21 +04:00
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint64_t __bsr(const uint64_t v)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
uint64_t r = 0;
|
|
|
|
|
asm("bsr %1,%0" : "=r"(r) : "r"(v));
|
2014-06-13 21:13:18 +02:00
|
|
|
return r;
|
2014-02-27 14:49:21 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
uint64_t r = 0;
|
|
|
|
|
asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
|
2014-06-13 21:13:18 +02:00
|
|
|
return r;
|
|
|
|
|
}
|
2014-02-27 14:49:21 +04:00
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
__forceinline uint32_t bitscan(const uint32_t v)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
# if defined(__KERNEL_AVX2__)
|
|
|
|
|
return _tzcnt_u32(v);
|
|
|
|
|
# else
|
|
|
|
|
return __bsf(v);
|
|
|
|
|
# endif
|
2014-02-27 14:49:21 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
|
|
|
|
|
!(defined(__ILP32__) && defined(__x86_64__))
|
2025-01-01 18:15:54 +01:00
|
|
|
__forceinline uint64_t bitscan(const uint64_t v)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
# if defined(__KERNEL_AVX2__)
|
|
|
|
|
# if defined(__KERNEL_64_BIT__)
|
|
|
|
|
return _tzcnt_u64(v);
|
|
|
|
|
# else
|
|
|
|
|
return _tzcnt_u32(v);
|
|
|
|
|
# endif
|
|
|
|
|
# else
|
|
|
|
|
return __bsf(v);
|
|
|
|
|
# endif
|
2014-02-27 14:49:21 +04:00
|
|
|
}
|
2021-02-14 15:34:23 +01:00
|
|
|
# endif
|
2014-02-27 14:49:21 +04:00
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
#else
|
|
|
|
|
/* Intrinsic functions fallback for arbitrary processor. */
|
|
|
|
|
__forceinline uint32_t __bsf(const uint32_t x)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
for (int i = 0; i < 32; i++) {
|
2023-09-24 14:52:38 +10:00
|
|
|
if (x & (1U << i)) {
|
2021-02-14 15:34:23 +01:00
|
|
|
return i;
|
2023-09-24 14:52:38 +10:00
|
|
|
}
|
2021-02-14 15:34:23 +01:00
|
|
|
}
|
|
|
|
|
return 32;
|
2014-06-13 21:13:18 +02:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint32_t __bsr(const uint32_t x)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
for (int i = 0; i < 32; i++) {
|
2023-09-24 14:52:38 +10:00
|
|
|
if (x & (1U << (31 - i))) {
|
2021-02-14 15:34:23 +01:00
|
|
|
return (31 - i);
|
2023-09-24 14:52:38 +10:00
|
|
|
}
|
2021-02-14 15:34:23 +01:00
|
|
|
}
|
|
|
|
|
return 32;
|
2014-03-23 00:45:48 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2024-12-29 17:32:00 +01:00
|
|
|
const uint32_t mask = 1U << bit;
|
2021-02-14 15:34:23 +01:00
|
|
|
return x & (~mask);
|
2014-04-03 23:34:53 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint32_t __bsf(const uint64_t x)
|
2014-06-13 21:13:18 +02:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
for (int i = 0; i < 64; i++) {
|
2023-09-24 14:52:38 +10:00
|
|
|
if (x & (1UL << i)) {
|
2021-02-14 15:34:23 +01:00
|
|
|
return i;
|
2023-09-24 14:52:38 +10:00
|
|
|
}
|
2021-02-14 15:34:23 +01:00
|
|
|
}
|
|
|
|
|
return 64;
|
2014-03-23 00:45:48 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint32_t __bsr(const uint64_t x)
|
2014-03-23 00:45:48 +04:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
for (int i = 0; i < 64; i++) {
|
2023-09-24 14:52:38 +10:00
|
|
|
if (x & (1UL << (63 - i))) {
|
2021-02-14 15:34:23 +01:00
|
|
|
return (63 - i);
|
2023-09-24 14:52:38 +10:00
|
|
|
}
|
2021-02-14 15:34:23 +01:00
|
|
|
}
|
|
|
|
|
return 64;
|
2014-03-23 00:45:48 +04:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
__forceinline uint64_t __btc(const uint64_t x, const uint32_t bit)
|
2014-03-23 00:45:48 +04:00
|
|
|
{
|
2024-12-29 17:32:00 +01:00
|
|
|
const uint64_t mask = 1UL << bit;
|
2021-02-14 15:34:23 +01:00
|
|
|
return x & (~mask);
|
2014-03-23 00:45:48 +04:00
|
|
|
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
__forceinline uint32_t bitscan(const uint32_t value)
|
2014-03-23 00:45:48 +04:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
assert(value != 0);
|
|
|
|
|
uint32_t bit = 0;
|
|
|
|
|
while ((value & (1 << bit)) == 0) {
|
|
|
|
|
++bit;
|
|
|
|
|
}
|
|
|
|
|
return bit;
|
2014-03-23 00:45:48 +04:00
|
|
|
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
__forceinline uint64_t bitscan(const uint64_t value)
|
2014-03-23 00:45:48 +04:00
|
|
|
{
|
2021-02-14 15:34:23 +01:00
|
|
|
assert(value != 0);
|
|
|
|
|
uint64_t bit = 0;
|
|
|
|
|
while ((value & (1 << bit)) == 0) {
|
|
|
|
|
++bit;
|
|
|
|
|
}
|
|
|
|
|
return bit;
|
2014-06-13 21:13:18 +02:00
|
|
|
}
|
|
|
|
|
|
2021-02-14 15:34:23 +01:00
|
|
|
#endif /* Intrinsics */
|
|
|
|
|
|
2020-04-02 17:25:48 +02:00
|
|
|
/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
 * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
#  undef _mm256_cvtss_f32
#  define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#endif
|
2013-12-27 21:30:03 +01:00
|
|
|
|
2017-08-02 02:09:08 +02:00
|
|
|
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
    defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#endif
|
2017-08-02 02:09:08 +02:00
|
|
|
|
2013-12-27 21:30:03 +01:00
|
|
|
CCL_NAMESPACE_END
|