test/intern/cycles/util/simd.h

/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
 * SPDX-FileCopyrightText: 2014-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

#include <cstdint>
#include <limits>

#include "util/defines.h"

/* SSE Intrinsics includes
 *
 * We assume the __KERNEL_SSEX__ flags have been defined at this point.
 *
 * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, it is better to include only that header. */
#if defined(FREE_WINDOWS64)
#  include "util/windows.h"
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
#  include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
#  include <x86intrin.h>
#elif defined(__KERNEL_NEON__)
#  define SSE2NEON_PRECISE_MINMAX 1
#  include <sse2neon.h>
#endif

/* Floating Point Control, for Embree. */
#if defined(__x86_64__) || defined(_M_X64)
/* x86-64: enable both flush-to-zero and denormals-are-zero through the SSE control macros. */
#  define SIMD_SET_FLUSH_TO_ZERO \
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#elif defined(__aarch64__) || defined(_M_ARM64)
/* The get/set denormals to zero was implemented in sse2neon v1.5.0.
 * Keep the compatibility code until the minimum library version is increased. */
#  if defined(_MM_SET_FLUSH_ZERO_MODE)
#    define SIMD_SET_FLUSH_TO_ZERO \
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#  elif !defined(_M_ARM64)
/* GCC/Clang compatibility path: access FPCR directly with inline assembly.
 * `_MM_FLUSH_ZERO_ON` is used by set_fz()/get_fz() as a bit index into FPCR.
 * `SIMD_GET_FLUSH_TO_ZERO` is only provided by the compatibility paths. */
#    define _MM_FLUSH_ZERO_ON 24
#    define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
#    define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
#  else
/* MSVC ARM64 compatibility path: access FPCR through the status register intrinsics.
 * `_ReadStatusReg()` takes the register id and returns the value, so the result has to be
 * assigned to the output variable. 0x5A20 is the same register id that is used for writing. */
#    define _MM_FLUSH_ZERO_ON 24
#    define __get_fpcr(__fpcr) ((__fpcr) = _ReadStatusReg(0x5A20))
#    define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
#  endif
#else
/* No floating point control available: expands to nothing. */
#  define SIMD_SET_FLUSH_TO_ZERO
#endif

CCL_NAMESPACE_BEGIN

/* Data structures used by SSE classes. */
#ifdef __KERNEL_SSE2__

extern const __m128 _mm_lookupmask_ps[16];

/* Tag object `True`: an empty type whose only operation is an implicit conversion to the
 * boolean value `true`. */
static struct TrueTy {
  __forceinline operator bool() const
  {
    return true;
  }
} True ccl_attr_maybe_unused;

/* Tag object `False`: an empty type whose only operation is an implicit conversion to the
 * boolean value `false`. */
static struct FalseTy {
  __forceinline operator bool() const
  {
    return false;
  }
} False ccl_attr_maybe_unused;

/* Tag object `zero`: implicitly converts to the value zero as either `float` or `int`. */
static struct ZeroTy {
  /* Zero as a single precision float. */
  __forceinline operator float() const
  {
    return 0.0f;
  }
  /* Zero as an integer. */
  __forceinline operator int() const
  {
    const int value = 0;
    return value;
  }
} zero ccl_attr_maybe_unused;

/* Tag object `one`: implicitly converts to the value one as either `float` or `int`. */
static struct OneTy {
  /* One as a single precision float. */
  __forceinline operator float() const
  {
    return 1.0f;
  }
  /* One as an integer. */
  __forceinline operator int() const
  {
    const int value = 1;
    return value;
  }
} one ccl_attr_maybe_unused;

/* Tag object `neg_inf`: implicitly converts to negative infinity as a `float`, or to the
 * smallest representable value as an `int`. */
static struct NegInfTy {
  __forceinline operator float() const
  {
    const float positive_infinity = std::numeric_limits<float>::infinity();
    return -positive_infinity;
  }
  __forceinline operator int() const
  {
    const int lowest_int = std::numeric_limits<int>::min();
    return lowest_int;
  }
} neg_inf ccl_attr_maybe_unused;

/* Tag objects `inf` and `pos_inf`: implicitly convert to positive infinity as a `float`, or
 * to the largest representable value as an `int`. */
static struct PosInfTy {
  __forceinline operator float() const
  {
    const float positive_infinity = std::numeric_limits<float>::infinity();
    return positive_infinity;
  }
  __forceinline operator int() const
  {
    const int highest_int = std::numeric_limits<int>::max();
    return highest_int;
  }
} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused;

/* Tag object `step`: an empty type with no members or conversions.
 * NOTE(review): presumably used for overload selection elsewhere; no use is visible here. */
static struct StepTy {
} step ccl_attr_maybe_unused;

#endif
#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(_MM_SET_FLUSH_ZERO_MODE)
/* Set bit number `flag` in the FPCR register.
 *
 * The register is read back after the write; returns 1 when the value read back equals the
 * value that was written, 0 otherwise. */
__forceinline int set_fz(const uint32_t flag)
{
  uint64_t fpcr_value;
  __get_fpcr(fpcr_value);

  const uint64_t requested = fpcr_value | (1ULL << flag);
  __set_fpcr(requested);

  /* Read back to verify that the write took effect. */
  __get_fpcr(fpcr_value);
  return (fpcr_value == requested) ? 1 : 0;
}
/* Query bit number `flag` of the FPCR register: returns 1 when it is set, 0 otherwise. */
__forceinline int get_fz(const uint32_t flag)
{
  uint64_t fpcr_value;
  __get_fpcr(fpcr_value);

  const uint64_t flag_mask = 1ULL << flag;
  if ((fpcr_value & flag_mask) != 0) {
    return 1;
  }
  return 0;
}
#endif

/* Utilities used by Neon */
#if defined(__KERNEL_NEON__)
template<class type, const int i0, const int i1, const int i2, const int i3>
type shuffle_neon(const type &a)
{
  if (i0 == i1 && i0 == i2 && i0 == i3) {
    return type(vdupq_laneq_s32(int32x4_t(a), i0));
  }
  static const uint8_t tbl[16] = {(i0 * 4) + 0,
                                  (i0 * 4) + 1,
                                  (i0 * 4) + 2,
                                  (i0 * 4) + 3,
                                  (i1 * 4) + 0,
                                  (i1 * 4) + 1,
                                  (i1 * 4) + 2,
                                  (i1 * 4) + 3,
                                  (i2 * 4) + 0,
                                  (i2 * 4) + 1,
                                  (i2 * 4) + 2,
                                  (i2 * 4) + 3,
                                  (i3 * 4) + 0,
                                  (i3 * 4) + 1,
                                  (i3 * 4) + 2,
                                  (i3 * 4) + 3};

  return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));
}

template<class type, const int i0, const int i1, const int i2, const int i3>
type shuffle_neon(const type &a, const type &b)
{
  if (&a == &b) {
    static const uint8_t tbl[16] = {(i0 * 4) + 0,
                                    (i0 * 4) + 1,
                                    (i0 * 4) + 2,
                                    (i0 * 4) + 3,
                                    (i1 * 4) + 0,
                                    (i1 * 4) + 1,
                                    (i1 * 4) + 2,
                                    (i1 * 4) + 3,
                                    (i2 * 4) + 0,
                                    (i2 * 4) + 1,
                                    (i2 * 4) + 2,
                                    (i2 * 4) + 3,
                                    (i3 * 4) + 0,
                                    (i3 * 4) + 1,
                                    (i3 * 4) + 2,
                                    (i3 * 4) + 3};

    return type(vqtbl1q_s8(int8x16_t(b), *(uint8x16_t *)tbl));
  }
  else {

    static const uint8_t tbl[16] = {(i0 * 4) + 0,
                                    (i0 * 4) + 1,
                                    (i0 * 4) + 2,
                                    (i0 * 4) + 3,
                                    (i1 * 4) + 0,
                                    (i1 * 4) + 1,
                                    (i1 * 4) + 2,
                                    (i1 * 4) + 3,
                                    (i2 * 4) + 0 + 16,
                                    (i2 * 4) + 1 + 16,
                                    (i2 * 4) + 2 + 16,
                                    (i2 * 4) + 3 + 16,
                                    (i3 * 4) + 0 + 16,
                                    (i3 * 4) + 1 + 16,
                                    (i3 * 4) + 2 + 16,
                                    (i3 * 4) + 3 + 16};

    /* NOTE: This cannot all be put in a single line due to how MSVC ARM64
     * implements the function calls as several layers of macros. */
    int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
    uint8x16_t idx = *(uint8x16_t *)tbl;
    return type(vqtbl2q_s8(t, idx));
  }
}
#endif /* __KERNEL_NEON__ */

/* Intrinsics Functions
 *
 * For fast bit operations. */

/* With BMI enabled on GNU-compatible compilers, provide `_tzcnt_u32` / `_tzcnt_u64` as
 * aliases of the double-underscore spellings, unless they already exist as macros. */
#if defined(__BMI__) && defined(__GNUC__)
#  ifndef _tzcnt_u32
#    define _tzcnt_u32 __tzcnt_u32
#  endif
#  ifndef _tzcnt_u64
#    define _tzcnt_u64 __tzcnt_u64
#  endif
#endif

/* With LZCNT enabled, map `_lzcnt_u32` / `_lzcnt_u64` to `__lzcnt32` / `__lzcnt64`.
 * NOTE(review): these definitions are not wrapped in `#ifndef`, so a previous macro
 * definition of the same name would be redefined - confirm this is intended. */
#if defined(__LZCNT__)
#  define _lzcnt_u32 __lzcnt32
#  define _lzcnt_u64 __lzcnt64
#endif

#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
/* Intrinsic functions on Windows. */
/* Bit scan forward (32-bit): index of the lowest set bit of `v`.
 * Uses `_tzcnt_u32` on AVX2 kernels and `_BitScanForward` otherwise. */
__forceinline uint32_t __bsf(const uint32_t v)
{
#  if !defined(__KERNEL_AVX2__)
  unsigned long index = 0;
  _BitScanForward(&index, v);
  return index;
#  else
  return _tzcnt_u32(v);
#  endif
}

/* Bit scan reverse (32-bit): index of the highest set bit of `v`, via `_BitScanReverse`. */
__forceinline uint32_t __bsr(const uint32_t v)
{
  unsigned long index = 0;
  _BitScanReverse(&index, v);
  return index;
}

/* Bit test and complement (32-bit): returns `v` after `_bittestandcomplement` has been
 * applied to bit `i` of a copy. */
__forceinline uint32_t __btc(const uint32_t v, const uint32_t i)
{
  long bits = v;
  _bittestandcomplement(&bits, i);
  return bits;
}

/* Index of the lowest set bit of `v` (32-bit).
 * Uses `_tzcnt_u32` on AVX2 kernels and falls back to `__bsf` otherwise. */
__forceinline uint32_t bitscan(const uint32_t v)
{
#  if !defined(__KERNEL_AVX2__)
  return __bsf(v);
#  else
  return _tzcnt_u32(v);
#  endif
}

#  if defined(__KERNEL_64_BIT__)

/* Bit scan forward (64-bit): index of the lowest set bit of `v`.
 * Uses `_tzcnt_u64` on AVX2 kernels and `_BitScanForward64` otherwise. */
__forceinline uint64_t __bsf(const uint64_t v)
{
#    if !defined(__KERNEL_AVX2__)
  unsigned long index = 0;
  _BitScanForward64(&index, v);
  return index;
#    else
  return _tzcnt_u64(v);
#    endif
}

/* Bit scan reverse (64-bit): index of the highest set bit of `v`, via `_BitScanReverse64`. */
__forceinline uint64_t __bsr(const uint64_t v)
{
  unsigned long index = 0;
  _BitScanReverse64(&index, v);
  return index;
}

/* Bit test and complement (64-bit): returns `v` after `_bittestandcomplement64` has been
 * applied to bit `i` of a copy. */
__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
  uint64_t bits = v;
  /* The intrinsic operates on a signed 64-bit integer, hence the pointer cast. */
  _bittestandcomplement64((__int64 *)&bits, i);
  return bits;
}

/* Index of the lowest set bit of `v` (64-bit).
 * Uses `_tzcnt_u64` on AVX2 kernels and falls back to `__bsf` otherwise.
 *
 * This definition is only compiled when `__KERNEL_64_BIT__` is defined (see the enclosing
 * conditional), so no separate 32-bit code path is needed here. */
__forceinline uint64_t bitscan(const uint64_t v)
{
#    if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
#    else
  return __bsf(v);
#    endif
}

#  endif /* __KERNEL_64_BIT__ */

#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
/* Intrinsic functions with x86 SSE. */

/* Bit scan forward (32-bit) using the x86 `bsf` instruction.
 * NOTE(review): the result for `v == 0` depends on the instruction's behavior for a zero
 * source - confirm callers never pass zero. */
__forceinline uint32_t __bsf(const uint32_t v)
{
  uint32_t index = 0;
  asm("bsf %[src],%[dst]" : [dst] "=r"(index) : [src] "r"(v));
  return index;
}

/* Bit scan reverse (32-bit) using the x86 `bsr` instruction.
 * NOTE(review): the result for `v == 0` depends on the instruction's behavior for a zero
 * source - confirm callers never pass zero. */
__forceinline uint32_t __bsr(const uint32_t v)
{
  uint32_t index = 0;
  asm("bsr %[src],%[dst]" : [dst] "=r"(index) : [src] "r"(v));
  return index;
}

/* Bit test and complement (32-bit) using the x86 `btc` instruction.
 * The value `v` is tied to the output register, bit index `i` is the source operand. */
__forceinline uint32_t __btc(const uint32_t v, const uint32_t i)
{
  uint32_t bits = 0;
  asm("btc %[bit],%[val]" : [val] "=r"(bits) : [bit] "r"(i), "0"(v) : "flags");
  return bits;
}

#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
      !(defined(__ILP32__) && defined(__x86_64__))
/* Bit scan forward (64-bit) using the x86 `bsf` instruction.
 * NOTE(review): the result for `v == 0` depends on the instruction's behavior for a zero
 * source - confirm callers never pass zero. */
__forceinline uint64_t __bsf(const uint64_t v)
{
  uint64_t index = 0;
  asm("bsf %[src],%[dst]" : [dst] "=r"(index) : [src] "r"(v));
  return index;
}
#  endif

/* Bit scan reverse (64-bit) using the x86 `bsr` instruction.
 * NOTE(review): unlike the 64-bit `__bsf`, this overload is not wrapped in a 64-bit
 * platform check - confirm that is intended. */
__forceinline uint64_t __bsr(const uint64_t v)
{
  uint64_t index = 0;
  asm("bsr %[src],%[dst]" : [dst] "=r"(index) : [src] "r"(v));
  return index;
}

/* Bit test and complement (64-bit) using the x86 `btc` instruction.
 * The value `v` is tied to the output register, bit index `i` is the source operand. */
__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
  uint64_t bits = 0;
  asm("btc %[bit],%[val]" : [val] "=r"(bits) : [bit] "r"(i), "0"(v) : "flags");
  return bits;
}

/* Index of the lowest set bit of `v` (32-bit).
 * Uses `_tzcnt_u32` on AVX2 kernels and falls back to `__bsf` otherwise. */
__forceinline uint32_t bitscan(const uint32_t v)
{
#  if !defined(__KERNEL_AVX2__)
  return __bsf(v);
#  else
  return _tzcnt_u32(v);
#  endif
}

#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
      !(defined(__ILP32__) && defined(__x86_64__))
/* Index of the lowest set bit of `v` (64-bit).
 *
 * Without AVX2 this forwards to `__bsf`. With AVX2 it uses `_tzcnt_u64` on 64-bit kernels
 * and `_tzcnt_u32` otherwise.
 * NOTE(review): the `_tzcnt_u32` path receives a 64-bit argument - confirm the implicit
 * narrowing is intended. */
__forceinline uint64_t bitscan(const uint64_t v)
{
#    if !defined(__KERNEL_AVX2__)
  return __bsf(v);
#    elif defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#    else
  return _tzcnt_u32(v);
#    endif
}
#  endif

#else
/* Intrinsic functions fallback for arbitrary processor. */
/* Portable bit scan forward (32-bit): index of the lowest set bit of `x`,
 * or 32 when no bit is set. */
__forceinline uint32_t __bsf(const uint32_t x)
{
  uint32_t remaining = x;
  for (uint32_t index = 0; index < 32; ++index) {
    if (remaining & 1U) {
      return index;
    }
    remaining >>= 1;
  }
  return 32;
}

/* Portable bit scan reverse (32-bit): index of the highest set bit of `x`,
 * or 32 when no bit is set. */
__forceinline uint32_t __bsr(const uint32_t x)
{
  for (int bit = 31; bit >= 0; --bit) {
    if ((x >> bit) & 1U) {
      return bit;
    }
  }
  return 32;
}

/* Portable bit test and complement (32-bit): returns `x` with bit number `bit` toggled.
 *
 * Toggling rather than clearing matches the intrinsic and inline assembly implementations
 * used on other platforms. When the bit is set on input, which is the case after a
 * successful bit scan, toggling clears it.
 *
 * `bit` must be less than 32. */
__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
{
  const uint32_t mask = 1U << bit;
  return x ^ mask;
}

/* Portable bit scan forward (64-bit): index of the lowest set bit of `x`,
 * or 64 when no bit is set. */
__forceinline uint32_t __bsf(const uint64_t x)
{
  for (int i = 0; i < 64; i++) {
    /* Use a 64-bit constant: `unsigned long` is only 32 bits wide on some platforms, which
     * would make shifts by 32 or more undefined. */
    if (x & (1ULL << i)) {
      return i;
    }
  }
  return 64;
}

/* Portable bit scan reverse (64-bit): index of the highest set bit of `x`,
 * or 64 when no bit is set. */
__forceinline uint32_t __bsr(const uint64_t x)
{
  for (int i = 0; i < 64; i++) {
    /* Use a 64-bit constant: `unsigned long` is only 32 bits wide on some platforms, which
     * would make shifts by 32 or more undefined. */
    if (x & (1ULL << (63 - i))) {
      return (63 - i);
    }
  }
  return 64;
}

/* Portable bit test and complement (64-bit): returns `x` with bit number `bit` toggled.
 *
 * Toggling rather than clearing matches the intrinsic and inline assembly implementations
 * used on other platforms. When the bit is set on input, which is the case after a
 * successful bit scan, toggling clears it.
 *
 * `bit` must be less than 64. */
__forceinline uint64_t __btc(const uint64_t x, const uint32_t bit)
{
  /* Use a 64-bit constant: `unsigned long` is only 32 bits wide on some platforms. */
  const uint64_t mask = 1ULL << bit;
  return x ^ mask;
}

/* Portable scan for the lowest set bit (32-bit).
 *
 * `value` must be non-zero (asserted); returns the index of its lowest set bit. */
__forceinline uint32_t bitscan(const uint32_t value)
{
  assert(value != 0);
  uint32_t bit = 0;
  /* Shift an unsigned constant: shifting a signed 1 into the sign bit is undefined. */
  while ((value & (1U << bit)) == 0) {
    ++bit;
  }
  return bit;
}

/* Portable scan for the lowest set bit (64-bit).
 *
 * `value` must be non-zero (asserted); returns the index of its lowest set bit. */
__forceinline uint64_t bitscan(const uint64_t value)
{
  assert(value != 0);
  uint64_t bit = 0;
  /* Shift a 64-bit constant: a plain `1` is an `int`, so bits 32 and above could never be
   * tested correctly and shifts of 32 or more would be undefined. */
  while ((value & (1ULL << bit)) == 0) {
    ++bit;
  }
  return bit;
}

#endif /* Intrinsics */

/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
 * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Drop any existing macro of this name, then define it as: cast the 256-bit vector to its
 * 128-bit form and extract the first float from that. */
#  undef _mm256_cvtss_f32
#  define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#endif

/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
    defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#endif

CCL_NAMESPACE_END