Refactor: Cycles: Make math and type headers work by themselves

Remove separate impl.h headers, shuffle around some code and add more
includes so that individual header files compile without errors.

Pull Request: https://projects.blender.org/blender/blender/pulls/132361
This commit is contained in:
Brecht Van Lommel
2024-12-26 17:53:56 +01:00
parent f53e13411b
commit 7db0bc2e64
71 changed files with 1959 additions and 2416 deletions

View File

@@ -16,7 +16,7 @@
#include "kernel/sample/lcg.h"
#include "kernel/sample/mapping.h"
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
#include "kernel/closure/bsdf_microfacet.h"

View File

@@ -352,7 +352,7 @@ set(SRC_KERNEL_SAMPLE_HEADERS
)
set(SRC_KERNEL_UTIL_HEADERS
util/color.h
util/colorspace.h
util/differential.h
util/ies.h
util/lookup_table.h
@@ -390,6 +390,7 @@ set(SRC_UTIL_HEADERS
../util/half.h
../util/hash.h
../util/math.h
../util/math_base.h
../util/math_fast.h
../util/math_intersect.h
../util/math_float2.h
@@ -409,35 +410,22 @@ set(SRC_UTIL_HEADERS
../util/transform_inverse.h
../util/texture.h
../util/types.h
../util/types_base.h
../util/types_float2.h
../util/types_float2_impl.h
../util/types_float3.h
../util/types_float3_impl.h
../util/types_float4.h
../util/types_float4_impl.h
../util/types_float8.h
../util/types_float8_impl.h
../util/types_int2.h
../util/types_int2_impl.h
../util/types_int3.h
../util/types_int3_impl.h
../util/types_int4.h
../util/types_int4_impl.h
../util/types_int8.h
../util/types_int8_impl.h
../util/types_spectrum.h
../util/types_uchar2.h
../util/types_uchar2_impl.h
../util/types_uchar3.h
../util/types_uchar3_impl.h
../util/types_uchar4.h
../util/types_uchar4_impl.h
../util/types_uint2.h
../util/types_uint2_impl.h
../util/types_uint3.h
../util/types_uint3_impl.h
../util/types_uint4.h
../util/types_uint4_impl.h
../util/types_ushort4.h
)

View File

@@ -10,7 +10,7 @@
#include "kernel/geom/geom.h"
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
CCL_NAMESPACE_BEGIN

View File

@@ -8,7 +8,7 @@
#pragma once
#include "kernel/sample/mapping.h"
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
CCL_NAMESPACE_BEGIN

View File

@@ -7,7 +7,7 @@
#pragma once
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
CCL_NAMESPACE_BEGIN

View File

@@ -11,7 +11,7 @@
# include <fenv.h>
#endif
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
CCL_NAMESPACE_BEGIN

View File

@@ -10,7 +10,7 @@
#include "kernel/closure/bsdf_util.h"
#include "kernel/sample/lcg.h"
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
CCL_NAMESPACE_BEGIN

View File

@@ -7,6 +7,8 @@
#pragma once
#include "util/color.h"
CCL_NAMESPACE_BEGIN
/* Compute fresnel reflectance for perpendicular (aka S-) and parallel (aka P-) polarized light.

View File

@@ -8,6 +8,8 @@
#pragma once
#include "util/color.h"
CCL_NAMESPACE_BEGIN
/* --------------------------------------------------------------------

View File

@@ -4,7 +4,7 @@
#pragma once
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
#ifdef __KERNEL_GPU__
# define __ATOMIC_PASS_WRITE__

View File

@@ -8,6 +8,8 @@
#include "kernel/closure/bsdf.h"
#include "kernel/film/write.h"
#include "util/color.h"
CCL_NAMESPACE_BEGIN
/* Utilities. */

View File

@@ -18,6 +18,8 @@
#include "kernel/light/light.h"
#include "kernel/light/sample.h"
#include "util/color.h"
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__

View File

@@ -8,6 +8,8 @@
#include "kernel/integrator/guiding.h"
#include "util/color.h"
CCL_NAMESPACE_BEGIN
#ifdef __SUBSURFACE__

View File

@@ -45,8 +45,6 @@
#include "kernel/svm/svm.h"
#include "kernel/util/color.h"
CCL_NAMESPACE_BEGIN
/* RenderServices implementation */

View File

@@ -255,7 +255,7 @@ ccl_device_extern ccl_private OSLClosure *osl_allocate_weighted_closure_componen
/* Utilities */
#include "kernel/svm/math_util.h"
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
ccl_device_extern void osl_error(ccl_private ShaderGlobals *sg, const char *format, void *args) {}

View File

@@ -9,7 +9,7 @@
#include "kernel/closure/bsdf_util.h"
#include "kernel/closure/emissive.h"
#include "kernel/util/color.h"
#include "kernel/util/colorspace.h"
CCL_NAMESPACE_BEGIN

View File

@@ -4,8 +4,6 @@
#pragma once
#include "util/color.h"
CCL_NAMESPACE_BEGIN
ccl_device float3 xyz_to_rgb(KernelGlobals kg, float3 xyz)

View File

@@ -173,7 +173,7 @@ class ShaderNode : public Node {
/* Simplify settings used by artists to the ones which are simpler to
* evaluate in the kernel but keep the final result unchanged.
*/
virtual void simplify_settings(Scene * /*scene*/) {};
virtual void simplify_settings(Scene * /*scene*/){};
virtual bool has_surface_emission()
{

View File

@@ -43,6 +43,7 @@ set(SRC_HEADERS
array.h
atomic.h
boundbox.h
color.h
debug.h
defines.h
deque.h
@@ -60,6 +61,7 @@ set(SRC_HEADERS
log.h
map.h
math.h
math_base.h
math_cdf.h
math_fast.h
math_intersect.h
@@ -101,35 +103,22 @@ set(SRC_HEADERS
transform.h
transform_inverse.h
types.h
types_base.h
types_float2.h
types_float2_impl.h
types_float3.h
types_float3_impl.h
types_float4.h
types_float4_impl.h
types_float8.h
types_float8_impl.h
types_int2.h
types_int2_impl.h
types_int3.h
types_int3_impl.h
types_int4.h
types_int4_impl.h
types_int8.h
types_int8_impl.h
types_spectrum.h
types_uchar2.h
types_uchar2_impl.h
types_uchar3.h
types_uchar3_impl.h
types_uchar4.h
types_uchar4_impl.h
types_uint2.h
types_uint2_impl.h
types_uint3.h
types_uint3_impl.h
types_uint4.h
types_uint4_impl.h
types_ushort4.h
unique_ptr.h
vector.h

View File

@@ -12,5 +12,6 @@ using std::remove;
using std::sort;
using std::stable_sort;
using std::swap;
using std::upper_bound;
CCL_NAMESPACE_END

View File

@@ -2,8 +2,7 @@
*
* SPDX-License-Identifier: Apache-2.0 */
#ifndef __UTIL_COLOR_H__
#define __UTIL_COLOR_H__
#pragma once
#include "util/math.h"
#include "util/types.h"
@@ -351,6 +350,67 @@ ccl_device float3 color_highlight_uncompress(float3 color)
return exp(color) - one_float3();
}
CCL_NAMESPACE_END
/* Color division */
#endif /* __UTIL_COLOR_H__ */
ccl_device_inline Spectrum safe_invert_color(Spectrum a)
{
FOREACH_SPECTRUM_CHANNEL (i) {
GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ?
1.0f / GET_SPECTRUM_CHANNEL(a, i) :
0.0f;
}
return a;
}
/* Returns `a/b`, and replace the channel value with `fallback` if `b == 0`. */
ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b, const float fallback = 0.0f)
{
FOREACH_SPECTRUM_CHANNEL (i) {
GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ?
GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) :
fallback;
}
return a;
}
ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
{
float x, y, z;
x = (b.x != 0.0f) ? a.x / b.x : 0.0f;
y = (b.y != 0.0f) ? a.y / b.y : 0.0f;
z = (b.z != 0.0f) ? a.z / b.z : 0.0f;
/* try to get gray even if b is zero */
if (b.x == 0.0f) {
if (b.y == 0.0f) {
x = z;
y = z;
}
else if (b.z == 0.0f) {
x = y;
z = y;
}
else {
x = 0.5f * (y + z);
}
}
else if (b.y == 0.0f) {
if (b.z == 0.0f) {
y = x;
z = x;
}
else {
y = 0.5f * (x + z);
}
}
else if (b.z == 0.0f) {
z = 0.5f * (x + y);
}
return make_float3(x, y, z);
}
CCL_NAMESPACE_END

View File

@@ -2,1089 +2,21 @@
*
* SPDX-License-Identifier: Apache-2.0 */
#ifndef __UTIL_MATH_H__
#define __UTIL_MATH_H__
#pragma once
/* Math
*
* Basic math functions on scalar and vector types. This header is used by
* both the kernel code when compiled as C++, and other C++ non-kernel code. */
#include "util/types.h" // IWYU pragma: export
#ifndef __KERNEL_GPU__
# include <cmath>
#endif
#include "util/math_base.h" // IWYU pragma: export
#ifdef __HIP__
# include <hip/hip_vector_types.h>
#endif
#include "util/math_int2.h" // IWYU pragma: export
#include "util/math_int3.h" // IWYU pragma: export
#include "util/math_int4.h" // IWYU pragma: export
#include "util/math_int8.h" // IWYU pragma: export
#if !defined(__KERNEL_METAL__)
# include <float.h>
# include <math.h>
# include <stdio.h>
#endif /* !defined(__KERNEL_METAL__) */
#include "util/math_float2.h" // IWYU pragma: export
#include "util/math_float4.h" // IWYU pragma: export
#include "util/math_float8.h" // IWYU pragma: export
#include "util/types.h"
#include "util/math_float3.h" // IWYU pragma: export
CCL_NAMESPACE_BEGIN
/* Float Pi variations */
/* Division */
#ifndef M_PI_F
# define M_PI_F (3.1415926535897932f) /* pi */
#endif
#ifndef M_PI_2_F
# define M_PI_2_F (1.5707963267948966f) /* pi/2 */
#endif
#ifndef M_PI_4_F
# define M_PI_4_F (0.7853981633974830f) /* pi/4 */
#endif
#ifndef M_1_PI_F
# define M_1_PI_F (0.3183098861837067f) /* 1/pi */
#endif
#ifndef M_2_PI_F
# define M_2_PI_F (0.6366197723675813f) /* 2/pi */
#endif
#ifndef M_1_2PI_F
# define M_1_2PI_F (0.1591549430918953f) /* 1/(2*pi) */
#endif
#ifndef M_1_4PI_F
# define M_1_4PI_F (0.0795774715459476f) /* 1/(4*pi) */
#endif
#ifndef M_SQRT_PI_8_F
# define M_SQRT_PI_8_F (0.6266570686577501f) /* sqrt(pi/8) */
#endif
#ifndef M_LN_2PI_F
# define M_LN_2PI_F (1.8378770664093454f) /* ln(2*pi) */
#endif
/* Multiplication */
#ifndef M_2PI_F
# define M_2PI_F (6.2831853071795864f) /* 2*pi */
#endif
#ifndef M_4PI_F
# define M_4PI_F (12.566370614359172f) /* 4*pi */
#endif
#ifndef M_PI_4F
# define M_PI_4F 0.78539816339744830962f /* pi/4 */
#endif
/* Float sqrt variations */
#ifndef M_SQRT2_F
# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */
#endif
#ifndef M_CBRT2_F
# define M_CBRT2_F 1.2599210498948732f /* cbrt(2) */
#endif
#ifndef M_SQRT1_2F
# define M_SQRT1_2F 0.70710678118654752440f /* sqrt(1/2) */
#endif
#ifndef M_SQRT3_F
# define M_SQRT3_F (1.7320508075688772f) /* sqrt(3) */
#endif
#ifndef M_LN2_F
# define M_LN2_F (0.6931471805599453f) /* ln(2) */
#endif
#ifndef M_LN10_F
# define M_LN10_F (2.3025850929940457f) /* ln(10) */
#endif
/* Scalar */
#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
# ifdef _WIN32
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
}
ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
# endif /* _WIN32 */
#endif /* __HIP__, __KERNEL_ONEAPI__ */
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
# ifndef __KERNEL_ONEAPI__
using std::isfinite;
using std::isnan;
using std::sqrt;
# else
# define isfinite(x) sycl::isfinite((x))
# define isnan(x) sycl::isnan((x))
# endif
ccl_device_inline int abs(int x)
{
return (x > 0) ? x : -x;
}
ccl_device_inline int max(int a, int b)
{
return (a > b) ? a : b;
}
ccl_device_inline int min(int a, int b)
{
return (a < b) ? a : b;
}
ccl_device_inline uint32_t max(uint32_t a, uint32_t b)
{
return (a > b) ? a : b;
}
ccl_device_inline uint32_t min(uint32_t a, uint32_t b)
{
return (a < b) ? a : b;
}
ccl_device_inline uint64_t max(uint64_t a, uint64_t b)
{
return (a > b) ? a : b;
}
ccl_device_inline uint64_t min(uint64_t a, uint64_t b)
{
return (a < b) ? a : b;
}
/* NOTE: On 64bit Darwin the `size_t` is defined as `unsigned long int` and `uint64_t` is defined
* as `unsigned long long`. Both of the definitions are 64 bit unsigned integer, but the automatic
* substitution does not allow to automatically pick function defined for `uint64_t` as it is not
* exactly the same type definition.
* Work this around by adding a templated function enabled for `size_t` type which will be used
* when there is no explicit specialization of `min()`/`max()` above. */
template<class T>
ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> max(T a, T b)
{
return (a > b) ? a : b;
}
template<class T>
ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> min(T a, T b)
{
return (a < b) ? a : b;
}
ccl_device_inline float max(float a, float b)
{
return (a > b) ? a : b;
}
ccl_device_inline float min(float a, float b)
{
return (a < b) ? a : b;
}
ccl_device_inline double max(double a, double b)
{
return (a > b) ? a : b;
}
ccl_device_inline double min(double a, double b)
{
return (a < b) ? a : b;
}
/* These 2 guys are templated for usage with registers data.
*
* NOTE: Since this is CPU-only functions it is ok to use references here.
* But for other devices we'll need to be careful about this.
*/
template<typename T> ccl_device_inline T min4(const T &a, const T &b, const T &c, const T &d)
{
return min(min(a, b), min(c, d));
}
template<typename T> ccl_device_inline T max4(const T &a, const T &b, const T &c, const T &d)
{
return max(max(a, b), max(c, d));
}
#endif /* __KERNEL_GPU__ */
ccl_device_inline float min4(float a, float b, float c, float d)
{
return min(min(a, b), min(c, d));
}
ccl_device_inline float max4(float a, float b, float c, float d)
{
return max(max(a, b), max(c, d));
}
#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
/* Int/Float conversion */
ccl_device_inline int as_int(uint i)
{
union {
uint ui;
int i;
} u;
u.ui = i;
return u.i;
}
ccl_device_inline uint as_uint(int i)
{
union {
uint ui;
int i;
} u;
u.i = i;
return u.ui;
}
ccl_device_inline uint as_uint(float f)
{
union {
uint i;
float f;
} u;
u.f = f;
return u.i;
}
# ifndef __HIP__
ccl_device_inline int __float_as_int(float f)
{
union {
int i;
float f;
} u;
u.f = f;
return u.i;
}
ccl_device_inline float __int_as_float(int i)
{
union {
int i;
float f;
} u;
u.i = i;
return u.f;
}
ccl_device_inline uint __float_as_uint(float f)
{
union {
uint i;
float f;
} u;
u.f = f;
return u.i;
}
ccl_device_inline float __uint_as_float(uint i)
{
union {
uint i;
float f;
} u;
u.i = i;
return u.f;
}
# endif
ccl_device_inline int4 __float4_as_int4(float4 f)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
# else
return make_int4(
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
# endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
# ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
# else
return make_float4(
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
# endif
}
#endif /* !defined(__KERNEL_METAL__) */
#if defined(__KERNEL_METAL__)
ccl_device_forceinline bool isnan_safe(float f)
{
return isnan(f);
}
ccl_device_forceinline bool isfinite_safe(float f)
{
return isfinite(f);
}
#else
template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
{
return ((uint64_t)ptr) & 0xFFFFFFFF;
}
template<typename T> ccl_device_inline uint pointer_pack_to_uint_1(T *ptr)
{
return (((uint64_t)ptr) >> 32) & 0xFFFFFFFF;
}
template<typename T> ccl_device_inline T *pointer_unpack_from_uint(const uint a, const uint b)
{
return (T *)(((uint64_t)b << 32) | a);
}
ccl_device_inline uint uint16_pack_to_uint(const uint a, const uint b)
{
return (a << 16) | b;
}
ccl_device_inline uint uint16_unpack_from_uint_0(const uint i)
{
return i >> 16;
}
ccl_device_inline uint uint16_unpack_from_uint_1(const uint i)
{
return i & 0xFFFF;
}
/* Versions of functions which are safe for fast math. */
ccl_device_inline bool isnan_safe(float f)
{
unsigned int x = __float_as_uint(f);
return (x << 1) > 0xff000000u;
}
ccl_device_inline bool isfinite_safe(float f)
{
/* By IEEE 754 rule, 2*Inf equals Inf */
unsigned int x = __float_as_uint(f);
return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
}
#endif
ccl_device_inline float ensure_finite(float v)
{
return isfinite_safe(v) ? v : 0.0f;
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float clamp(float a, float mn, float mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float mix(float a, float b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float smoothstep(float edge0, float edge1, float x)
{
float result;
if (x < edge0) {
result = 0.0f;
}
else if (x >= edge1) {
result = 1.0f;
}
else {
float t = (x - edge0) / (edge1 - edge0);
result = (3.0f - 2.0f * t) * (t * t);
}
return result;
}
#endif /* !defined(__KERNEL_METAL__) */
#if defined(__KERNEL_CUDA__)
ccl_device_inline float saturatef(float a)
{
return __saturatef(a);
}
#elif !defined(__KERNEL_METAL__)
ccl_device_inline float saturatef(float a)
{
return clamp(a, 0.0f, 1.0f);
}
#endif /* __KERNEL_CUDA__ */
ccl_device_inline int float_to_int(float f)
{
return (int)f;
}
ccl_device_inline int floor_to_int(float f)
{
return float_to_int(floorf(f));
}
ccl_device_inline float floorfrac(float x, ccl_private int *i)
{
float f = floorf(x);
*i = float_to_int(f);
return x - f;
}
ccl_device_inline int ceil_to_int(float f)
{
return float_to_int(ceilf(f));
}
ccl_device_inline float fractf(float x)
{
return x - floorf(x);
}
/* Adapted from `godot-engine` math_funcs.h. */
ccl_device_inline float wrapf(float value, float max, float min)
{
float range = max - min;
return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
}
ccl_device_inline float pingpongf(float a, float b)
{
return (b != 0.0f) ? fabsf(fractf((a - b) / (b * 2.0f)) * b * 2.0f - b) : 0.0f;
}
ccl_device_inline float smoothminf(float a, float b, float k)
{
if (k != 0.0f) {
float h = fmaxf(k - fabsf(a - b), 0.0f) / k;
return fminf(a, b) - h * h * h * k * (1.0f / 6.0f);
}
else {
return fminf(a, b);
}
}
ccl_device_inline float signf(float f)
{
return (f < 0.0f) ? -1.0f : 1.0f;
}
ccl_device_inline float nonzerof(float f, float eps)
{
if (fabsf(f) < eps) {
return signf(f) * eps;
}
else {
return f;
}
}
/* The behavior of `atan2(0, 0)` is undefined on many platforms, to ensure consistent behavior, we
* return 0 in this case. See !126951.
* Computes the angle between the positive x axis and the vector pointing from origin to (x, y). */
ccl_device_inline float compatible_atan2(const float y, const float x)
{
return (x == 0.0f && y == 0.0f) ? 0.0f : atan2f(y, x);
}
/* `signum` function testing for zero. Matches GLSL and OSL functions. */
ccl_device_inline float compatible_signf(float f)
{
if (f == 0.0f) {
return 0.0f;
}
else {
return signf(f);
}
}
ccl_device_inline float smoothstepf(float f)
{
if (f <= 0.0f) {
return 0.0f;
}
if (f >= 1.0f) {
return 1.0f;
}
float ff = f * f;
return (3.0f * ff - 2.0f * ff * f);
}
ccl_device_inline int mod(int x, int m)
{
return (x % m + m) % m;
}
ccl_device_inline float inverse_lerp(float a, float b, float x)
{
return (x - a) / (b - a);
}
/* Cubic interpolation between b and c, a and d are the previous and next point. */
ccl_device_inline float cubic_interp(float a, float b, float c, float d, float x)
{
return 0.5f *
(((d + 3.0f * (b - c) - a) * x + (2.0f * a - 5.0f * b + 4.0f * c - d)) * x +
(c - a)) *
x +
b;
}
CCL_NAMESPACE_END
#include "util/math_int2.h"
#include "util/math_int3.h"
#include "util/math_int4.h"
#include "util/math_int8.h"
#include "util/math_float2.h"
#include "util/math_float4.h"
#include "util/math_float8.h"
#include "util/math_float3.h"
#include "util/rect.h"
CCL_NAMESPACE_BEGIN
/* Triangle */
ccl_device_inline float triangle_area(ccl_private const float3 &v1,
ccl_private const float3 &v2,
ccl_private const float3 &v3)
{
return len(cross(v3 - v2, v1 - v2)) * 0.5f;
}
/* Orthonormal vectors */
ccl_device_inline void make_orthonormals(const float3 N,
ccl_private float3 *a,
ccl_private float3 *b)
{
#if 0
if (fabsf(N.y) >= 0.999f) {
*a = make_float3(1, 0, 0);
*b = make_float3(0, 0, 1);
return;
}
if (fabsf(N.z) >= 0.999f) {
*a = make_float3(1, 0, 0);
*b = make_float3(0, 1, 0);
return;
}
#endif
if (N.x != N.y || N.x != N.z)
*a = make_float3(N.z - N.y, N.x - N.z, N.y - N.x); //(1,1,1)x N
else
*a = make_float3(N.z - N.y, N.x + N.z, -N.y - N.x); //(-1,1,1)x N
*a = normalize(*a);
*b = cross(N, *a);
}
/* Color division */
ccl_device_inline Spectrum safe_invert_color(Spectrum a)
{
FOREACH_SPECTRUM_CHANNEL (i) {
GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ?
1.0f / GET_SPECTRUM_CHANNEL(a, i) :
0.0f;
}
return a;
}
/* Returns `a/b`, and replace the channel value with `fallback` if `b == 0`. */
ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b, const float fallback = 0.0f)
{
FOREACH_SPECTRUM_CHANNEL (i) {
GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ?
GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) :
fallback;
}
return a;
}
ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
{
float x, y, z;
x = (b.x != 0.0f) ? a.x / b.x : 0.0f;
y = (b.y != 0.0f) ? a.y / b.y : 0.0f;
z = (b.z != 0.0f) ? a.z / b.z : 0.0f;
/* try to get gray even if b is zero */
if (b.x == 0.0f) {
if (b.y == 0.0f) {
x = z;
y = z;
}
else if (b.z == 0.0f) {
x = y;
z = y;
}
else {
x = 0.5f * (y + z);
}
}
else if (b.y == 0.0f) {
if (b.z == 0.0f) {
y = x;
z = x;
}
else {
y = 0.5f * (x + z);
}
}
else if (b.z == 0.0f) {
z = 0.5f * (x + y);
}
return make_float3(x, y, z);
}
/* Rotation of point around axis and angle */
ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle)
{
float costheta = cosf(angle);
float sintheta = sinf(angle);
float3 r;
r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) +
(((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
(((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) +
((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
(((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) +
(((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
return r;
}
/* NaN-safe math ops */
ccl_device_inline float safe_sqrtf(float f)
{
return sqrtf(max(f, 0.0f));
}
ccl_device_inline float inversesqrtf(float f)
{
#if defined(__KERNEL_METAL__)
return (f > 0.0f) ? rsqrt(f) : 0.0f;
#else
return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
#endif
}
ccl_device float safe_asinf(float a)
{
return asinf(clamp(a, -1.0f, 1.0f));
}
ccl_device float safe_acosf(float a)
{
return acosf(clamp(a, -1.0f, 1.0f));
}
ccl_device float compatible_powf(float x, float y)
{
#ifdef __KERNEL_GPU__
if (y == 0.0f) /* x^0 -> 1, including 0^0 */
return 1.0f;
/* GPU pow doesn't accept negative x, do manual checks here */
if (x < 0.0f) {
if (fmodf(-y, 2.0f) == 0.0f)
return powf(-x, y);
else
return -powf(-x, y);
}
else if (x == 0.0f)
return 0.0f;
#endif
return powf(x, y);
}
ccl_device float safe_powf(float a, float b)
{
if (UNLIKELY(a < 0.0f && b != float_to_int(b))) {
return 0.0f;
}
return compatible_powf(a, b);
}
ccl_device float safe_divide(float a, float b)
{
return (b != 0.0f) ? a / b : 0.0f;
}
ccl_device float safe_logf(float a, float b)
{
if (UNLIKELY(a <= 0.0f || b <= 0.0f)) {
return 0.0f;
}
return safe_divide(logf(a), logf(b));
}
ccl_device float safe_modulo(float a, float b)
{
return (b != 0.0f) ? fmodf(a, b) : 0.0f;
}
ccl_device float safe_floored_modulo(float a, float b)
{
return (b != 0.0f) ? a - floorf(a / b) * b : 0.0f;
}
ccl_device_inline float sqr(float a)
{
return a * a;
}
ccl_device_inline float sin_from_cos(const float c)
{
return safe_sqrtf(1.0f - sqr(c));
}
ccl_device_inline float cos_from_sin(const float s)
{
return safe_sqrtf(1.0f - sqr(s));
}
ccl_device_inline float sin_sqr_to_one_minus_cos(const float s_sq)
{
/* Using second-order Taylor expansion at small angles for better accuracy. */
return s_sq > 0.0004f ? 1.0f - safe_sqrtf(1.0f - s_sq) : 0.5f * s_sq;
}
ccl_device_inline float one_minus_cos(const float angle)
{
/* Using second-order Taylor expansion at small angles for better accuracy. */
return angle > 0.02f ? 1.0f - cosf(angle) : 0.5f * sqr(angle);
}
ccl_device_inline float pow20(float a)
{
return sqr(sqr(sqr(sqr(a)) * a));
}
ccl_device_inline float pow22(float a)
{
return sqr(a * sqr(sqr(sqr(a)) * a));
}
#ifdef __KERNEL_METAL__
ccl_device_inline float lgammaf(float x)
{
/* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik
*/
const float _1_180 = 1.0f / 180.0f;
const float log2pi = 1.83787706641f;
const float logx = log(x);
return (log2pi - logx +
x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_180 / pow(x, 6.0f))) - 2.0f)) *
0.5f;
}
#endif
ccl_device_inline float beta(float x, float y)
{
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
}
ccl_device_inline float xor_signmask(float x, int y)
{
return __int_as_float(__float_as_int(x) ^ y);
}
ccl_device float bits_to_01(uint bits)
{
return bits * (1.0f / (float)0xFFFFFFFF);
}
#if !defined(__KERNEL_GPU__)
# if defined(__GNUC__)
ccl_device_inline uint popcount(uint x)
{
return __builtin_popcount(x);
}
# else
ccl_device_inline uint popcount(uint x)
{
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
uint i = x;
i = i - ((i >> 1) & 0x55555555);
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
return i;
}
# endif
#elif defined(__KERNEL_ONEAPI__)
# define popcount(x) sycl::popcount(x)
#elif defined(__KERNEL_HIP__)
/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
# define popcount(x) __popcll(x)
#elif !defined(__KERNEL_METAL__)
# define popcount(x) __popc(x)
#endif
ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __clz(x);
#elif defined(__KERNEL_METAL__)
return clz(x);
#elif defined(__KERNEL_ONEAPI__)
return sycl::clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
unsigned long leading_zero = 0;
_BitScanReverse(&leading_zero, x);
return (31 - leading_zero);
# else
return __builtin_clz(x);
# endif
#endif
}
ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return (__ffs(x) - 1);
#elif defined(__KERNEL_METAL__)
return ctz(x);
#elif defined(__KERNEL_ONEAPI__)
return sycl::ctz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
unsigned long ctz = 0;
_BitScanForward(&ctz, x);
return ctz;
# else
return __builtin_ctz(x);
# endif
#endif
}
ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __ffs(x);
#elif defined(__KERNEL_METAL__)
return (x != 0) ? ctz(x) + 1 : 0;
#else
# ifdef _MSC_VER
return (x != 0) ? (32 - count_leading_zeros(x & (~x + 1))) : 0;
# else
return __builtin_ffs(x);
# endif
#endif
}
/* projections */
ccl_device_inline float2 map_to_tube(const float3 co)
{
float len, u, v;
len = sqrtf(co.x * co.x + co.y * co.y);
if (len > 0.0f) {
u = (1.0f - (atan2f(co.x / len, co.y / len) / M_PI_F)) * 0.5f;
v = (co.z + 1.0f) * 0.5f;
}
else {
u = v = 0.0f;
}
return make_float2(u, v);
}
ccl_device_inline float2 map_to_sphere(const float3 co)
{
float l = dot(co, co);
float u, v;
if (l > 0.0f) {
if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
u = 0.0f; /* Otherwise domain error. */
}
else {
u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F);
}
v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F;
}
else {
u = v = 0.0f;
}
return make_float2(u, v);
}
/* Compares two floats.
* Returns true if their absolute difference is smaller than abs_diff (for numbers near zero)
* or their relative difference is less than ulp_diff ULPs.
* Based on
* https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
*/
ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
{
if (fabsf(a - b) < abs_diff) {
return true;
}
if ((a < 0.0f) != (b < 0.0f)) {
return false;
}
return (abs(__float_as_int(a) - __float_as_int(b)) < ulp_diff);
}
/* Calculate the angle between the two vectors a and b.
* The usual approach `acos(dot(a, b))` has severe precision issues for small angles,
* which are avoided by this method.
* Based on "Mangled Angles" from https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf
*/
ccl_device_inline float precise_angle(float3 a, float3 b)
{
return 2.0f * atan2f(len(a - b), len(a + b));
}
/* Tangent of the angle between vectors a and b. */
ccl_device_inline float tan_angle(float3 a, float3 b)
{
return len(cross(a, b)) / dot(a, b);
}
/* Return value which is greater than the given one and is a power of two. */
ccl_device_inline uint next_power_of_two(uint x)
{
return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
}
/* Return value which is lower than the given one and is a power of two. */
ccl_device_inline uint prev_power_of_two(uint x)
{
return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
}
#ifndef __has_builtin
# define __has_builtin(v) 0
#endif
/* Reverses the bits of a 32 bit integer. */
ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
{
/* Use a native instruction if it exists. */
#if defined(__KERNEL_CUDA__)
return __brev(x);
#elif defined(__KERNEL_METAL__)
return reverse_bits(x);
#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
/* Assume the rbit is always available on 64bit ARM architecture. */
__asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
return x;
#elif defined(__arm__) && ((__ARM_ARCH > 7) || __ARM_ARCH == 6 && __ARM_ARCH_ISA_THUMB >= 2)
/* This ARM instruction is available in ARMv6T2 and above.
* This 32-bit Thumb instruction is available in ARMv6T2 and above. */
__asm__("rbit %0, %1" : "=r"(x) : "r"(x));
return x;
#elif __has_builtin(__builtin_bitreverse32)
return __builtin_bitreverse32(x);
#else
/* Flip pairwise. */
x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
/* Flip pairs. */
x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
/* Flip nibbles. */
x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
/* Flip bytes. CPUs have an instruction for that, pretty fast one. */
# ifdef _MSC_VER
return _byteswap_ulong(x);
# elif defined(__INTEL_COMPILER)
return (uint32_t)_bswap((int)x);
# else
/* Assuming gcc or clang. */
return __builtin_bswap32(x);
# endif
#endif
}
/* Solve quadratic equation a*x^2 + b*x + c = 0, adapted from Mitsuba 3
* The solution is ordered so that x1 <= x2.
* Returns true if at least one solution is found. */
ccl_device_inline bool solve_quadratic(
const float a, const float b, const float c, ccl_private float &x1, ccl_private float &x2)
{
/* If the equation is linear, the solution is -c/b, but b has to be non-zero. */
const bool valid_linear = (a == 0.0f) && (b != 0.0f);
x1 = x2 = -c / b;
const float discriminant = sqr(b) - 4.0f * a * c;
/* Allow slightly negative discriminant in case of numerical precision issues. */
const bool valid_quadratic = (a != 0.0f) && (discriminant > -1e-5f);
if (valid_quadratic) {
/* Numerically stable version of (-b ± sqrt(discriminant)) / (2 * a), avoiding catastrophic
* cancellation when `b` is very close to `sqrt(discriminant)`, by finding the solution of
* greater magnitude which does not suffer from loss of precision, then using the identity
* x1 * x2 = c / a. */
const float temp = -0.5f * (b + copysignf(safe_sqrtf(discriminant), b));
const float r1 = temp / a;
const float r2 = c / temp;
x1 = fminf(r1, r2);
x2 = fmaxf(r1, r2);
}
return (valid_linear || valid_quadratic);
}
/* Defines a closed interval [min, max]. */
template<typename T> struct Interval {
T min;
T max;
ccl_device_inline_method bool is_empty() const
{
return min >= max;
}
ccl_device_inline_method bool contains(T value) const
{
return value >= min && value <= max;
}
ccl_device_inline_method T length() const
{
return max - min;
}
};
/* Computes the intersection of two intervals. */
template<typename T>
ccl_device_inline Interval<T> intervals_intersection(ccl_private const Interval<T> &first,
ccl_private const Interval<T> &second)
{
return {max(first.min, second.min), min(first.max, second.max)};
}
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
#include "util/rect.h" // IWYU pragma: export

View File

@@ -0,0 +1,868 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
/* Math
*
* Basic math functions on scalar and vector types. This header is used by
* both the kernel code when compiled as C++, and other C++ non-kernel code. */
#include "util/defines.h"
#include "util/types_base.h"
#ifdef __HIP__
# include <hip/hip_vector_types.h>
#endif
#if !defined(__KERNEL_METAL__)
# include <cfloat> // IWYU pragma: export
# include <cmath> // IWYU pragma: export
#endif
CCL_NAMESPACE_BEGIN
/* Float Pi variations */
/* Division */
#ifndef M_PI_F
# define M_PI_F (3.1415926535897932f) /* pi */
#endif
#ifndef M_PI_2_F
# define M_PI_2_F (1.5707963267948966f) /* pi/2 */
#endif
#ifndef M_PI_4_F
# define M_PI_4_F (0.7853981633974830f) /* pi/4 */
#endif
#ifndef M_1_PI_F
# define M_1_PI_F (0.3183098861837067f) /* 1/pi */
#endif
#ifndef M_2_PI_F
# define M_2_PI_F (0.6366197723675813f) /* 2/pi */
#endif
#ifndef M_1_2PI_F
# define M_1_2PI_F (0.1591549430918953f) /* 1/(2*pi) */
#endif
#ifndef M_1_4PI_F
# define M_1_4PI_F (0.0795774715459476f) /* 1/(4*pi) */
#endif
#ifndef M_SQRT_PI_8_F
# define M_SQRT_PI_8_F (0.6266570686577501f) /* sqrt(pi/8) */
#endif
#ifndef M_LN_2PI_F
# define M_LN_2PI_F (1.8378770664093454f) /* ln(2*pi) */
#endif
/* Multiplication */
#ifndef M_2PI_F
# define M_2PI_F (6.2831853071795864f) /* 2*pi */
#endif
#ifndef M_4PI_F
# define M_4PI_F (12.566370614359172f) /* 4*pi */
#endif
#ifndef M_PI_4F
# define M_PI_4F 0.78539816339744830962f /* pi/4 */
#endif
/* Float sqrt variations */
#ifndef M_SQRT2_F
# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */
#endif
#ifndef M_CBRT2_F
# define M_CBRT2_F 1.2599210498948732f /* cbrt(2) */
#endif
#ifndef M_SQRT1_2F
# define M_SQRT1_2F 0.70710678118654752440f /* sqrt(1/2) */
#endif
#ifndef M_SQRT3_F
# define M_SQRT3_F (1.7320508075688772f) /* sqrt(3) */
#endif
#ifndef M_LN2_F
# define M_LN2_F (0.6931471805599453f) /* ln(2) */
#endif
#ifndef M_LN10_F
# define M_LN10_F (2.3025850929940457f) /* ln(10) */
#endif
/* Scalar */
#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
# ifdef _WIN32
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
}
ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
# endif /* _WIN32 */
#endif /* __HIP__, __KERNEL_ONEAPI__ */
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
# ifndef __KERNEL_ONEAPI__
using std::isfinite;
using std::isnan;
using std::sqrt;
# else
# define isfinite(x) sycl::isfinite((x))
# define isnan(x) sycl::isnan((x))
# endif
ccl_device_inline int abs(int x)
{
return (x > 0) ? x : -x;
}
ccl_device_inline int max(int a, int b)
{
return (a > b) ? a : b;
}
ccl_device_inline int min(int a, int b)
{
return (a < b) ? a : b;
}
ccl_device_inline uint32_t max(uint32_t a, uint32_t b)
{
return (a > b) ? a : b;
}
ccl_device_inline uint32_t min(uint32_t a, uint32_t b)
{
return (a < b) ? a : b;
}
ccl_device_inline uint64_t max(uint64_t a, uint64_t b)
{
return (a > b) ? a : b;
}
ccl_device_inline uint64_t min(uint64_t a, uint64_t b)
{
return (a < b) ? a : b;
}
/* NOTE: On 64bit Darwin the `size_t` is defined as `unsigned long int` and `uint64_t` is defined
* as `unsigned long long`. Both of the definitions are 64 bit unsigned integer, but the automatic
* substitution does not allow to automatically pick function defined for `uint64_t` as it is not
* exactly the same type definition.
* Work this around by adding a templated function enabled for `size_t` type which will be used
* when there is no explicit specialization of `min()`/`max()` above. */
template<class T>
ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> max(T a, T b)
{
return (a > b) ? a : b;
}
template<class T>
ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> min(T a, T b)
{
return (a < b) ? a : b;
}
ccl_device_inline float max(float a, float b)
{
return (a > b) ? a : b;
}
ccl_device_inline float min(float a, float b)
{
return (a < b) ? a : b;
}
ccl_device_inline double max(double a, double b)
{
return (a > b) ? a : b;
}
ccl_device_inline double min(double a, double b)
{
return (a < b) ? a : b;
}
/* These 2 guys are templated for usage with registers data.
*
* NOTE: Since this is CPU-only functions it is ok to use references here.
* But for other devices we'll need to be careful about this.
*/
template<typename T> ccl_device_inline T min4(const T &a, const T &b, const T &c, const T &d)
{
return min(min(a, b), min(c, d));
}
template<typename T> ccl_device_inline T max4(const T &a, const T &b, const T &c, const T &d)
{
return max(max(a, b), max(c, d));
}
#endif /* __KERNEL_GPU__ */
ccl_device_inline float min4(float a, float b, float c, float d)
{
return min(min(a, b), min(c, d));
}
ccl_device_inline float max4(float a, float b, float c, float d)
{
return max(max(a, b), max(c, d));
}
#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
/* Int/Float conversion */
ccl_device_inline int as_int(uint i)
{
union {
uint ui;
int i;
} u;
u.ui = i;
return u.i;
}
ccl_device_inline uint as_uint(int i)
{
union {
uint ui;
int i;
} u;
u.i = i;
return u.ui;
}
ccl_device_inline uint as_uint(float f)
{
union {
uint i;
float f;
} u;
u.f = f;
return u.i;
}
# ifndef __HIP__
ccl_device_inline int __float_as_int(float f)
{
union {
int i;
float f;
} u;
u.f = f;
return u.i;
}
ccl_device_inline float __int_as_float(int i)
{
union {
int i;
float f;
} u;
u.i = i;
return u.f;
}
ccl_device_inline uint __float_as_uint(float f)
{
union {
uint i;
float f;
} u;
u.f = f;
return u.i;
}
ccl_device_inline float __uint_as_float(uint i)
{
union {
uint i;
float f;
} u;
u.i = i;
return u.f;
}
# endif
#endif /* !defined(__KERNEL_METAL__) */
#if defined(__KERNEL_METAL__)
ccl_device_forceinline bool isnan_safe(float f)
{
return isnan(f);
}
ccl_device_forceinline bool isfinite_safe(float f)
{
return isfinite(f);
}
#else
template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
{
return ((uint64_t)ptr) & 0xFFFFFFFF;
}
template<typename T> ccl_device_inline uint pointer_pack_to_uint_1(T *ptr)
{
return (((uint64_t)ptr) >> 32) & 0xFFFFFFFF;
}
template<typename T> ccl_device_inline T *pointer_unpack_from_uint(const uint a, const uint b)
{
return (T *)(((uint64_t)b << 32) | a);
}
ccl_device_inline uint uint16_pack_to_uint(const uint a, const uint b)
{
return (a << 16) | b;
}
ccl_device_inline uint uint16_unpack_from_uint_0(const uint i)
{
return i >> 16;
}
ccl_device_inline uint uint16_unpack_from_uint_1(const uint i)
{
return i & 0xFFFF;
}
/* Versions of functions which are safe for fast math. */
ccl_device_inline bool isnan_safe(float f)
{
unsigned int x = __float_as_uint(f);
return (x << 1) > 0xff000000u;
}
ccl_device_inline bool isfinite_safe(float f)
{
/* By IEEE 754 rule, 2*Inf equals Inf */
unsigned int x = __float_as_uint(f);
return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
}
#endif
ccl_device_inline float ensure_finite(float v)
{
return isfinite_safe(v) ? v : 0.0f;
}
#if !defined(__KERNEL_METAL__)
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float clamp(float a, float mn, float mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float mix(float a, float b, float t)
{
return a + t * (b - a);
}
ccl_device_inline float smoothstep(float edge0, float edge1, float x)
{
float result;
if (x < edge0) {
result = 0.0f;
}
else if (x >= edge1) {
result = 1.0f;
}
else {
float t = (x - edge0) / (edge1 - edge0);
result = (3.0f - 2.0f * t) * (t * t);
}
return result;
}
#endif /* !defined(__KERNEL_METAL__) */
#if defined(__KERNEL_CUDA__)
ccl_device_inline float saturatef(float a)
{
return __saturatef(a);
}
#elif !defined(__KERNEL_METAL__)
ccl_device_inline float saturatef(float a)
{
return clamp(a, 0.0f, 1.0f);
}
#endif /* __KERNEL_CUDA__ */
ccl_device_inline int float_to_int(float f)
{
return (int)f;
}
ccl_device_inline int floor_to_int(float f)
{
return float_to_int(floorf(f));
}
ccl_device_inline float floorfrac(float x, ccl_private int *i)
{
float f = floorf(x);
*i = float_to_int(f);
return x - f;
}
ccl_device_inline int ceil_to_int(float f)
{
return float_to_int(ceilf(f));
}
ccl_device_inline float fractf(float x)
{
return x - floorf(x);
}
/* Adapted from `godot-engine` math_funcs.h. */
ccl_device_inline float wrapf(float value, float max, float min)
{
float range = max - min;
return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
}
ccl_device_inline float pingpongf(float a, float b)
{
return (b != 0.0f) ? fabsf(fractf((a - b) / (b * 2.0f)) * b * 2.0f - b) : 0.0f;
}
ccl_device_inline float smoothminf(float a, float b, float k)
{
if (k != 0.0f) {
float h = fmaxf(k - fabsf(a - b), 0.0f) / k;
return fminf(a, b) - h * h * h * k * (1.0f / 6.0f);
}
return fminf(a, b);
}
ccl_device_inline float signf(float f)
{
return (f < 0.0f) ? -1.0f : 1.0f;
}
ccl_device_inline float nonzerof(float f, float eps)
{
if (fabsf(f) < eps) {
return signf(f) * eps;
}
return f;
}
/* The behavior of `atan2(0, 0)` is undefined on many platforms, to ensure consistent behavior, we
* return 0 in this case. See !126951.
* Computes the angle between the positive x axis and the vector pointing from origin to (x, y). */
ccl_device_inline float compatible_atan2(const float y, const float x)
{
return (x == 0.0f && y == 0.0f) ? 0.0f : atan2f(y, x);
}
/* `signum` function testing for zero. Matches GLSL and OSL functions. */
ccl_device_inline float compatible_signf(float f)
{
if (f == 0.0f) {
return 0.0f;
}
return signf(f);
}
ccl_device_inline float smoothstepf(float f)
{
if (f <= 0.0f) {
return 0.0f;
}
if (f >= 1.0f) {
return 1.0f;
}
float ff = f * f;
return (3.0f * ff - 2.0f * ff * f);
}
ccl_device_inline int mod(int x, int m)
{
return (x % m + m) % m;
}
ccl_device_inline float inverse_lerp(float a, float b, float x)
{
return (x - a) / (b - a);
}
/* Cubic interpolation between b and c, a and d are the previous and next point. */
ccl_device_inline float cubic_interp(float a, float b, float c, float d, float x)
{
return 0.5f *
(((d + 3.0f * (b - c) - a) * x + (2.0f * a - 5.0f * b + 4.0f * c - d)) * x +
(c - a)) *
x +
b;
}
/* NaN-safe math ops */
ccl_device_inline float safe_sqrtf(float f)
{
return sqrtf(max(f, 0.0f));
}
ccl_device_inline float inversesqrtf(float f)
{
#if defined(__KERNEL_METAL__)
return (f > 0.0f) ? rsqrt(f) : 0.0f;
#else
return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
#endif
}
ccl_device float safe_asinf(float a)
{
return asinf(clamp(a, -1.0f, 1.0f));
}
ccl_device float safe_acosf(float a)
{
return acosf(clamp(a, -1.0f, 1.0f));
}
ccl_device float compatible_powf(float x, float y)
{
#ifdef __KERNEL_GPU__
if (y == 0.0f) /* x^0 -> 1, including 0^0 */
return 1.0f;
/* GPU pow doesn't accept negative x, do manual checks here */
if (x < 0.0f) {
if (fmodf(-y, 2.0f) == 0.0f)
return powf(-x, y);
else
return -powf(-x, y);
}
else if (x == 0.0f)
return 0.0f;
#endif
return powf(x, y);
}
ccl_device float safe_powf(float a, float b)
{
if (UNLIKELY(a < 0.0f && b != float_to_int(b))) {
return 0.0f;
}
return compatible_powf(a, b);
}
ccl_device float safe_divide(float a, float b)
{
return (b != 0.0f) ? a / b : 0.0f;
}
ccl_device float safe_logf(float a, float b)
{
if (UNLIKELY(a <= 0.0f || b <= 0.0f)) {
return 0.0f;
}
return safe_divide(logf(a), logf(b));
}
ccl_device float safe_modulo(float a, float b)
{
return (b != 0.0f) ? fmodf(a, b) : 0.0f;
}
ccl_device float safe_floored_modulo(float a, float b)
{
return (b != 0.0f) ? a - floorf(a / b) * b : 0.0f;
}
ccl_device_inline float sqr(float a)
{
return a * a;
}
ccl_device_inline float sin_from_cos(const float c)
{
return safe_sqrtf(1.0f - sqr(c));
}
ccl_device_inline float cos_from_sin(const float s)
{
return safe_sqrtf(1.0f - sqr(s));
}
ccl_device_inline float sin_sqr_to_one_minus_cos(const float s_sq)
{
/* Using second-order Taylor expansion at small angles for better accuracy. */
return s_sq > 0.0004f ? 1.0f - safe_sqrtf(1.0f - s_sq) : 0.5f * s_sq;
}
ccl_device_inline float one_minus_cos(const float angle)
{
/* Using second-order Taylor expansion at small angles for better accuracy. */
return angle > 0.02f ? 1.0f - cosf(angle) : 0.5f * sqr(angle);
}
ccl_device_inline float pow20(float a)
{
return sqr(sqr(sqr(sqr(a)) * a));
}
ccl_device_inline float pow22(float a)
{
return sqr(a * sqr(sqr(sqr(a)) * a));
}
#ifdef __KERNEL_METAL__
ccl_device_inline float lgammaf(float x)
{
/* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik
*/
const float _1_180 = 1.0f / 180.0f;
const float log2pi = 1.83787706641f;
const float logx = log(x);
return (log2pi - logx +
x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_180 / pow(x, 6.0f))) - 2.0f)) *
0.5f;
}
#endif
ccl_device_inline float beta(float x, float y)
{
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
}
ccl_device_inline float xor_signmask(float x, int y)
{
return __int_as_float(__float_as_int(x) ^ y);
}
ccl_device float bits_to_01(uint bits)
{
return bits * (1.0f / (float)0xFFFFFFFF);
}
#if !defined(__KERNEL_GPU__)
# if defined(__GNUC__)
ccl_device_inline uint popcount(uint x)
{
return __builtin_popcount(x);
}
# else
ccl_device_inline uint popcount(uint x)
{
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
uint i = x;
i = i - ((i >> 1) & 0x55555555);
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
return i;
}
# endif
#elif defined(__KERNEL_ONEAPI__)
# define popcount(x) sycl::popcount(x)
#elif defined(__KERNEL_HIP__)
/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
# define popcount(x) __popcll(x)
#elif !defined(__KERNEL_METAL__)
# define popcount(x) __popc(x)
#endif
ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __clz(x);
#elif defined(__KERNEL_METAL__)
return clz(x);
#elif defined(__KERNEL_ONEAPI__)
return sycl::clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
unsigned long leading_zero = 0;
_BitScanReverse(&leading_zero, x);
return (31 - leading_zero);
# else
return __builtin_clz(x);
# endif
#endif
}
ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return (__ffs(x) - 1);
#elif defined(__KERNEL_METAL__)
return ctz(x);
#elif defined(__KERNEL_ONEAPI__)
return sycl::ctz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
unsigned long ctz = 0;
_BitScanForward(&ctz, x);
return ctz;
# else
return __builtin_ctz(x);
# endif
#endif
}
ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __ffs(x);
#elif defined(__KERNEL_METAL__)
return (x != 0) ? ctz(x) + 1 : 0;
#else
# ifdef _MSC_VER
return (x != 0) ? (32 - count_leading_zeros(x & (~x + 1))) : 0;
# else
return __builtin_ffs(x);
# endif
#endif
}
/* Compares two floats.
* Returns true if their absolute difference is smaller than abs_diff (for numbers near zero)
* or their relative difference is less than ulp_diff ULPs.
* Based on
* https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
*/
ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
{
if (fabsf(a - b) < abs_diff) {
return true;
}
if ((a < 0.0f) != (b < 0.0f)) {
return false;
}
return (abs(__float_as_int(a) - __float_as_int(b)) < ulp_diff);
}
/* Return value which is greater than the given one and is a power of two. */
ccl_device_inline uint next_power_of_two(uint x)
{
return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
}
/* Return value which is lower than the given one and is a power of two. */
ccl_device_inline uint prev_power_of_two(uint x)
{
return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
}
#ifndef __has_builtin
# define __has_builtin(v) 0
#endif
/* Reverses the bits of a 32 bit integer. */
ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
{
/* Use a native instruction if it exists. */
#if defined(__KERNEL_CUDA__)
return __brev(x);
#elif defined(__KERNEL_METAL__)
return reverse_bits(x);
#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
/* Assume the rbit is always available on 64bit ARM architecture. */
__asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
return x;
#elif defined(__arm__) && ((__ARM_ARCH > 7) || __ARM_ARCH == 6 && __ARM_ARCH_ISA_THUMB >= 2)
/* This ARM instruction is available in ARMv6T2 and above.
* This 32-bit Thumb instruction is available in ARMv6T2 and above. */
__asm__("rbit %0, %1" : "=r"(x) : "r"(x));
return x;
#elif __has_builtin(__builtin_bitreverse32)
return __builtin_bitreverse32(x);
#else
/* Flip pairwise. */
x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
/* Flip pairs. */
x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
/* Flip nibbles. */
x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
/* Flip bytes. CPUs have an instruction for that, pretty fast one. */
# ifdef _MSC_VER
return _byteswap_ulong(x);
# elif defined(__INTEL_COMPILER)
return (uint32_t)_bswap((int)x);
# else
/* Assuming gcc or clang. */
return __builtin_bswap32(x);
# endif
#endif
}
/* Solve quadratic equation a*x^2 + b*x + c = 0, adapted from Mitsuba 3
* The solution is ordered so that x1 <= x2.
* Returns true if at least one solution is found. */
ccl_device_inline bool solve_quadratic(
const float a, const float b, const float c, ccl_private float &x1, ccl_private float &x2)
{
/* If the equation is linear, the solution is -c/b, but b has to be non-zero. */
const bool valid_linear = (a == 0.0f) && (b != 0.0f);
x1 = x2 = -c / b;
const float discriminant = sqr(b) - 4.0f * a * c;
/* Allow slightly negative discriminant in case of numerical precision issues. */
const bool valid_quadratic = (a != 0.0f) && (discriminant > -1e-5f);
if (valid_quadratic) {
/* Numerically stable version of (-b ± sqrt(discriminant)) / (2 * a), avoiding catastrophic
* cancellation when `b` is very close to `sqrt(discriminant)`, by finding the solution of
* greater magnitude which does not suffer from loss of precision, then using the identity
* x1 * x2 = c / a. */
const float temp = -0.5f * (b + copysignf(safe_sqrtf(discriminant), b));
const float r1 = temp / a;
const float r2 = c / temp;
x1 = fminf(r1, r2);
x2 = fmaxf(r1, r2);
}
return (valid_linear || valid_quadratic);
}
/* Defines a closed interval [min, max]. */
template<typename T> struct Interval {
T min;
T max;
ccl_device_inline_method bool is_empty() const
{
return min >= max;
}
ccl_device_inline_method bool contains(T value) const
{
return value >= min && value <= max;
}
ccl_device_inline_method T length() const
{
return max - min;
}
};
/* Computes the intersection of two intervals. */
template<typename T>
ccl_device_inline Interval<T> intervals_intersection(ccl_private const Interval<T> &first,
ccl_private const Interval<T> &second)
{
return {max(first.min, second.min), min(first.max, second.max)};
}
CCL_NAMESPACE_END

View File

@@ -4,8 +4,9 @@
#include "util/math_cdf.h"
#include <cassert>
#include "util/algorithm.h"
#include "util/math.h"
CCL_NAMESPACE_BEGIN

View File

@@ -4,8 +4,7 @@
#pragma once
#include "util/algorithm.h"
#include "util/math.h"
#include "util/math_base.h"
#include "util/vector.h"
CCL_NAMESPACE_BEGIN

View File

@@ -23,6 +23,13 @@
#pragma once
#include "util/math_base.h"
#include "util/math_float3.h"
#include "util/math_float4.h"
#include "util/math_int4.h"
#include "util/types_float3.h"
#include "util/types_float4.h"
CCL_NAMESPACE_BEGIN
ccl_device_inline float madd(const float a, const float b, const float c)
@@ -91,8 +98,9 @@ ccl_device float fast_sinf(float x)
x = madd(qf, -1.2816720341285448015e-12f * 4, x);
x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
float s = x * x;
if ((q & 1) != 0)
if ((q & 1) != 0) {
x = -x;
}
/* This polynomial approximation has very low error on [-pi/2,+pi/2]
* 1.19209e-07 max error in total over [-2pi,+2pi]. */
float u = 2.6083159809786593541503e-06f;
@@ -468,9 +476,7 @@ ccl_device_inline float fast_expm1f(float x)
x = 1.0f - (1.0f - x); /* Crush denormals. */
return madd(0.5f, x * x, x);
}
else {
return fast_expf(x) - 1.0f;
}
return fast_expf(x) - 1.0f;
}
ccl_device float fast_sinhf(float x)
@@ -482,17 +488,15 @@ ccl_device float fast_sinhf(float x)
float e = fast_expf(a);
return copysignf(0.5f * e - 0.5f / e, x);
}
else {
a = 1.0f - (1.0f - a); /* Crush denorms. */
float a2 = a * a;
/* Degree 7 polynomial generated with sollya. */
/* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
float r = 2.03945513931e-4f;
r = madd(r, a2, 8.32990277558e-3f);
r = madd(r, a2, 0.1666673421859f);
r = madd(r * a, a2, a);
return copysignf(r, x);
}
a = 1.0f - (1.0f - a); /* Crush denorms. */
float a2 = a * a;
/* Degree 7 polynomial generated with sollya. */
/* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
float r = 2.03945513931e-4f;
r = madd(r, a2, 8.32990277558e-3f);
r = madd(r, a2, 0.1666673421859f);
r = madd(r * a, a2, a);
return copysignf(r, x);
}
ccl_device_inline float fast_coshf(float x)
@@ -516,10 +520,12 @@ ccl_device_inline float fast_tanhf(float x)
ccl_device float fast_safe_powf(float x, float y)
{
if (y == 0)
if (y == 0) {
return 1.0f; /* x^1=1 */
if (x == 0)
}
if (x == 0) {
return 0.0f; /* 0^y=0 */
}
float sign = 1.0f;
if (x < 0.0f) {
/* if x is negative, only deal with integer powers

View File

@@ -4,9 +4,9 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/math_base.h"
#include "util/types_float2.h"
#include "util/types_float4.h"
CCL_NAMESPACE_BEGIN

View File

@@ -5,9 +5,10 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/math_base.h"
#include "util/math_float4.h"
#include "util/types_float3.h"
#include "util/types_float4.h"
CCL_NAMESPACE_BEGIN
@@ -384,10 +385,10 @@ ccl_device_inline float3 reflect(const float3 incident, const float3 unit_normal
ccl_device_inline float3 refract(const float3 incident, const float3 normal, const float eta)
{
float k = 1.0f - eta * eta * (1.0f - dot(normal, incident) * dot(normal, incident));
if (k < 0.0f)
if (k < 0.0f) {
return zero_float3();
else
return eta * incident - (eta * dot(normal, incident) + sqrt(k)) * normal;
}
return eta * incident - (eta * dot(normal, incident) + sqrt(k)) * normal;
}
ccl_device_inline float3 faceforward(const float3 vector,
@@ -498,13 +499,128 @@ ccl_device_inline bool isfinite_safe(float3 v)
ccl_device_inline float3 ensure_finite(float3 v)
{
if (!isfinite_safe(v.x))
if (!isfinite_safe(v.x)) {
v.x = 0.0f;
if (!isfinite_safe(v.y))
}
if (!isfinite_safe(v.y)) {
v.y = 0.0f;
if (!isfinite_safe(v.z))
}
if (!isfinite_safe(v.z)) {
v.z = 0.0f;
}
return v;
}
/* Triangle */
ccl_device_inline float triangle_area(ccl_private const float3 &v1,
ccl_private const float3 &v2,
ccl_private const float3 &v3)
{
return len(cross(v3 - v2, v1 - v2)) * 0.5f;
}
/* Orthonormal vectors */
ccl_device_inline void make_orthonormals(const float3 N,
ccl_private float3 *a,
ccl_private float3 *b)
{
#if 0
if (fabsf(N.y) >= 0.999f) {
*a = make_float3(1, 0, 0);
*b = make_float3(0, 0, 1);
return;
}
if (fabsf(N.z) >= 0.999f) {
*a = make_float3(1, 0, 0);
*b = make_float3(0, 1, 0);
return;
}
#endif
if (N.x != N.y || N.x != N.z) {
*a = make_float3(N.z - N.y, N.x - N.z, N.y - N.x); //(1,1,1)x N
}
else {
*a = make_float3(N.z - N.y, N.x + N.z, -N.y - N.x); //(-1,1,1)x N
}
*a = normalize(*a);
*b = cross(N, *a);
}
/* Rotation of point around axis and angle */
ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle)
{
float costheta = cosf(angle);
float sintheta = sinf(angle);
float3 r;
r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) +
(((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
(((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) +
((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
(((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) +
(((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
return r;
}
/* Calculate the angle between the two vectors a and b.
* The usual approach `acos(dot(a, b))` has severe precision issues for small angles,
* which are avoided by this method.
* Based on "Mangled Angles" from https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf
*/
ccl_device_inline float precise_angle(float3 a, float3 b)
{
return 2.0f * atan2f(len(a - b), len(a + b));
}
/* Tangent of the angle between vectors a and b. */
ccl_device_inline float tan_angle(float3 a, float3 b)
{
return len(cross(a, b)) / dot(a, b);
}
/* projections */
ccl_device_inline float2 map_to_tube(const float3 co)
{
float len, u, v;
len = sqrtf(co.x * co.x + co.y * co.y);
if (len > 0.0f) {
u = (1.0f - (atan2f(co.x / len, co.y / len) / M_PI_F)) * 0.5f;
v = (co.z + 1.0f) * 0.5f;
}
else {
u = v = 0.0f;
}
return make_float2(u, v);
}
ccl_device_inline float2 map_to_sphere(const float3 co)
{
float l = dot(co, co);
float u, v;
if (l > 0.0f) {
if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
u = 0.0f; /* Otherwise domain error. */
}
else {
u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F);
}
v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F;
}
else {
u = v = 0.0f;
}
return make_float2(u, v);
}
CCL_NAMESPACE_END

View File

@@ -5,9 +5,8 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/math_base.h"
#include "util/types_float4.h"
CCL_NAMESPACE_BEGIN
@@ -244,41 +243,40 @@ ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
}
#ifdef __KERNEL_SSE__
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const float4 shuffle(const float4 b)
template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline float4 shuffle(const float4 a)
{
# ifdef __KERNEL_NEON__
return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a.m128));
# else
return float4(
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0))));
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
template<> __forceinline float4 shuffle<0, 1, 0, 1>(const float4 a)
{
return float4(_mm_movelh_ps(a, a));
}
template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
template<> __forceinline float4 shuffle<2, 3, 2, 3>(const float4 a)
{
return float4(_mm_movehl_ps(a, a));
}
# ifdef __KERNEL_SSE3__
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
template<> __forceinline float4 shuffle<0, 0, 2, 2>(const float4 a)
{
return float4(_mm_moveldup_ps(b));
return float4(_mm_moveldup_ps(a));
}
template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
template<> __forceinline float4 shuffle<1, 1, 3, 3>(const float4 a)
{
return float4(_mm_movehdup_ps(b));
return float4(_mm_movehdup_ps(a));
}
# endif /* __KERNEL_SSE3__ */
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const float4 shuffle(const float4 a, const float4 b)
__forceinline float4 shuffle(const float4 a, const float4 b)
{
# ifdef __KERNEL_NEON__
return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
@@ -287,11 +285,11 @@ __forceinline const float4 shuffle(const float4 a, const float4 b)
# endif
}
template<size_t i0> __forceinline const float4 shuffle(const float4 b)
template<size_t i0> __forceinline float4 shuffle(const float4 a)
{
return shuffle<i0, i0, i0, i0>(b);
return shuffle<i0, i0, i0, i0>(a);
}
template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
template<size_t i0> __forceinline float4 shuffle(const float4 a, const float4 b)
{
# ifdef __KERNEL_NEON__
return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
@@ -300,12 +298,12 @@ template<size_t i0> __forceinline const float4 shuffle(const float4 a, const flo
# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
template<> __forceinline float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
return float4(_mm_movelh_ps(a, b));
}
template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
template<> __forceinline float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
return float4(_mm_movehl_ps(b, a));
}
@@ -592,14 +590,18 @@ ccl_device_inline bool isfinite_safe(float4 v)
ccl_device_inline float4 ensure_finite(float4 v)
{
if (!isfinite_safe(v.x))
if (!isfinite_safe(v.x)) {
v.x = 0.0f;
if (!isfinite_safe(v.y))
}
if (!isfinite_safe(v.y)) {
v.y = 0.0f;
if (!isfinite_safe(v.z))
}
if (!isfinite_safe(v.z)) {
v.z = 0.0f;
if (!isfinite_safe(v.w))
}
if (!isfinite_safe(v.w)) {
v.w = 0.0f;
}
return v;
}
@@ -609,4 +611,27 @@ ccl_device_inline float4 power(float4 v, float e)
return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.w, e));
}
#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
/* Int/Float conversion */
ccl_device_inline int4 __float4_as_int4(float4 f)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
# else
return make_int4(
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
# endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
# ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
# else
return make_float4(
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
# endif
}
#endif /* !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__) */
CCL_NAMESPACE_END

View File

@@ -5,9 +5,9 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/math_base.h"
#include "util/types_float8.h"
#include "util/types_int8.h"
CCL_NAMESPACE_BEGIN

View File

@@ -4,9 +4,8 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/defines.h"
#include "util/types_int2.h"
CCL_NAMESPACE_BEGIN

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_int3.h"
CCL_NAMESPACE_BEGIN

View File

@@ -5,9 +5,8 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_float4.h"
#include "util/types_int4.h"
CCL_NAMESPACE_BEGIN

View File

@@ -5,9 +5,9 @@
#pragma once
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/math_base.h"
#include "util/types_float8.h"
#include "util/types_int8.h"
CCL_NAMESPACE_BEGIN

View File

@@ -4,6 +4,10 @@
#pragma once
#include "util/math_float2.h"
#include "util/math_float3.h"
#include "util/math_float4.h"
CCL_NAMESPACE_BEGIN
/* Ray Intersection */
@@ -287,16 +291,20 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
}
/* Store the result. */
/* TODO(sergey): Check whether we can avoid some checks here. */
if (isect_P != nullptr)
if (isect_P != nullptr) {
*isect_P = hit;
if (isect_t != nullptr)
}
if (isect_t != nullptr) {
*isect_t = t;
}
/* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
if (isect_u != nullptr)
if (isect_u != nullptr) {
*isect_u = v + 0.5f;
if (isect_v != nullptr)
}
if (isect_v != nullptr) {
*isect_v = -u - v;
}
return true;
}

View File

@@ -4,6 +4,11 @@
#pragma once
#include "util/atomic.h"
#include "util/math_base.h"
#include "util/math_float3.h"
#include "util/math_float4.h"
CCL_NAMESPACE_BEGIN
#define MAT(A, size, row, col) A[(row) * (size) + (col)]
@@ -92,7 +97,7 @@ ccl_device_inline void math_vec3_add(ccl_private float3 *v, int n, ccl_private f
}
ccl_device_inline void math_vec3_add_strided(
ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
ccl_global float3 *v, int n, ccl_private const float *x, float3 w, int stride)
{
for (int i = 0; i < n; i++) {
ccl_global float *elem = (ccl_global float *)(v + i * stride);
@@ -215,16 +220,18 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A,
/* Use forward substitution to solve L*b = y, replacing y by b. */
for (int row = 0; row < n; row++) {
float3 sum = VECS(y, row, stride);
for (int col = 0; col < row; col++)
for (int col = 0; col < row; col++) {
sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
}
VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
}
/* Use backward substitution to solve Lt*S = b, replacing b by S. */
for (int row = n - 1; row >= 0; row--) {
float3 sum = VECS(y, row, stride);
for (int col = row + 1; col < n; col++)
for (int col = row + 1; col < n; col++) {
sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
}
VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
}
}
@@ -430,7 +437,7 @@ ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restr
{
for (int row = 0; row < n; row++) {
for (int col = 0; col <= row; col++) {
MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col));
}
}
}

View File

@@ -4,6 +4,8 @@
#pragma once
#include "util/defines.h"
CCL_NAMESPACE_BEGIN
ccl_device_forceinline bool projection_inverse_impl(ccl_private float R[4][4],

View File

@@ -4,7 +4,8 @@
#pragma once
#include "util/types.h"
#include "util/math_base.h"
#include "util/types_int4.h"
CCL_NAMESPACE_BEGIN

View File

@@ -10,7 +10,8 @@
CCL_NAMESPACE_BEGIN
/* Stack allocator for the use with STL. */
template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {
template<int SIZE, typename T> class ccl_try_align(16) StackAllocator
{
public:
typedef size_t size_type;
typedef ptrdiff_t difference_type;
@@ -58,7 +59,7 @@ template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {
return mem;
}
void deallocate(T *p, size_t n)
void deallocate(T * p, size_t n)
{
if (p == nullptr) {
return;
@@ -77,7 +78,7 @@ template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {
/* Address of an reference. */
T *address(T &x) const
T *address(T & x) const
{
return &x;
}
@@ -89,14 +90,14 @@ template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {
/* Object construction/destruction. */
void construct(T *p, const T &val)
void construct(T * p, const T &val)
{
if (p != nullptr) {
new ((T *)p) T(val);
}
}
void destroy(T *p)
void destroy(T * p)
{
p->~T();
}

View File

@@ -2,134 +2,28 @@
*
* SPDX-License-Identifier: Apache-2.0 */
#ifndef __UTIL_TYPES_H__
#define __UTIL_TYPES_H__
#pragma once
#if !defined(__KERNEL_METAL__)
# include <stdlib.h>
#endif
#include "util/types_base.h" // IWYU pragma: export
/* Standard Integer Types */
#include "util/types_uchar2.h" // IWYU pragma: export
#include "util/types_uchar3.h" // IWYU pragma: export
#include "util/types_uchar4.h" // IWYU pragma: export
#if !defined(__KERNEL_GPU__)
# include <stdint.h>
# include <stdio.h>
#endif
#include "util/types_int2.h" // IWYU pragma: export
#include "util/types_int3.h" // IWYU pragma: export
#include "util/types_int4.h" // IWYU pragma: export
#include "util/types_int8.h" // IWYU pragma: export
#include "util/defines.h"
#include "util/types_uint2.h" // IWYU pragma: export
#include "util/types_uint3.h" // IWYU pragma: export
#include "util/types_uint4.h" // IWYU pragma: export
#ifndef __KERNEL_GPU__
# include "util/optimization.h"
# include "util/simd.h"
#endif
#include "util/types_ushort4.h" // IWYU pragma: export
CCL_NAMESPACE_BEGIN
#include "util/types_float2.h" // IWYU pragma: export
#include "util/types_float3.h" // IWYU pragma: export
#include "util/types_float4.h" // IWYU pragma: export
#include "util/types_float8.h" // IWYU pragma: export
/* Types
*
* Define simpler unsigned type names, and integer with defined number of bits.
* Also vector types, named to be compatible with OpenCL builtin types, while
* working for CUDA and C++ too. */
/* Shorter Unsigned Names */
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned short ushort;
/* Fixed Bits Types */
#ifndef __KERNEL_GPU__
/* Generic Memory Pointer */
typedef uint64_t device_ptr;
#endif /* __KERNEL_GPU__ */
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
{
return (offset + alignment - 1) & ~(alignment - 1);
}
ccl_device_inline size_t divide_up(size_t x, size_t y)
{
return (x + y - 1) / y;
}
ccl_device_inline size_t round_up(size_t x, size_t multiple)
{
return ((x + multiple - 1) / multiple) * multiple;
}
ccl_device_inline size_t round_down(size_t x, size_t multiple)
{
return (x / multiple) * multiple;
}
ccl_device_inline bool is_power_of_two(size_t x)
{
return (x & (x - 1)) == 0;
}
CCL_NAMESPACE_END
/* Device side printf only tested on CUDA, may work on more GPU devices. */
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__)
# define __KERNEL_PRINTF__
#endif
ccl_device_inline void print_float(ccl_private const char *label, const float a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f\n", label, (double)a);
#endif
}
/* Most GPU APIs matching native vector types, so we only need to implement them for
* CPU and oneAPI. */
#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
# define __KERNEL_NATIVE_VECTOR_TYPES__
#endif
/* Vectorized types declaration. */
#include "util/types_uchar2.h"
#include "util/types_uchar3.h"
#include "util/types_uchar4.h"
#include "util/types_int2.h"
#include "util/types_int3.h"
#include "util/types_int4.h"
#include "util/types_int8.h"
#include "util/types_uint2.h"
#include "util/types_uint3.h"
#include "util/types_uint4.h"
#include "util/types_ushort4.h"
#include "util/types_float2.h"
#include "util/types_float3.h"
#include "util/types_float4.h"
#include "util/types_float8.h"
#include "util/types_spectrum.h"
/* Vectorized types implementation. */
#include "util/types_uchar2_impl.h"
#include "util/types_uchar3_impl.h"
#include "util/types_uchar4_impl.h"
#include "util/types_int2_impl.h"
#include "util/types_int3_impl.h"
#include "util/types_int4_impl.h"
#include "util/types_int8_impl.h"
#include "util/types_uint2_impl.h"
#include "util/types_uint3_impl.h"
#include "util/types_uint4_impl.h"
#include "util/types_float2_impl.h"
#include "util/types_float3_impl.h"
#include "util/types_float4_impl.h"
#include "util/types_float8_impl.h"
#endif /* __UTIL_TYPES_H__ */
#include "util/types_spectrum.h" // IWYU pragma: export

View File

@@ -0,0 +1,90 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#if !defined(__KERNEL_METAL__)
# include <cstdlib>
#endif
/* Standard Integer Types */
#if !defined(__KERNEL_GPU__)
# include <cstdint>
# include <cstdio>
#endif
#include "util/defines.h"
#ifndef __KERNEL_GPU__
# include "util/optimization.h"
# include "util/simd.h"
#endif
CCL_NAMESPACE_BEGIN
/* Types
*
* Define simpler unsigned type names, and integer with defined number of bits.
* Also vector types, named to be compatible with OpenCL builtin types, while
* working for CUDA and C++ too. */
/* Shorter Unsigned Names */
using uchar = unsigned char;
using uint = unsigned int;
using ushort = unsigned short;
/* Fixed Bits Types */
#ifndef __KERNEL_GPU__
/* Generic Memory Pointer */
using device_ptr = uint64_t;
#endif /* __KERNEL_GPU__ */
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
{
return (offset + alignment - 1) & ~(alignment - 1);
}
ccl_device_inline size_t divide_up(size_t x, size_t y)
{
return (x + y - 1) / y;
}
ccl_device_inline size_t round_up(size_t x, size_t multiple)
{
return ((x + multiple - 1) / multiple) * multiple;
}
ccl_device_inline size_t round_down(size_t x, size_t multiple)
{
return (x / multiple) * multiple;
}
ccl_device_inline bool is_power_of_two(size_t x)
{
return (x & (x - 1)) == 0;
}
CCL_NAMESPACE_END
/* Device side printf only tested on CUDA, may work on more GPU devices. */
#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__)
# define __KERNEL_PRINTF__
#endif
ccl_device_inline void print_float(ccl_private const char *label, const float a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f\n", label, (double)a);
#endif
}
/* Most GPU APIs matching native vector types, so we only need to implement them for
* CPU and oneAPI. */
#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
# define __KERNEL_NATIVE_VECTOR_TYPES__
#endif

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -20,10 +18,34 @@ struct float2 {
# endif
};
ccl_device_inline float2 make_float2(float x, float y);
# ifndef __KERNEL_GPU__
__forceinline float float2::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
__forceinline float &float2::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
# endif
ccl_device_inline float2 make_float2(float x, float y)
{
float2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline float2 make_float2(float3 a);
ccl_device_inline void print_float2(ccl_private const char *label, const float2 a);
ccl_device_inline void print_float2(ccl_private const char *label, const float2 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
#endif
}
CCL_NAMESPACE_END

View File

@@ -1,49 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
__forceinline float float2::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
__forceinline float &float2::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
# endif
ccl_device_inline float2 make_float2(float x, float y)
{
float2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline float2 make_float2(float3 a)
{
return make_float2(a.x, a.y);
}
ccl_device_inline void print_float2(ccl_private const char *label, const float2 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
#endif
}
CCL_NAMESPACE_END

View File

@@ -4,9 +4,9 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
#include "util/types_float2.h"
#include "util/types_int4.h"
CCL_NAMESPACE_BEGIN
@@ -37,33 +37,107 @@ struct ccl_try_align(16) float3
# ifdef __KERNEL_SSE__
/* Convenient constructors and operators for SIMD, otherwise default is enough. */
__forceinline float3();
__forceinline float3(const float3 &a);
__forceinline explicit float3(const __m128 &a);
__forceinline float3() = default;
__forceinline float3(const float3 &a) = default;
__forceinline explicit float3(const __m128 &a) : m128(a) {}
__forceinline operator const __m128 &() const;
__forceinline operator __m128 &();
__forceinline operator const __m128 &() const
{
return m128;
}
__forceinline operator __m128 &()
{
return m128;
}
__forceinline float3 &operator=(const float3 &a);
__forceinline float3 &operator=(const float3 &a)
{
m128 = a.m128;
return *this;
}
# endif
# ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
__forceinline float operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
__forceinline float &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
# endif
};
ccl_device_inline float3 make_float3(float x, float y, float z);
ccl_device_inline float3 make_float3(float x, float y, float z)
{
# if defined(__KERNEL_GPU__)
return {x, y, z};
# elif defined(__KERNEL_SSE__)
return float3(_mm_set_ps(0.0f, z, y, x));
# else
return {x, y, z, 0.0f};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline float3 make_float3(float f);
ccl_device_inline float3 make_float3(float4 a);
ccl_device_inline float3 make_float3(float2 a);
ccl_device_inline float3 make_float3(float2 a, float b);
ccl_device_inline void print_float3(ccl_private const char *label, const float3 a);
ccl_device_inline float3 make_float3(float f)
{
#if defined(__KERNEL_GPU__)
return make_float3(f, f, f);
#elif defined(__KERNEL_SSE__)
return float3(_mm_set1_ps(f));
#else
return {f, f, f, f};
#endif
}
/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
ccl_device_inline float3 make_float3(float2 a)
{
return make_float3(a.x, a.y, 0.0f);
}
ccl_device_inline float3 make_float3(float2 a, float b)
{
return make_float3(a.x, a.y, b);
}
ccl_device_inline void print_float3(ccl_private const char *label, const float3 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
#else
(void)label;
(void)a;
#endif
}
ccl_device_inline float2 make_float2(float3 a)
{
return make_float2(a.x, a.y);
}
ccl_device_inline int4 make_int4(const float3 f)
{
#if defined(__KERNEL_GPU__)
return make_int4((int)f.x, (int)f.y, (int)f.z, 0);
#elif defined(__KERNEL_SSE__)
return int4(_mm_cvtps_epi32(f.m128));
#else
return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
#endif
}
/* Packed float3
*
* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
* CPU SIMD instructions can be used. */
#if defined(__KERNEL_METAL__)
/* Metal has native packed_float3. */
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) || defined(__KERNEL_ONEAPI__)
@@ -71,7 +145,7 @@ ccl_device_inline void print_float3(ccl_private const char *label, const float3
typedef float3 packed_float3;
#else
struct packed_float3 {
ccl_device_inline_method packed_float3(){};
ccl_device_inline_method packed_float3() = default;
ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z) {}

View File

@@ -1,100 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline float3::float3() {}
__forceinline float3::float3(const float3 &a) : m128(a.m128) {}
__forceinline float3::float3(const __m128 &a) : m128(a) {}
__forceinline float3::operator const __m128 &() const
{
return m128;
}
__forceinline float3::operator __m128 &()
{
return m128;
}
__forceinline float3 &float3::operator=(const float3 &a)
{
m128 = a.m128;
return *this;
}
# endif /* __KERNEL_SSE__ */
# ifndef __KERNEL_GPU__
__forceinline float float3::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
__forceinline float &float3::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
# endif
ccl_device_inline float3 make_float3(float x, float y, float z)
{
# if defined(__KERNEL_GPU__)
return {x, y, z};
# elif defined(__KERNEL_SSE__)
return float3(_mm_set_ps(0.0f, z, y, x));
# else
return {x, y, z, 0.0f};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline float3 make_float3(float f)
{
#if defined(__KERNEL_GPU__)
return make_float3(f, f, f);
#elif defined(__KERNEL_SSE__)
return float3(_mm_set1_ps(f));
#else
return {f, f, f, f};
#endif
}
ccl_device_inline float3 make_float3(float4 a)
{
return make_float3(a.x, a.y, a.z);
}
ccl_device_inline float3 make_float3(float2 a)
{
return make_float3(a.x, a.y, 0.0f);
}
ccl_device_inline float3 make_float3(float2 a, float b)
{
return make_float3(a.x, a.y, b);
}
ccl_device_inline void print_float3(ccl_private const char *label, const float3 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
#endif
}
CCL_NAMESPACE_END

View File

@@ -4,9 +4,9 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
#include "util/types_float3.h"
#include "util/types_int4.h"
CCL_NAMESPACE_BEGIN
@@ -23,30 +23,103 @@ struct ccl_try_align(16) float4
};
};
__forceinline float4();
__forceinline explicit float4(const __m128 &a);
__forceinline float4() = default;
__forceinline float4(const float4 &a) = default;
__forceinline explicit float4(const __m128 &a) : m128(a) {}
__forceinline operator const __m128 &() const;
__forceinline operator __m128 &();
__forceinline operator const __m128 &() const
{
return m128;
}
__forceinline operator __m128 &()
{
return m128;
}
__forceinline float4 &operator=(const float4 &a);
__forceinline float4 &operator=(const float4 &a)
{
m128 = a.m128;
return *this;
}
# else /* __KERNEL_SSE__ */
float x, y, z, w;
# endif /* __KERNEL_SSE__ */
# ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
__forceinline float operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
__forceinline float &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
# endif
};
ccl_device_inline float4 make_float4(float x, float y, float z, float w);
ccl_device_inline float4 make_float4(float x, float y, float z, float w)
{
# ifdef __KERNEL_SSE__
return float4(_mm_set_ps(w, z, y, x));
# else
return {x, y, z, w};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline float4 make_float4(float f);
ccl_device_inline float4 make_float4(float3 a, float b);
ccl_device_inline float4 make_float4(const int4 i);
ccl_device_inline void print_float4(ccl_private const char *label, const float4 a);
ccl_device_inline float4 make_float4(float f)
{
#ifdef __KERNEL_SSE__
return float4(_mm_set1_ps(f));
#else
return make_float4(f, f, f, f);
#endif
}
ccl_device_inline float4 make_float4(float3 a, float b)
{
return make_float4(a.x, a.y, a.z, b);
}
ccl_device_inline float4 make_float4(float3 a)
{
return make_float4(a.x, a.y, a.z, 1.0f);
}
ccl_device_inline float4 make_float4(const int4 i)
{
#ifdef __KERNEL_SSE__
return float4(_mm_cvtepi32_ps(i.m128));
#else
return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w);
#endif
}
ccl_device_inline float3 make_float3(float4 a)
{
return make_float3(a.x, a.y, a.z);
}
ccl_device_inline int4 make_int4(const float4 f)
{
#ifdef __KERNEL_SSE__
return int4(_mm_cvtps_epi32(f.m128));
#else
return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
#endif
}
ccl_device_inline void print_float4(ccl_private const char *label, const float4 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
#endif
}
CCL_NAMESPACE_END

View File

@@ -1,98 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline float4::float4() {}
__forceinline float4::float4(const __m128 &a) : m128(a) {}
__forceinline float4::operator const __m128 &() const
{
return m128;
}
__forceinline float4::operator __m128 &()
{
return m128;
}
__forceinline float4 &float4::operator=(const float4 &a)
{
m128 = a.m128;
return *this;
}
# endif /* __KERNEL_SSE__ */
# ifndef __KERNEL_GPU__
__forceinline float float4::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
__forceinline float &float4::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
# endif
ccl_device_inline float4 make_float4(float x, float y, float z, float w)
{
# ifdef __KERNEL_SSE__
return float4(_mm_set_ps(w, z, y, x));
# else
return {x, y, z, w};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline float4 make_float4(float f)
{
#ifdef __KERNEL_SSE__
return float4(_mm_set1_ps(f));
#else
return make_float4(f, f, f, f);
#endif
}
ccl_device_inline float4 make_float4(float3 a, float b)
{
return make_float4(a.x, a.y, a.z, b);
}
ccl_device_inline float4 make_float4(float3 a)
{
return make_float4(a.x, a.y, a.z, 1.0f);
}
ccl_device_inline float4 make_float4(const int4 i)
{
#ifdef __KERNEL_SSE__
return float4(_mm_cvtepi32_ps(i.m128));
#else
return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w);
#endif
}
ccl_device_inline void print_float4(ccl_private const char *label, const float4 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
#endif
}
CCL_NAMESPACE_END

View File

@@ -7,9 +7,9 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
#include "util/types_float4.h"
#include "util/types_int8.h"
CCL_NAMESPACE_BEGIN
@@ -30,30 +30,99 @@ struct ccl_try_align(32) vfloat8
};
};
__forceinline vfloat8();
__forceinline vfloat8(const vfloat8 &a);
__forceinline explicit vfloat8(const __m256 &a);
__forceinline vfloat8() = default;
__forceinline vfloat8(const vfloat8 &a) = default;
__forceinline explicit vfloat8(const __m256 &a) : m256(a) {}
__forceinline operator const __m256 &() const;
__forceinline operator __m256 &();
__forceinline operator const __m256 &() const
{
return m256;
}
__forceinline operator __m256 &()
{
return m256;
}
__forceinline vfloat8 &operator=(const vfloat8 &a);
__forceinline vfloat8 &operator=(const vfloat8 &a)
{
m256 = a.m256;
return *this;
}
#else /* __KERNEL_AVX__ */
float a, b, c, d, e, f, g, h;
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
__forceinline float operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline float &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
#endif
};
ccl_device_inline vfloat8 make_vfloat8(float f);
ccl_device_inline vfloat8
make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h);
ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b);
ccl_device_inline vfloat8 make_vfloat8(float f)
{
#ifdef __KERNEL_AVX__
vfloat8 r(_mm256_set1_ps(f));
#else
vfloat8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a);
ccl_device_inline vfloat8
make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
{
#ifdef __KERNEL_AVX__
vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#else
vfloat8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
#else
return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
#endif
}
ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
label,
(double)a.a,
(double)a.b,
(double)a.c,
(double)a.d,
(double)a.e,
(double)a.f,
(double)a.g,
(double)a.h);
#endif
}
ccl_device_inline vint8 make_vint8(const vfloat8 f)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_cvtps_epi32(f.m256));
#else
return make_vint8(
(int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
#endif
}
CCL_NAMESPACE_END

View File

@@ -1,102 +0,0 @@
/* SPDX-FileCopyrightText: 2017 Intel Corporation
* SPDX-FileCopyrightText: 2018-2022 Blender Foundation
*
* SPDX-License-Identifier: BSD-3-Clause
*
* Originally by Intel Corporation, modified by the Blender Foundation. */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_AVX__
__forceinline vfloat8::vfloat8() {}
__forceinline vfloat8::vfloat8(const vfloat8 &f) : m256(f.m256) {}
__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f) {}
__forceinline vfloat8::operator const __m256 &() const
{
return m256;
}
__forceinline vfloat8::operator __m256 &()
{
return m256;
}
__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f)
{
m256 = f.m256;
return *this;
}
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float vfloat8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline float &vfloat8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
#endif
ccl_device_inline vfloat8 make_vfloat8(float f)
{
#ifdef __KERNEL_AVX__
vfloat8 r(_mm256_set1_ps(f));
#else
vfloat8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
ccl_device_inline vfloat8
make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
{
#ifdef __KERNEL_AVX__
vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#else
vfloat8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
#else
return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
#endif
}
ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
label,
(double)a.a,
(double)a.b,
(double)a.c,
(double)a.d,
(double)a.e,
(double)a.f,
(double)a.g,
(double)a.h);
#endif
}
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,27 @@ struct int2 {
int x, y;
# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
__forceinline int operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
__forceinline int &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
# endif
};
ccl_device_inline int2 make_int2(int x, int y);
ccl_device_inline int2 make_int2(int x, int y)
{
int2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -1,37 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
int int2::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
int &int2::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
# endif
ccl_device_inline int2 make_int2(int x, int y)
{
int2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -26,30 +24,77 @@ struct ccl_try_align(16) int3
};
};
__forceinline int3();
__forceinline int3(const int3 &a);
__forceinline explicit int3(const __m128i &a);
__forceinline int3() = default;
__forceinline int3(const int3 &a) = default;
__forceinline explicit int3(const __m128i &a) : m128(a) {}
__forceinline operator const __m128i &() const;
__forceinline operator __m128i &();
__forceinline operator const __m128i &() const
{
return m128;
}
__forceinline int3 &operator=(const int3 &a);
__forceinline operator __m128i &()
{
return m128;
}
__forceinline int3 &operator=(const int3 &a)
{
m128 = a.m128;
return *this;
}
# else /* __KERNEL_SSE__ */
int x, y, z, w;
# endif /* __KERNEL_SSE__ */
# endif
# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
__forceinline int operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
__forceinline int &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
# endif
};
ccl_device_inline int3 make_int3(int x, int y, int z);
ccl_device_inline int3 make_int3(int x, int y, int z)
{
# if defined(__KERNEL_GPU__)
return {x, y, z};
# elif defined(__KERNEL_SSE__)
return int3(_mm_set_epi32(0, z, y, x));
# else
return {x, y, z, 0};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline int3 make_int3(int i);
ccl_device_inline void print_int3(ccl_private const char *label, const int3 a);
ccl_device_inline int3 make_int3(int i)
{
#if defined(__KERNEL_GPU__)
return make_int3(i, i, i);
#elif defined(__KERNEL_SSE__)
return int3(_mm_set1_epi32(i));
#else
return {i, i, i, i};
#endif
}
ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
#endif
}
#if defined(__KERNEL_METAL__)
/* Metal has native packed_int3. */
@@ -61,7 +106,7 @@ typedef int3 packed_int3;
struct packed_int3 {
int x, y, z;
ccl_device_inline_method packed_int3(){};
ccl_device_inline_method packed_int3() = default;
ccl_device_inline_method packed_int3(const int px, const int py, const int pz)
: x(px), y(py), z(pz){};
@@ -82,12 +127,27 @@ struct packed_int3 {
}
# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
__forceinline int operator[](int i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline int &operator[](int i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
};
static_assert(sizeof(packed_int3) == 12, "packed_int3 expected to be exactly 12 bytes");
#endif
ccl_device_inline packed_int3 make_packed_int3(int x, int y, int z)
{
packed_int3 a = {x, y, z};
return a;
}
CCL_NAMESPACE_END

View File

@@ -1,103 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline int3::int3() {}
__forceinline int3::int3(const __m128i &a) : m128(a) {}
__forceinline int3::int3(const int3 &a) : m128(a.m128) {}
__forceinline int3::operator const __m128i &() const
{
return m128;
}
__forceinline int3::operator __m128i &()
{
return m128;
}
__forceinline int3 &int3::operator=(const int3 &a)
{
m128 = a.m128;
return *this;
}
# endif /* __KERNEL_SSE__ */
# ifndef __KERNEL_GPU__
__forceinline int int3::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
__forceinline int &int3::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
__forceinline int packed_int3::operator[](int i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline int &packed_int3::operator[](int i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
ccl_device_inline int3 make_int3(int x, int y, int z)
{
# if defined(__KERNEL_GPU__)
return {x, y, z};
# elif defined(__KERNEL_SSE__)
return int3(_mm_set_epi32(0, z, y, x));
# else
return {x, y, z, 0};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline int3 make_int3(int i)
{
#if defined(__KERNEL_GPU__)
return make_int3(i, i, i);
#elif defined(__KERNEL_SSE__)
return int3(_mm_set1_epi32(i));
#else
return {i, i, i, i};
#endif
}
ccl_device_inline packed_int3 make_packed_int3(int x, int y, int z)
{
packed_int3 a = {x, y, z};
return a;
}
ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
#endif
}
CCL_NAMESPACE_END

View File

@@ -4,17 +4,12 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct float3;
struct float4;
struct ccl_try_align(16) int4
{
# ifdef __KERNEL_SSE__
@@ -25,30 +20,68 @@ struct ccl_try_align(16) int4
};
};
__forceinline int4();
__forceinline int4(const int4 &a);
__forceinline explicit int4(const __m128i &a);
__forceinline int4() = default;
__forceinline int4(const int4 &a) = default;
__forceinline explicit int4(const __m128i &a) : m128(a) {}
__forceinline operator const __m128i &() const;
__forceinline operator __m128i &();
__forceinline operator const __m128i &() const
{
return m128;
}
__forceinline operator __m128i &()
{
return m128;
}
__forceinline int4 &operator=(const int4 &a);
__forceinline int4 &operator=(const int4 &a)
{
m128 = a.m128;
return *this;
}
# else /* __KERNEL_SSE__ */
int x, y, z, w;
# endif /* __KERNEL_SSE__ */
# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
__forceinline int operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
__forceinline int &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
# endif
};
ccl_device_inline int4 make_int4(int x, int y, int z, int w);
ccl_device_inline int4 make_int4(int x, int y, int z, int w)
{
# ifdef __KERNEL_SSE__
return int4(_mm_set_epi32(w, z, y, x));
# else
return {x, y, z, w};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline int4 make_int4(int i);
ccl_device_inline int4 make_int4(const float3 f);
ccl_device_inline int4 make_int4(const float4 f);
ccl_device_inline void print_int4(ccl_private const char *label, const int4 a);
ccl_device_inline int4 make_int4(int i)
{
#ifdef __KERNEL_SSE__
return int4(_mm_set1_epi32(i));
#else
return make_int4(i, i, i, i);
#endif
}
ccl_device_inline void print_int4(ccl_private const char *label, const int4 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
#endif
}
CCL_NAMESPACE_END

View File

@@ -1,101 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline int4::int4() {}
__forceinline int4::int4(const int4 &a) : m128(a.m128) {}
__forceinline int4::int4(const __m128i &a) : m128(a) {}
__forceinline int4::operator const __m128i &() const
{
return m128;
}
__forceinline int4::operator __m128i &()
{
return m128;
}
__forceinline int4 &int4::operator=(const int4 &a)
{
m128 = a.m128;
return *this;
}
# endif /* __KERNEL_SSE__ */
# ifndef __KERNEL_GPU__
__forceinline int int4::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
__forceinline int &int4::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
# endif
ccl_device_inline int4 make_int4(int x, int y, int z, int w)
{
# ifdef __KERNEL_SSE__
return int4(_mm_set_epi32(w, z, y, x));
# else
return {x, y, z, w};
# endif
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
ccl_device_inline int4 make_int4(int i)
{
#ifdef __KERNEL_SSE__
return int4(_mm_set1_epi32(i));
#else
return make_int4(i, i, i, i);
#endif
}
ccl_device_inline int4 make_int4(const float3 f)
{
#if defined(__KERNEL_GPU__)
return make_int4((int)f.x, (int)f.y, (int)f.z, 0);
#elif defined(__KERNEL_SSE__)
return int4(_mm_cvtps_epi32(f.m128));
#else
return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
#endif
}
ccl_device_inline int4 make_int4(const float4 f)
{
#ifdef __KERNEL_SSE__
return int4(_mm_cvtps_epi32(f.m128));
#else
return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
#endif
}
ccl_device_inline void print_int4(ccl_private const char *label, const int4 a)
{
#ifdef __KERNEL_PRINTF__
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
#endif
}
CCL_NAMESPACE_END

View File

@@ -4,9 +4,8 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
#include "util/types_int4.h"
CCL_NAMESPACE_BEGIN
@@ -26,27 +25,69 @@ struct ccl_try_align(32) vint8
};
};
__forceinline vint8();
__forceinline vint8(const vint8 &a);
__forceinline explicit vint8(const __m256i &a);
__forceinline vint8() = default;
__forceinline vint8(const vint8 &a) = default;
__forceinline explicit vint8(const __m256i &a) : m256(a) {}
__forceinline operator const __m256i &() const;
__forceinline operator __m256i &();
__forceinline operator const __m256i &() const
{
return m256;
}
__forceinline operator __m256i &()
{
return m256;
}
__forceinline vint8 &operator=(const vint8 &a);
__forceinline vint8 &operator=(const vint8 &a)
{
m256 = a.m256;
return *this;
}
#else /* __KERNEL_AVX__ */
int a, b, c, d, e, f, g, h;
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
__forceinline int operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline int &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
#endif
};
ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h);
ccl_device_inline vint8 make_vint8(int i);
ccl_device_inline vint8 make_vint8(const vfloat8 f);
ccl_device_inline vint8 make_vint8(const int4 a, const int4 b);
ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
#else
return {a, b, c, d, e, f, g, h};
#endif
}
ccl_device_inline vint8 make_vint8(int i)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_set1_epi32(i));
#else
return make_vint8(i, i, i, i, i, i, i, i);
#endif
}
ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
#else
return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
#endif
}
CCL_NAMESPACE_END

View File

@@ -1,90 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_AVX__
__forceinline vint8::vint8() {}
__forceinline vint8::vint8(const vint8 &a) : m256(a.m256) {}
__forceinline vint8::vint8(const __m256i &a) : m256(a) {}
__forceinline vint8::operator const __m256i &() const
{
return m256;
}
__forceinline vint8::operator __m256i &()
{
return m256;
}
__forceinline vint8 &vint8::operator=(const vint8 &a)
{
m256 = a.m256;
return *this;
}
#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline int vint8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline int &vint8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
#endif
ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
#else
return {a, b, c, d, e, f, g, h};
#endif
}
ccl_device_inline vint8 make_vint8(int i)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_set1_epi32(i));
#else
return make_vint8(i, i, i, i, i, i, i, i);
#endif
}
ccl_device_inline vint8 make_vint8(const vfloat8 f)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_cvtps_epi32(f.m256));
#else
return make_vint8(
(int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
#endif
}
ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
#else
return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
#endif
}
CCL_NAMESPACE_END

View File

@@ -4,25 +4,21 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_float3.h"
CCL_NAMESPACE_BEGIN
#define SPECTRUM_CHANNELS 3
#define SPECTRUM_DATA_TYPE float3
#define PACKED_SPECTRUM_DATA_TYPE packed_float3
using Spectrum = SPECTRUM_DATA_TYPE;
using PackedSpectrum = PACKED_SPECTRUM_DATA_TYPE;
using Spectrum = float3;
using PackedSpectrum = packed_float3;
#define make_spectrum(f) CONCAT(make_, SPECTRUM_DATA_TYPE(f))
#define load_spectrum(f) CONCAT(load_, SPECTRUM_DATA_TYPE(f))
#define store_spectrum(s, f) CONCAT(store_, SPECTRUM_DATA_TYPE((s), (f)))
#define make_spectrum(f) make_float3(f)
#define load_spectrum(f) load_float3(f)
#define store_spectrum(s, f) store_float3(f)
#define zero_spectrum CONCAT(zero_, SPECTRUM_DATA_TYPE)
#define one_spectrum CONCAT(one_, SPECTRUM_DATA_TYPE)
#define zero_spectrum zero_float3
#define one_spectrum one_float3
#define FOREACH_SPECTRUM_CHANNEL(counter) \
for (int counter = 0; counter < SPECTRUM_CHANNELS; counter++)

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,27 @@ struct uchar2 {
uchar x, y;
# ifndef __KERNEL_GPU__
__forceinline uchar operator[](int i) const;
__forceinline uchar &operator[](int i);
__forceinline uchar operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
__forceinline uchar &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
# endif
};
ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);
ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
{
uchar2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -1,37 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
uchar uchar2::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
uchar &uchar2::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 2);
return *(&x + i);
}
# endif
ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
{
uchar2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,27 @@ struct uchar3 {
uchar x, y, z;
# ifndef __KERNEL_GPU__
__forceinline uchar operator[](int i) const;
__forceinline uchar &operator[](int i);
__forceinline uchar operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
__forceinline uchar &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
# endif
};
ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);
ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
{
uchar3 a = {x, y, z};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -1,37 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
uchar uchar3::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
uchar &uchar3::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 3);
return *(&x + i);
}
# endif
ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
{
uchar3 a = {x, y, z};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,27 @@ struct uchar4 {
uchar x, y, z, w;
# ifndef __KERNEL_GPU__
__forceinline uchar operator[](int i) const;
__forceinline uchar &operator[](int i);
__forceinline uchar operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
__forceinline uchar &operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
# endif
};
ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);
ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
{
uchar4 a = {x, y, z, w};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -1,37 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
uchar uchar4::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
uchar &uchar4::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 4);
return *(&x + i);
}
# endif
ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
{
uchar4 a = {x, y, z, w};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,25 @@ struct uint2 {
uint x, y;
# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
__forceinline uint operator[](uint i) const
{
util_assert(i < 2);
return *(&x + i);
}
__forceinline uint &operator[](uint i)
{
util_assert(i < 2);
return *(&x + i);
}
# endif
};
ccl_device_inline uint2 make_uint2(uint x, uint y);
ccl_device_inline uint2 make_uint2(uint x, uint y)
{
uint2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -1,35 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
__forceinline uint uint2::operator[](uint i) const
{
util_assert(i < 2);
return *(&x + i);
}
__forceinline uint &uint2::operator[](uint i)
{
util_assert(i < 2);
return *(&x + i);
}
# endif
ccl_device_inline uint2 make_uint2(uint x, uint y)
{
uint2 a = {x, y};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,25 @@ struct uint3 {
uint x, y, z;
# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
__forceinline uint operator[](uint i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint &operator[](uint i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
};
ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
{
uint3 a = {x, y, z};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
#if defined(__KERNEL_METAL__)
@@ -33,7 +44,7 @@ typedef uint3 packed_uint3;
struct packed_uint3 {
uint x, y, z;
ccl_device_inline_method packed_uint3(){};
ccl_device_inline_method packed_uint3() = default;
ccl_device_inline_method packed_uint3(const uint px, const uint py, const uint pz)
: x(px), y(py), z(pz){};
@@ -54,11 +65,27 @@ struct packed_uint3 {
}
# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
__forceinline uint operator[](uint i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint &operator[](uint i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
};
static_assert(sizeof(packed_uint3) == 12, "packed_uint3 expected to be exactly 12 bytes");
#endif
ccl_device_inline packed_uint3 make_packed_uint3(uint x, uint y, uint z)
{
packed_uint3 a = {x, y, z};
return a;
}
CCL_NAMESPACE_END

View File

@@ -1,53 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
__forceinline uint uint3::operator[](uint i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint &uint3::operator[](uint i)
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint packed_uint3::operator[](uint i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint &packed_uint3::operator[](uint i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
{
uint3 a = {x, y, z};
return a;
}
ccl_device_inline packed_uint3 make_packed_uint3(uint x, uint y, uint z)
{
packed_uint3 a = {x, y, z};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
@@ -15,12 +13,25 @@ struct uint4 {
uint x, y, z, w;
# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
__forceinline uint operator[](uint i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint &operator[](uint i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
};
ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);
ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
{
uint4 a = {x, y, z, w};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -1,35 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifndef __KERNEL_GPU__
__forceinline uint uint4::operator[](uint i) const
{
util_assert(i < 3);
return *(&x + i);
}
__forceinline uint &uint4::operator[](uint i)
{
util_assert(i < 3);
return *(&x + i);
}
# endif
ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
{
uint4 a = {x, y, z, w};
return a;
}
#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END

View File

@@ -4,9 +4,7 @@
#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
#include "util/defines.h"
CCL_NAMESPACE_BEGIN