Refactor: Cycles: Make math and type headers work by themselves

Remove separate impl.h headers, shuffle around some code and add more includes so that individual header files compile without errors. Pull Request: https://projects.blender.org/blender/blender/pulls/132361
2024-12-26 17:53:56 +01:00
parent f53e13411b
commit 7db0bc2e64
71 changed files with 1959 additions and 2416 deletions
--- a/intern/cycles/app/cycles_precompute.cpp
+++ b/intern/cycles/app/cycles_precompute.cpp
@@ -16,7 +16,7 @@
 #include "kernel/sample/lcg.h"
 #include "kernel/sample/mapping.h"

-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 #include "kernel/closure/bsdf_microfacet.h"

--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -352,7 +352,7 @@ set(SRC_KERNEL_SAMPLE_HEADERS
 )

 set(SRC_KERNEL_UTIL_HEADERS
-  util/color.h
+  util/colorspace.h
  util/differential.h
  util/ies.h
  util/lookup_table.h
@@ -390,6 +390,7 @@ set(SRC_UTIL_HEADERS
  ../util/half.h
  ../util/hash.h
  ../util/math.h
+  ../util/math_base.h
  ../util/math_fast.h
  ../util/math_intersect.h
  ../util/math_float2.h
@@ -409,35 +410,22 @@ set(SRC_UTIL_HEADERS
  ../util/transform_inverse.h
  ../util/texture.h
  ../util/types.h
+  ../util/types_base.h
  ../util/types_float2.h
-  ../util/types_float2_impl.h
  ../util/types_float3.h
-  ../util/types_float3_impl.h
  ../util/types_float4.h
-  ../util/types_float4_impl.h
  ../util/types_float8.h
-  ../util/types_float8_impl.h
  ../util/types_int2.h
-  ../util/types_int2_impl.h
  ../util/types_int3.h
-  ../util/types_int3_impl.h
  ../util/types_int4.h
-  ../util/types_int4_impl.h
  ../util/types_int8.h
-  ../util/types_int8_impl.h
  ../util/types_spectrum.h
  ../util/types_uchar2.h
-  ../util/types_uchar2_impl.h
  ../util/types_uchar3.h
-  ../util/types_uchar3_impl.h
  ../util/types_uchar4.h
-  ../util/types_uchar4_impl.h
  ../util/types_uint2.h
-  ../util/types_uint2_impl.h
  ../util/types_uint3.h
-  ../util/types_uint3_impl.h
  ../util/types_uint4.h
-  ../util/types_uint4_impl.h
  ../util/types_ushort4.h
 )

--- a/intern/cycles/kernel/bake/bake.h
+++ b/intern/cycles/kernel/bake/bake.h
@@ -10,7 +10,7 @@

 #include "kernel/geom/geom.h"

-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -8,7 +8,7 @@
 #pragma once

 #include "kernel/sample/mapping.h"
-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -7,7 +7,7 @@

 #pragma once

-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/closure/bsdf_principled_hair_chiang.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_hair_chiang.h
@@ -11,7 +11,7 @@
 #  include <fenv.h>
 #endif

-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/closure/bsdf_principled_hair_huang.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_hair_huang.h
@@ -10,7 +10,7 @@

 #include "kernel/closure/bsdf_util.h"
 #include "kernel/sample/lcg.h"
-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -7,6 +7,8 @@

 #pragma once

+#include "util/color.h"
+
 CCL_NAMESPACE_BEGIN

 /* Compute fresnel reflectance for perpendicular (aka S-) and parallel (aka P-) polarized light.
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -8,6 +8,8 @@

 #pragma once

+#include "util/color.h"
+
 CCL_NAMESPACE_BEGIN

 /* --------------------------------------------------------------------
--- a/intern/cycles/kernel/film/write.h
+++ b/intern/cycles/kernel/film/write.h
@@ -4,7 +4,7 @@

 #pragma once

-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 #ifdef __KERNEL_GPU__
 #  define __ATOMIC_PASS_WRITE__
--- a/intern/cycles/kernel/integrator/guiding.h
+++ b/intern/cycles/kernel/integrator/guiding.h
@@ -8,6 +8,8 @@
 #include "kernel/closure/bsdf.h"
 #include "kernel/film/write.h"

+#include "util/color.h"
+
 CCL_NAMESPACE_BEGIN

 /* Utilities. */
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -18,6 +18,8 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"

+#include "util/color.h"
+
 CCL_NAMESPACE_BEGIN

 #ifdef __VOLUME__
--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -8,6 +8,8 @@

 #include "kernel/integrator/guiding.h"

+#include "util/color.h"
+
 CCL_NAMESPACE_BEGIN

 #ifdef __SUBSURFACE__
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -45,8 +45,6 @@

 #include "kernel/svm/svm.h"

-#include "kernel/util/color.h"
-
 CCL_NAMESPACE_BEGIN

 /* RenderServices implementation */
--- a/intern/cycles/kernel/osl/services_gpu.h
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -255,7 +255,7 @@ ccl_device_extern ccl_private OSLClosure *osl_allocate_weighted_closure_componen
 /* Utilities */

 #include "kernel/svm/math_util.h"
-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 ccl_device_extern void osl_error(ccl_private ShaderGlobals *sg, const char *format, void *args) {}

--- a/intern/cycles/kernel/svm/closure.h
+++ b/intern/cycles/kernel/svm/closure.h
@@ -9,7 +9,7 @@
 #include "kernel/closure/bsdf_util.h"
 #include "kernel/closure/emissive.h"

-#include "kernel/util/color.h"
+#include "kernel/util/colorspace.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/util/colorspace.h
+++ b/intern/cycles/kernel/util/colorspace.h
@@ -4,8 +4,6 @@

 #pragma once

-#include "util/color.h"
-
 CCL_NAMESPACE_BEGIN

 ccl_device float3 xyz_to_rgb(KernelGlobals kg, float3 xyz)
--- a/intern/cycles/scene/shader_graph.h
+++ b/intern/cycles/scene/shader_graph.h
@@ -173,7 +173,7 @@ class ShaderNode : public Node {
  /* Simplify settings used by artists to the ones which are simpler to
   * evaluate in the kernel but keep the final result unchanged.
   */
-  virtual void simplify_settings(Scene * /*scene*/) {};
+  virtual void simplify_settings(Scene * /*scene*/){};

  virtual bool has_surface_emission()
  {
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -43,6 +43,7 @@ set(SRC_HEADERS
  array.h
  atomic.h
  boundbox.h
+  color.h
  debug.h
  defines.h
  deque.h
@@ -60,6 +61,7 @@ set(SRC_HEADERS
  log.h
  map.h
  math.h
+  math_base.h
  math_cdf.h
  math_fast.h
  math_intersect.h
@@ -101,35 +103,22 @@ set(SRC_HEADERS
  transform.h
  transform_inverse.h
  types.h
+  types_base.h
  types_float2.h
-  types_float2_impl.h
  types_float3.h
-  types_float3_impl.h
  types_float4.h
-  types_float4_impl.h
  types_float8.h
-  types_float8_impl.h
  types_int2.h
-  types_int2_impl.h
  types_int3.h
-  types_int3_impl.h
  types_int4.h
-  types_int4_impl.h
  types_int8.h
-  types_int8_impl.h
  types_spectrum.h
  types_uchar2.h
-  types_uchar2_impl.h
  types_uchar3.h
-  types_uchar3_impl.h
  types_uchar4.h
-  types_uchar4_impl.h
  types_uint2.h
-  types_uint2_impl.h
  types_uint3.h
-  types_uint3_impl.h
  types_uint4.h
-  types_uint4_impl.h
  types_ushort4.h
  unique_ptr.h
  vector.h
--- a/intern/cycles/util/algorithm.h
+++ b/intern/cycles/util/algorithm.h
@@ -12,5 +12,6 @@ using std::remove;
 using std::sort;
 using std::stable_sort;
 using std::swap;
+using std::upper_bound;

 CCL_NAMESPACE_END
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -2,8 +2,7 @@
 *
 * SPDX-License-Identifier: Apache-2.0 */

-#ifndef __UTIL_COLOR_H__
-#define __UTIL_COLOR_H__
+#pragma once

 #include "util/math.h"
 #include "util/types.h"
@@ -351,6 +350,67 @@ ccl_device float3 color_highlight_uncompress(float3 color)
  return exp(color) - one_float3();
 }

-CCL_NAMESPACE_END
+/* Color division */

-#endif /* __UTIL_COLOR_H__ */
+ccl_device_inline Spectrum safe_invert_color(Spectrum a)
+{
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ?
+                                     1.0f / GET_SPECTRUM_CHANNEL(a, i) :
+                                     0.0f;
+  }
+
+  return a;
+}
+
+/* Returns `a/b`, and replace the channel value with `fallback` if `b == 0`. */
+ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b, const float fallback = 0.0f)
+{
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ?
+                                     GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) :
+                                     fallback;
+  }
+
+  return a;
+}
+
+ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
+{
+  float x, y, z;
+
+  x = (b.x != 0.0f) ? a.x / b.x : 0.0f;
+  y = (b.y != 0.0f) ? a.y / b.y : 0.0f;
+  z = (b.z != 0.0f) ? a.z / b.z : 0.0f;
+
+  /* try to get gray even if b is zero */
+  if (b.x == 0.0f) {
+    if (b.y == 0.0f) {
+      x = z;
+      y = z;
+    }
+    else if (b.z == 0.0f) {
+      x = y;
+      z = y;
+    }
+    else {
+      x = 0.5f * (y + z);
+    }
+  }
+  else if (b.y == 0.0f) {
+    if (b.z == 0.0f) {
+      y = x;
+      z = x;
+    }
+    else {
+      y = 0.5f * (x + z);
+    }
+  }
+  else if (b.z == 0.0f) {
+    z = 0.5f * (x + y);
+  }
+
+  return make_float3(x, y, z);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -2,1089 +2,21 @@
 *
 * SPDX-License-Identifier: Apache-2.0 */

-#ifndef __UTIL_MATH_H__
-#define __UTIL_MATH_H__
+#pragma once

-/* Math
- *
- * Basic math functions on scalar and vector types. This header is used by
- * both the kernel code when compiled as C++, and other C++ non-kernel code. */
+#include "util/types.h"  // IWYU pragma: export

-#ifndef __KERNEL_GPU__
-#  include <cmath>
-#endif
+#include "util/math_base.h"  // IWYU pragma: export

-#ifdef __HIP__
-#  include <hip/hip_vector_types.h>
-#endif
+#include "util/math_int2.h"  // IWYU pragma: export
+#include "util/math_int3.h"  // IWYU pragma: export
+#include "util/math_int4.h"  // IWYU pragma: export
+#include "util/math_int8.h"  // IWYU pragma: export

-#if !defined(__KERNEL_METAL__)
-#  include <float.h>
-#  include <math.h>
-#  include <stdio.h>
-#endif /* !defined(__KERNEL_METAL__) */
+#include "util/math_float2.h"  // IWYU pragma: export
+#include "util/math_float4.h"  // IWYU pragma: export
+#include "util/math_float8.h"  // IWYU pragma: export

-#include "util/types.h"
+#include "util/math_float3.h"  // IWYU pragma: export

-CCL_NAMESPACE_BEGIN
-
-/* Float Pi variations */
-
-/* Division */
-#ifndef M_PI_F
-#  define M_PI_F (3.1415926535897932f) /* pi */
-#endif
-#ifndef M_PI_2_F
-#  define M_PI_2_F (1.5707963267948966f) /* pi/2 */
-#endif
-#ifndef M_PI_4_F
-#  define M_PI_4_F (0.7853981633974830f) /* pi/4 */
-#endif
-#ifndef M_1_PI_F
-#  define M_1_PI_F (0.3183098861837067f) /* 1/pi */
-#endif
-#ifndef M_2_PI_F
-#  define M_2_PI_F (0.6366197723675813f) /* 2/pi */
-#endif
-#ifndef M_1_2PI_F
-#  define M_1_2PI_F (0.1591549430918953f) /* 1/(2*pi) */
-#endif
-#ifndef M_1_4PI_F
-#  define M_1_4PI_F (0.0795774715459476f) /* 1/(4*pi) */
-#endif
-#ifndef M_SQRT_PI_8_F
-#  define M_SQRT_PI_8_F (0.6266570686577501f) /* sqrt(pi/8) */
-#endif
-#ifndef M_LN_2PI_F
-#  define M_LN_2PI_F (1.8378770664093454f) /* ln(2*pi) */
-#endif
-
-/* Multiplication */
-#ifndef M_2PI_F
-#  define M_2PI_F (6.2831853071795864f) /* 2*pi */
-#endif
-#ifndef M_4PI_F
-#  define M_4PI_F (12.566370614359172f) /* 4*pi */
-#endif
-#ifndef M_PI_4F
-#  define M_PI_4F 0.78539816339744830962f /* pi/4 */
-#endif
-
-/* Float sqrt variations */
-#ifndef M_SQRT2_F
-#  define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */
-#endif
-#ifndef M_CBRT2_F
-#  define M_CBRT2_F 1.2599210498948732f /* cbrt(2) */
-#endif
-#ifndef M_SQRT1_2F
-#  define M_SQRT1_2F 0.70710678118654752440f /* sqrt(1/2) */
-#endif
-#ifndef M_SQRT3_F
-#  define M_SQRT3_F (1.7320508075688772f) /* sqrt(3) */
-#endif
-#ifndef M_LN2_F
-#  define M_LN2_F (0.6931471805599453f) /* ln(2) */
-#endif
-#ifndef M_LN10_F
-#  define M_LN10_F (2.3025850929940457f) /* ln(10) */
-#endif
-
-/* Scalar */
-
-#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
-#  ifdef _WIN32
-ccl_device_inline float fmaxf(float a, float b)
-{
-  return (a > b) ? a : b;
-}
-
-ccl_device_inline float fminf(float a, float b)
-{
-  return (a < b) ? a : b;
-}
-
-#  endif /* _WIN32 */
-#endif   /* __HIP__, __KERNEL_ONEAPI__ */
-
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
-#  ifndef __KERNEL_ONEAPI__
-using std::isfinite;
-using std::isnan;
-using std::sqrt;
-#  else
-#    define isfinite(x) sycl::isfinite((x))
-#    define isnan(x) sycl::isnan((x))
-#  endif
-
-ccl_device_inline int abs(int x)
-{
-  return (x > 0) ? x : -x;
-}
-
-ccl_device_inline int max(int a, int b)
-{
-  return (a > b) ? a : b;
-}
-
-ccl_device_inline int min(int a, int b)
-{
-  return (a < b) ? a : b;
-}
-
-ccl_device_inline uint32_t max(uint32_t a, uint32_t b)
-{
-  return (a > b) ? a : b;
-}
-
-ccl_device_inline uint32_t min(uint32_t a, uint32_t b)
-{
-  return (a < b) ? a : b;
-}
-
-ccl_device_inline uint64_t max(uint64_t a, uint64_t b)
-{
-  return (a > b) ? a : b;
-}
-
-ccl_device_inline uint64_t min(uint64_t a, uint64_t b)
-{
-  return (a < b) ? a : b;
-}
-
-/* NOTE: On 64bit Darwin the `size_t` is defined as `unsigned long int` and `uint64_t` is defined
- * as `unsigned long long`. Both of the definitions are 64 bit unsigned integer, but the automatic
- * substitution does not allow to automatically pick function defined for `uint64_t` as it is not
- * exactly the same type definition.
- * Work this around by adding a templated function enabled for `size_t` type which will be used
- * when there is no explicit specialization of `min()`/`max()` above. */
-
-template<class T>
-ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> max(T a, T b)
-{
-  return (a > b) ? a : b;
-}
-
-template<class T>
-ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> min(T a, T b)
-{
-  return (a < b) ? a : b;
-}
-
-ccl_device_inline float max(float a, float b)
-{
-  return (a > b) ? a : b;
-}
-
-ccl_device_inline float min(float a, float b)
-{
-  return (a < b) ? a : b;
-}
-
-ccl_device_inline double max(double a, double b)
-{
-  return (a > b) ? a : b;
-}
-
-ccl_device_inline double min(double a, double b)
-{
-  return (a < b) ? a : b;
-}
-
-/* These 2 guys are templated for usage with registers data.
- *
- * NOTE: Since this is CPU-only functions it is ok to use references here.
- * But for other devices we'll need to be careful about this.
- */
-
-template<typename T> ccl_device_inline T min4(const T &a, const T &b, const T &c, const T &d)
-{
-  return min(min(a, b), min(c, d));
-}
-
-template<typename T> ccl_device_inline T max4(const T &a, const T &b, const T &c, const T &d)
-{
-  return max(max(a, b), max(c, d));
-}
-#endif /* __KERNEL_GPU__ */
-
-ccl_device_inline float min4(float a, float b, float c, float d)
-{
-  return min(min(a, b), min(c, d));
-}
-
-ccl_device_inline float max4(float a, float b, float c, float d)
-{
-  return max(max(a, b), max(c, d));
-}
-
-#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
-/* Int/Float conversion */
-
-ccl_device_inline int as_int(uint i)
-{
-  union {
-    uint ui;
-    int i;
-  } u;
-  u.ui = i;
-  return u.i;
-}
-
-ccl_device_inline uint as_uint(int i)
-{
-  union {
-    uint ui;
-    int i;
-  } u;
-  u.i = i;
-  return u.ui;
-}
-
-ccl_device_inline uint as_uint(float f)
-{
-  union {
-    uint i;
-    float f;
-  } u;
-  u.f = f;
-  return u.i;
-}
-
-#  ifndef __HIP__
-ccl_device_inline int __float_as_int(float f)
-{
-  union {
-    int i;
-    float f;
-  } u;
-  u.f = f;
-  return u.i;
-}
-
-ccl_device_inline float __int_as_float(int i)
-{
-  union {
-    int i;
-    float f;
-  } u;
-  u.i = i;
-  return u.f;
-}
-
-ccl_device_inline uint __float_as_uint(float f)
-{
-  union {
-    uint i;
-    float f;
-  } u;
-  u.f = f;
-  return u.i;
-}
-
-ccl_device_inline float __uint_as_float(uint i)
-{
-  union {
-    uint i;
-    float f;
-  } u;
-  u.i = i;
-  return u.f;
-}
-#  endif
-
-ccl_device_inline int4 __float4_as_int4(float4 f)
-{
-#  ifdef __KERNEL_SSE__
-  return int4(_mm_castps_si128(f.m128));
-#  else
-  return make_int4(
-      __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-#  endif
-}
-
-ccl_device_inline float4 __int4_as_float4(int4 i)
-{
-#  ifdef __KERNEL_SSE__
-  return float4(_mm_castsi128_ps(i.m128));
-#  else
-  return make_float4(
-      __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-#  endif
-}
-#endif /* !defined(__KERNEL_METAL__) */
-
-#if defined(__KERNEL_METAL__)
-ccl_device_forceinline bool isnan_safe(float f)
-{
-  return isnan(f);
-}
-
-ccl_device_forceinline bool isfinite_safe(float f)
-{
-  return isfinite(f);
-}
-#else
-template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
-{
-  return ((uint64_t)ptr) & 0xFFFFFFFF;
-}
-
-template<typename T> ccl_device_inline uint pointer_pack_to_uint_1(T *ptr)
-{
-  return (((uint64_t)ptr) >> 32) & 0xFFFFFFFF;
-}
-
-template<typename T> ccl_device_inline T *pointer_unpack_from_uint(const uint a, const uint b)
-{
-  return (T *)(((uint64_t)b << 32) | a);
-}
-
-ccl_device_inline uint uint16_pack_to_uint(const uint a, const uint b)
-{
-  return (a << 16) | b;
-}
-
-ccl_device_inline uint uint16_unpack_from_uint_0(const uint i)
-{
-  return i >> 16;
-}
-
-ccl_device_inline uint uint16_unpack_from_uint_1(const uint i)
-{
-  return i & 0xFFFF;
-}
-
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
-  unsigned int x = __float_as_uint(f);
-  return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
-  /* By IEEE 754 rule, 2*Inf equals Inf */
-  unsigned int x = __float_as_uint(f);
-  return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
-}
-#endif
-
-ccl_device_inline float ensure_finite(float v)
-{
-  return isfinite_safe(v) ? v : 0.0f;
-}
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline int clamp(int a, int mn, int mx)
-{
-  return min(max(a, mn), mx);
-}
-
-ccl_device_inline float clamp(float a, float mn, float mx)
-{
-  return min(max(a, mn), mx);
-}
-
-ccl_device_inline float mix(float a, float b, float t)
-{
-  return a + t * (b - a);
-}
-
-ccl_device_inline float smoothstep(float edge0, float edge1, float x)
-{
-  float result;
-  if (x < edge0) {
-    result = 0.0f;
-  }
-  else if (x >= edge1) {
-    result = 1.0f;
-  }
-  else {
-    float t = (x - edge0) / (edge1 - edge0);
-    result = (3.0f - 2.0f * t) * (t * t);
-  }
-  return result;
-}
-
-#endif /* !defined(__KERNEL_METAL__) */
-
-#if defined(__KERNEL_CUDA__)
-ccl_device_inline float saturatef(float a)
-{
-  return __saturatef(a);
-}
-#elif !defined(__KERNEL_METAL__)
-ccl_device_inline float saturatef(float a)
-{
-  return clamp(a, 0.0f, 1.0f);
-}
-#endif /* __KERNEL_CUDA__ */
-
-ccl_device_inline int float_to_int(float f)
-{
-  return (int)f;
-}
-
-ccl_device_inline int floor_to_int(float f)
-{
-  return float_to_int(floorf(f));
-}
-
-ccl_device_inline float floorfrac(float x, ccl_private int *i)
-{
-  float f = floorf(x);
-  *i = float_to_int(f);
-  return x - f;
-}
-
-ccl_device_inline int ceil_to_int(float f)
-{
-  return float_to_int(ceilf(f));
-}
-
-ccl_device_inline float fractf(float x)
-{
-  return x - floorf(x);
-}
-
-/* Adapted from `godot-engine` math_funcs.h. */
-ccl_device_inline float wrapf(float value, float max, float min)
-{
-  float range = max - min;
-  return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
-}
-
-ccl_device_inline float pingpongf(float a, float b)
-{
-  return (b != 0.0f) ? fabsf(fractf((a - b) / (b * 2.0f)) * b * 2.0f - b) : 0.0f;
-}
-
-ccl_device_inline float smoothminf(float a, float b, float k)
-{
-  if (k != 0.0f) {
-    float h = fmaxf(k - fabsf(a - b), 0.0f) / k;
-    return fminf(a, b) - h * h * h * k * (1.0f / 6.0f);
-  }
-  else {
-    return fminf(a, b);
-  }
-}
-
-ccl_device_inline float signf(float f)
-{
-  return (f < 0.0f) ? -1.0f : 1.0f;
-}
-
-ccl_device_inline float nonzerof(float f, float eps)
-{
-  if (fabsf(f) < eps) {
-    return signf(f) * eps;
-  }
-  else {
-    return f;
-  }
-}
-
-/* The behavior of `atan2(0, 0)` is undefined on many platforms, to ensure consistent behavior, we
- * return 0 in this case. See !126951.
- * Computes the angle between the positive x axis and the vector pointing from origin to (x, y). */
-ccl_device_inline float compatible_atan2(const float y, const float x)
-{
-  return (x == 0.0f && y == 0.0f) ? 0.0f : atan2f(y, x);
-}
-
-/* `signum` function testing for zero. Matches GLSL and OSL functions. */
-ccl_device_inline float compatible_signf(float f)
-{
-  if (f == 0.0f) {
-    return 0.0f;
-  }
-  else {
-    return signf(f);
-  }
-}
-
-ccl_device_inline float smoothstepf(float f)
-{
-  if (f <= 0.0f) {
-    return 0.0f;
-  }
-  if (f >= 1.0f) {
-    return 1.0f;
-  }
-  float ff = f * f;
-  return (3.0f * ff - 2.0f * ff * f);
-}
-
-ccl_device_inline int mod(int x, int m)
-{
-  return (x % m + m) % m;
-}
-
-ccl_device_inline float inverse_lerp(float a, float b, float x)
-{
-  return (x - a) / (b - a);
-}
-
-/* Cubic interpolation between b and c, a and d are the previous and next point. */
-ccl_device_inline float cubic_interp(float a, float b, float c, float d, float x)
-{
-  return 0.5f *
-             (((d + 3.0f * (b - c) - a) * x + (2.0f * a - 5.0f * b + 4.0f * c - d)) * x +
-              (c - a)) *
-             x +
-         b;
-}
-
-CCL_NAMESPACE_END
-
-#include "util/math_int2.h"
-#include "util/math_int3.h"
-#include "util/math_int4.h"
-#include "util/math_int8.h"
-
-#include "util/math_float2.h"
-#include "util/math_float4.h"
-#include "util/math_float8.h"
-
-#include "util/math_float3.h"
-
-#include "util/rect.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Triangle */
-
-ccl_device_inline float triangle_area(ccl_private const float3 &v1,
-                                      ccl_private const float3 &v2,
-                                      ccl_private const float3 &v3)
-{
-  return len(cross(v3 - v2, v1 - v2)) * 0.5f;
-}
-
-/* Orthonormal vectors */
-
-ccl_device_inline void make_orthonormals(const float3 N,
-                                         ccl_private float3 *a,
-                                         ccl_private float3 *b)
-{
-#if 0
-  if (fabsf(N.y) >= 0.999f) {
-    *a = make_float3(1, 0, 0);
-    *b = make_float3(0, 0, 1);
-    return;
-  }
-  if (fabsf(N.z) >= 0.999f) {
-    *a = make_float3(1, 0, 0);
-    *b = make_float3(0, 1, 0);
-    return;
-  }
-#endif
-
-  if (N.x != N.y || N.x != N.z)
-    *a = make_float3(N.z - N.y, N.x - N.z, N.y - N.x);  //(1,1,1)x N
-  else
-    *a = make_float3(N.z - N.y, N.x + N.z, -N.y - N.x);  //(-1,1,1)x N
-
-  *a = normalize(*a);
-  *b = cross(N, *a);
-}
-
-/* Color division */
-
-ccl_device_inline Spectrum safe_invert_color(Spectrum a)
-{
-  FOREACH_SPECTRUM_CHANNEL (i) {
-    GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ?
-                                     1.0f / GET_SPECTRUM_CHANNEL(a, i) :
-                                     0.0f;
-  }
-
-  return a;
-}
-
-/* Returns `a/b`, and replace the channel value with `fallback` if `b == 0`. */
-ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b, const float fallback = 0.0f)
-{
-  FOREACH_SPECTRUM_CHANNEL (i) {
-    GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ?
-                                     GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) :
-                                     fallback;
-  }
-
-  return a;
-}
-
-ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
-{
-  float x, y, z;
-
-  x = (b.x != 0.0f) ? a.x / b.x : 0.0f;
-  y = (b.y != 0.0f) ? a.y / b.y : 0.0f;
-  z = (b.z != 0.0f) ? a.z / b.z : 0.0f;
-
-  /* try to get gray even if b is zero */
-  if (b.x == 0.0f) {
-    if (b.y == 0.0f) {
-      x = z;
-      y = z;
-    }
-    else if (b.z == 0.0f) {
-      x = y;
-      z = y;
-    }
-    else {
-      x = 0.5f * (y + z);
-    }
-  }
-  else if (b.y == 0.0f) {
-    if (b.z == 0.0f) {
-      y = x;
-      z = x;
-    }
-    else {
-      y = 0.5f * (x + z);
-    }
-  }
-  else if (b.z == 0.0f) {
-    z = 0.5f * (x + y);
-  }
-
-  return make_float3(x, y, z);
-}
-
-/* Rotation of point around axis and angle */
-
-ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle)
-{
-  float costheta = cosf(angle);
-  float sintheta = sinf(angle);
-  float3 r;
-
-  r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) +
-        (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
-        (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
-
-  r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) +
-        ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
-        (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
-
-  r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) +
-        (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
-        ((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
-
-  return r;
-}
-
-/* NaN-safe math ops */
-
-ccl_device_inline float safe_sqrtf(float f)
-{
-  return sqrtf(max(f, 0.0f));
-}
-
-ccl_device_inline float inversesqrtf(float f)
-{
-#if defined(__KERNEL_METAL__)
-  return (f > 0.0f) ? rsqrt(f) : 0.0f;
-#else
-  return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
-#endif
-}
-
-ccl_device float safe_asinf(float a)
-{
-  return asinf(clamp(a, -1.0f, 1.0f));
-}
-
-ccl_device float safe_acosf(float a)
-{
-  return acosf(clamp(a, -1.0f, 1.0f));
-}
-
-ccl_device float compatible_powf(float x, float y)
-{
-#ifdef __KERNEL_GPU__
-  if (y == 0.0f) /* x^0 -> 1, including 0^0 */
-    return 1.0f;
-
-  /* GPU pow doesn't accept negative x, do manual checks here */
-  if (x < 0.0f) {
-    if (fmodf(-y, 2.0f) == 0.0f)
-      return powf(-x, y);
-    else
-      return -powf(-x, y);
-  }
-  else if (x == 0.0f)
-    return 0.0f;
-#endif
-  return powf(x, y);
-}
-
-ccl_device float safe_powf(float a, float b)
-{
-  if (UNLIKELY(a < 0.0f && b != float_to_int(b))) {
-    return 0.0f;
-  }
-
-  return compatible_powf(a, b);
-}
-
-ccl_device float safe_divide(float a, float b)
-{
-  return (b != 0.0f) ? a / b : 0.0f;
-}
-
-ccl_device float safe_logf(float a, float b)
-{
-  if (UNLIKELY(a <= 0.0f || b <= 0.0f)) {
-    return 0.0f;
-  }
-
-  return safe_divide(logf(a), logf(b));
-}
-
-ccl_device float safe_modulo(float a, float b)
-{
-  return (b != 0.0f) ? fmodf(a, b) : 0.0f;
-}
-
-ccl_device float safe_floored_modulo(float a, float b)
-{
-  return (b != 0.0f) ? a - floorf(a / b) * b : 0.0f;
-}
-
-ccl_device_inline float sqr(float a)
-{
-  return a * a;
-}
-
-ccl_device_inline float sin_from_cos(const float c)
-{
-  return safe_sqrtf(1.0f - sqr(c));
-}
-
-ccl_device_inline float cos_from_sin(const float s)
-{
-  return safe_sqrtf(1.0f - sqr(s));
-}
-
-ccl_device_inline float sin_sqr_to_one_minus_cos(const float s_sq)
-{
-  /* Using second-order Taylor expansion at small angles for better accuracy. */
-  return s_sq > 0.0004f ? 1.0f - safe_sqrtf(1.0f - s_sq) : 0.5f * s_sq;
-}
-
-ccl_device_inline float one_minus_cos(const float angle)
-{
-  /* Using second-order Taylor expansion at small angles for better accuracy. */
-  return angle > 0.02f ? 1.0f - cosf(angle) : 0.5f * sqr(angle);
-}
-
-ccl_device_inline float pow20(float a)
-{
-  return sqr(sqr(sqr(sqr(a)) * a));
-}
-
-ccl_device_inline float pow22(float a)
-{
-  return sqr(a * sqr(sqr(sqr(a)) * a));
-}
-
-#ifdef __KERNEL_METAL__
-ccl_device_inline float lgammaf(float x)
-{
-  /* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik
-   */
-  const float _1_180 = 1.0f / 180.0f;
-  const float log2pi = 1.83787706641f;
-  const float logx = log(x);
-  return (log2pi - logx +
-          x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_180 / pow(x, 6.0f))) - 2.0f)) *
-         0.5f;
-}
-#endif
-
-ccl_device_inline float beta(float x, float y)
-{
-  return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
-}
-
-ccl_device_inline float xor_signmask(float x, int y)
-{
-  return __int_as_float(__float_as_int(x) ^ y);
-}
-
-ccl_device float bits_to_01(uint bits)
-{
-  return bits * (1.0f / (float)0xFFFFFFFF);
-}
-
-#if !defined(__KERNEL_GPU__)
-#  if defined(__GNUC__)
-ccl_device_inline uint popcount(uint x)
-{
-  return __builtin_popcount(x);
-}
-#  else
-ccl_device_inline uint popcount(uint x)
-{
-  /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
-  uint i = x;
-  i = i - ((i >> 1) & 0x55555555);
-  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-  i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
-  return i;
-}
-#  endif
-#elif defined(__KERNEL_ONEAPI__)
-#  define popcount(x) sycl::popcount(x)
-#elif defined(__KERNEL_HIP__)
-/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
-#  define popcount(x) __popcll(x)
-#elif !defined(__KERNEL_METAL__)
-#  define popcount(x) __popc(x)
-#endif
-
-ccl_device_inline uint count_leading_zeros(uint x)
-{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
-  return __clz(x);
-#elif defined(__KERNEL_METAL__)
-  return clz(x);
-#elif defined(__KERNEL_ONEAPI__)
-  return sycl::clz(x);
-#else
-  assert(x != 0);
-#  ifdef _MSC_VER
-  unsigned long leading_zero = 0;
-  _BitScanReverse(&leading_zero, x);
-  return (31 - leading_zero);
-#  else
-  return __builtin_clz(x);
-#  endif
-#endif
-}
-
-ccl_device_inline uint count_trailing_zeros(uint x)
-{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
-  return (__ffs(x) - 1);
-#elif defined(__KERNEL_METAL__)
-  return ctz(x);
-#elif defined(__KERNEL_ONEAPI__)
-  return sycl::ctz(x);
-#else
-  assert(x != 0);
-#  ifdef _MSC_VER
-  unsigned long ctz = 0;
-  _BitScanForward(&ctz, x);
-  return ctz;
-#  else
-  return __builtin_ctz(x);
-#  endif
-#endif
-}
-
-ccl_device_inline uint find_first_set(uint x)
-{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
-  return __ffs(x);
-#elif defined(__KERNEL_METAL__)
-  return (x != 0) ? ctz(x) + 1 : 0;
-#else
-#  ifdef _MSC_VER
-  return (x != 0) ? (32 - count_leading_zeros(x & (~x + 1))) : 0;
-#  else
-  return __builtin_ffs(x);
-#  endif
-#endif
-}
-
-/* projections */
-ccl_device_inline float2 map_to_tube(const float3 co)
-{
-  float len, u, v;
-  len = sqrtf(co.x * co.x + co.y * co.y);
-  if (len > 0.0f) {
-    u = (1.0f - (atan2f(co.x / len, co.y / len) / M_PI_F)) * 0.5f;
-    v = (co.z + 1.0f) * 0.5f;
-  }
-  else {
-    u = v = 0.0f;
-  }
-  return make_float2(u, v);
-}
-
-ccl_device_inline float2 map_to_sphere(const float3 co)
-{
-  float l = dot(co, co);
-  float u, v;
-  if (l > 0.0f) {
-    if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
-      u = 0.0f; /* Otherwise domain error. */
-    }
-    else {
-      u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F);
-    }
-    v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F;
-  }
-  else {
-    u = v = 0.0f;
-  }
-  return make_float2(u, v);
-}
-
-/* Compares two floats.
- * Returns true if their absolute difference is smaller than abs_diff (for numbers near zero)
- * or their relative difference is less than ulp_diff ULPs.
- * Based on
- * https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
- */
-
-ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
-{
-  if (fabsf(a - b) < abs_diff) {
-    return true;
-  }
-
-  if ((a < 0.0f) != (b < 0.0f)) {
-    return false;
-  }
-
-  return (abs(__float_as_int(a) - __float_as_int(b)) < ulp_diff);
-}
-
-/* Calculate the angle between the two vectors a and b.
- * The usual approach `acos(dot(a, b))` has severe precision issues for small angles,
- * which are avoided by this method.
- * Based on "Mangled Angles" from https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf
- */
-ccl_device_inline float precise_angle(float3 a, float3 b)
-{
-  return 2.0f * atan2f(len(a - b), len(a + b));
-}
-
-/* Tangent of the angle between vectors a and b. */
-ccl_device_inline float tan_angle(float3 a, float3 b)
-{
-  return len(cross(a, b)) / dot(a, b);
-}
-
-/* Return value which is greater than the given one and is a power of two. */
-ccl_device_inline uint next_power_of_two(uint x)
-{
-  return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
-}
-
-/* Return value which is lower than the given one and is a power of two. */
-ccl_device_inline uint prev_power_of_two(uint x)
-{
-  return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
-}
-
-#ifndef __has_builtin
-#  define __has_builtin(v) 0
-#endif
-
-/* Reverses the bits of a 32 bit integer. */
-ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
-{
-  /* Use a native instruction if it exists. */
-#if defined(__KERNEL_CUDA__)
-  return __brev(x);
-#elif defined(__KERNEL_METAL__)
-  return reverse_bits(x);
-#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
-  /* Assume the rbit is always available on 64bit ARM architecture. */
-  __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
-  return x;
-#elif defined(__arm__) && ((__ARM_ARCH > 7) || __ARM_ARCH == 6 && __ARM_ARCH_ISA_THUMB >= 2)
-  /* This ARM instruction is available in ARMv6T2 and above.
-   * This 32-bit Thumb instruction is available in ARMv6T2 and above. */
-  __asm__("rbit %0, %1" : "=r"(x) : "r"(x));
-  return x;
-#elif __has_builtin(__builtin_bitreverse32)
-  return __builtin_bitreverse32(x);
-#else
-  /* Flip pairwise. */
-  x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
-  /* Flip pairs. */
-  x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
-  /* Flip nibbles. */
-  x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
-  /* Flip bytes. CPUs have an instruction for that, pretty fast one. */
-#  ifdef _MSC_VER
-  return _byteswap_ulong(x);
-#  elif defined(__INTEL_COMPILER)
-  return (uint32_t)_bswap((int)x);
-#  else
-  /* Assuming gcc or clang. */
-  return __builtin_bswap32(x);
-#  endif
-#endif
-}
-
-/* Solve quadratic equation a*x^2 + b*x + c = 0, adapted from Mitsuba 3
- * The solution is ordered so that x1 <= x2.
- * Returns true if at least one solution is found.  */
-ccl_device_inline bool solve_quadratic(
-    const float a, const float b, const float c, ccl_private float &x1, ccl_private float &x2)
-{
-  /* If the equation is linear, the solution is -c/b, but b has to be non-zero. */
-  const bool valid_linear = (a == 0.0f) && (b != 0.0f);
-  x1 = x2 = -c / b;
-
-  const float discriminant = sqr(b) - 4.0f * a * c;
-  /* Allow slightly negative discriminant in case of numerical precision issues. */
-  const bool valid_quadratic = (a != 0.0f) && (discriminant > -1e-5f);
-
-  if (valid_quadratic) {
-    /* Numerically stable version of (-b ± sqrt(discriminant)) / (2 * a), avoiding catastrophic
-     * cancellation when `b` is very close to `sqrt(discriminant)`, by finding the solution of
-     * greater magnitude which does not suffer from loss of precision, then using the identity
-     * x1 * x2 = c / a. */
-    const float temp = -0.5f * (b + copysignf(safe_sqrtf(discriminant), b));
-    const float r1 = temp / a;
-    const float r2 = c / temp;
-
-    x1 = fminf(r1, r2);
-    x2 = fmaxf(r1, r2);
-  }
-
-  return (valid_linear || valid_quadratic);
-}
-
-/* Defines a closed interval [min, max]. */
-template<typename T> struct Interval {
-  T min;
-  T max;
-
-  ccl_device_inline_method bool is_empty() const
-  {
-    return min >= max;
-  }
-
-  ccl_device_inline_method bool contains(T value) const
-  {
-    return value >= min && value <= max;
-  }
-
-  ccl_device_inline_method T length() const
-  {
-    return max - min;
-  }
-};
-
-/* Computes the intersection of two intervals. */
-template<typename T>
-ccl_device_inline Interval<T> intervals_intersection(ccl_private const Interval<T> &first,
-                                                     ccl_private const Interval<T> &second)
-{
-  return {max(first.min, second.min), min(first.max, second.max)};
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_MATH_H__ */
+#include "util/rect.h"  // IWYU pragma: export
--- a/intern/cycles/util/math_base.h
+++ b/intern/cycles/util/math_base.h
@@ -0,0 +1,868 @@
+/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+/* Math
+ *
+ * Basic math functions on scalar and vector types. This header is used by
+ * both the kernel code when compiled as C++, and other C++ non-kernel code. */
+
+#include "util/defines.h"
+#include "util/types_base.h"
+
+#ifdef __HIP__
+#  include <hip/hip_vector_types.h>
+#endif
+
+#if !defined(__KERNEL_METAL__)
+#  include <cfloat>  // IWYU pragma: export
+#  include <cmath>   // IWYU pragma: export
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Float Pi variations */
+
+/* Division */
+#ifndef M_PI_F
+#  define M_PI_F (3.1415926535897932f) /* pi */
+#endif
+#ifndef M_PI_2_F
+#  define M_PI_2_F (1.5707963267948966f) /* pi/2 */
+#endif
+#ifndef M_PI_4_F
+#  define M_PI_4_F (0.7853981633974830f) /* pi/4 */
+#endif
+#ifndef M_1_PI_F
+#  define M_1_PI_F (0.3183098861837067f) /* 1/pi */
+#endif
+#ifndef M_2_PI_F
+#  define M_2_PI_F (0.6366197723675813f) /* 2/pi */
+#endif
+#ifndef M_1_2PI_F
+#  define M_1_2PI_F (0.1591549430918953f) /* 1/(2*pi) */
+#endif
+#ifndef M_1_4PI_F
+#  define M_1_4PI_F (0.0795774715459476f) /* 1/(4*pi) */
+#endif
+#ifndef M_SQRT_PI_8_F
+#  define M_SQRT_PI_8_F (0.6266570686577501f) /* sqrt(pi/8) */
+#endif
+#ifndef M_LN_2PI_F
+#  define M_LN_2PI_F (1.8378770664093454f) /* ln(2*pi) */
+#endif
+
+/* Multiplication */
+#ifndef M_2PI_F
+#  define M_2PI_F (6.2831853071795864f) /* 2*pi */
+#endif
+#ifndef M_4PI_F
+#  define M_4PI_F (12.566370614359172f) /* 4*pi */
+#endif
+#ifndef M_PI_4F
+#  define M_PI_4F 0.78539816339744830962f /* pi/4 */
+#endif
+
+/* Float sqrt variations */
+#ifndef M_SQRT2_F
+#  define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */
+#endif
+#ifndef M_CBRT2_F
+#  define M_CBRT2_F 1.2599210498948732f /* cbrt(2) */
+#endif
+#ifndef M_SQRT1_2F
+#  define M_SQRT1_2F 0.70710678118654752440f /* sqrt(1/2) */
+#endif
+#ifndef M_SQRT3_F
+#  define M_SQRT3_F (1.7320508075688772f) /* sqrt(3) */
+#endif
+#ifndef M_LN2_F
+#  define M_LN2_F (0.6931471805599453f) /* ln(2) */
+#endif
+#ifndef M_LN10_F
+#  define M_LN10_F (2.3025850929940457f) /* ln(10) */
+#endif
+
+/* Scalar */
+
+#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
+#  ifdef _WIN32
+ccl_device_inline float fmaxf(float a, float b)
+{
+  return (a > b) ? a : b;
+}
+
+ccl_device_inline float fminf(float a, float b)
+{
+  return (a < b) ? a : b;
+}
+
+#  endif /* _WIN32 */
+#endif   /* __HIP__, __KERNEL_ONEAPI__ */
+
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#  ifndef __KERNEL_ONEAPI__
+using std::isfinite;
+using std::isnan;
+using std::sqrt;
+#  else
+#    define isfinite(x) sycl::isfinite((x))
+#    define isnan(x) sycl::isnan((x))
+#  endif
+
+ccl_device_inline int abs(int x)
+{
+  return (x > 0) ? x : -x;
+}
+
+ccl_device_inline int max(int a, int b)
+{
+  return (a > b) ? a : b;
+}
+
+ccl_device_inline int min(int a, int b)
+{
+  return (a < b) ? a : b;
+}
+
+ccl_device_inline uint32_t max(uint32_t a, uint32_t b)
+{
+  return (a > b) ? a : b;
+}
+
+ccl_device_inline uint32_t min(uint32_t a, uint32_t b)
+{
+  return (a < b) ? a : b;
+}
+
+ccl_device_inline uint64_t max(uint64_t a, uint64_t b)
+{
+  return (a > b) ? a : b;
+}
+
+ccl_device_inline uint64_t min(uint64_t a, uint64_t b)
+{
+  return (a < b) ? a : b;
+}
+
+/* NOTE: On 64bit Darwin the `size_t` is defined as `unsigned long int` and `uint64_t` is defined
+ * as `unsigned long long`. Both of the definitions are 64 bit unsigned integer, but the automatic
+ * substitution does not allow to automatically pick function defined for `uint64_t` as it is not
+ * exactly the same type definition.
+ * Work this around by adding a templated function enabled for `size_t` type which will be used
+ * when there is no explicit specialization of `min()`/`max()` above. */
+
+template<class T>
+ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> max(T a, T b)
+{
+  return (a > b) ? a : b;
+}
+
+template<class T>
+ccl_device_inline typename std::enable_if_t<std::is_same_v<T, size_t>, T> min(T a, T b)
+{
+  return (a < b) ? a : b;
+}
+
+ccl_device_inline float max(float a, float b)
+{
+  return (a > b) ? a : b;
+}
+
+ccl_device_inline float min(float a, float b)
+{
+  return (a < b) ? a : b;
+}
+
+ccl_device_inline double max(double a, double b)
+{
+  return (a > b) ? a : b;
+}
+
+ccl_device_inline double min(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+/* These 2 guys are templated for usage with registers data.
+ *
+ * NOTE: Since this is CPU-only functions it is ok to use references here.
+ * But for other devices we'll need to be careful about this.
+ */
+
+template<typename T> ccl_device_inline T min4(const T &a, const T &b, const T &c, const T &d)
+{
+  return min(min(a, b), min(c, d));
+}
+
+template<typename T> ccl_device_inline T max4(const T &a, const T &b, const T &c, const T &d)
+{
+  return max(max(a, b), max(c, d));
+}
+#endif /* __KERNEL_GPU__ */
+
+ccl_device_inline float min4(float a, float b, float c, float d)
+{
+  return min(min(a, b), min(c, d));
+}
+
+ccl_device_inline float max4(float a, float b, float c, float d)
+{
+  return max(max(a, b), max(c, d));
+}
+
+#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
+/* Int/Float conversion */
+
+ccl_device_inline int as_int(uint i)
+{
+  union {
+    uint ui;
+    int i;
+  } u;
+  u.ui = i;
+  return u.i;
+}
+
+ccl_device_inline uint as_uint(int i)
+{
+  union {
+    uint ui;
+    int i;
+  } u;
+  u.i = i;
+  return u.ui;
+}
+
+ccl_device_inline uint as_uint(float f)
+{
+  union {
+    uint i;
+    float f;
+  } u;
+  u.f = f;
+  return u.i;
+}
+
+#  ifndef __HIP__
+ccl_device_inline int __float_as_int(float f)
+{
+  union {
+    int i;
+    float f;
+  } u;
+  u.f = f;
+  return u.i;
+}
+
+ccl_device_inline float __int_as_float(int i)
+{
+  union {
+    int i;
+    float f;
+  } u;
+  u.i = i;
+  return u.f;
+}
+
+ccl_device_inline uint __float_as_uint(float f)
+{
+  union {
+    uint i;
+    float f;
+  } u;
+  u.f = f;
+  return u.i;
+}
+
+ccl_device_inline float __uint_as_float(uint i)
+{
+  union {
+    uint i;
+    float f;
+  } u;
+  u.i = i;
+  return u.f;
+}
+#  endif
+
+#endif /* !defined(__KERNEL_METAL__) */
+
+#if defined(__KERNEL_METAL__)
+ccl_device_forceinline bool isnan_safe(float f)
+{
+  return isnan(f);
+}
+
+ccl_device_forceinline bool isfinite_safe(float f)
+{
+  return isfinite(f);
+}
+#else
+template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
+{
+  return ((uint64_t)ptr) & 0xFFFFFFFF;
+}
+
+template<typename T> ccl_device_inline uint pointer_pack_to_uint_1(T *ptr)
+{
+  return (((uint64_t)ptr) >> 32) & 0xFFFFFFFF;
+}
+
+template<typename T> ccl_device_inline T *pointer_unpack_from_uint(const uint a, const uint b)
+{
+  return (T *)(((uint64_t)b << 32) | a);
+}
+
+ccl_device_inline uint uint16_pack_to_uint(const uint a, const uint b)
+{
+  return (a << 16) | b;
+}
+
+ccl_device_inline uint uint16_unpack_from_uint_0(const uint i)
+{
+  return i >> 16;
+}
+
+ccl_device_inline uint uint16_unpack_from_uint_1(const uint i)
+{
+  return i & 0xFFFF;
+}
+
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
+{
+  unsigned int x = __float_as_uint(f);
+  return (x << 1) > 0xff000000u;
+}
+
+ccl_device_inline bool isfinite_safe(float f)
+{
+  /* By IEEE 754 rule, 2*Inf equals Inf */
+  unsigned int x = __float_as_uint(f);
+  return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
+}
+#endif
+
+ccl_device_inline float ensure_finite(float v)
+{
+  return isfinite_safe(v) ? v : 0.0f;
+}
+
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline int clamp(int a, int mn, int mx)
+{
+  return min(max(a, mn), mx);
+}
+
+ccl_device_inline float clamp(float a, float mn, float mx)
+{
+  return min(max(a, mn), mx);
+}
+
+ccl_device_inline float mix(float a, float b, float t)
+{
+  return a + t * (b - a);
+}
+
+ccl_device_inline float smoothstep(float edge0, float edge1, float x)
+{
+  float result;
+  if (x < edge0) {
+    result = 0.0f;
+  }
+  else if (x >= edge1) {
+    result = 1.0f;
+  }
+  else {
+    float t = (x - edge0) / (edge1 - edge0);
+    result = (3.0f - 2.0f * t) * (t * t);
+  }
+  return result;
+}
+
+#endif /* !defined(__KERNEL_METAL__) */
+
+#if defined(__KERNEL_CUDA__)
+ccl_device_inline float saturatef(float a)
+{
+  return __saturatef(a);
+}
+#elif !defined(__KERNEL_METAL__)
+ccl_device_inline float saturatef(float a)
+{
+  return clamp(a, 0.0f, 1.0f);
+}
+#endif /* __KERNEL_CUDA__ */
+
+ccl_device_inline int float_to_int(float f)
+{
+  return (int)f;
+}
+
+ccl_device_inline int floor_to_int(float f)
+{
+  return float_to_int(floorf(f));
+}
+
+ccl_device_inline float floorfrac(float x, ccl_private int *i)
+{
+  float f = floorf(x);
+  *i = float_to_int(f);
+  return x - f;
+}
+
+ccl_device_inline int ceil_to_int(float f)
+{
+  return float_to_int(ceilf(f));
+}
+
+ccl_device_inline float fractf(float x)
+{
+  return x - floorf(x);
+}
+
+/* Adapted from `godot-engine` math_funcs.h. */
+ccl_device_inline float wrapf(float value, float max, float min)
+{
+  float range = max - min;
+  return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
+}
+
+ccl_device_inline float pingpongf(float a, float b)
+{
+  return (b != 0.0f) ? fabsf(fractf((a - b) / (b * 2.0f)) * b * 2.0f - b) : 0.0f;
+}
+
+ccl_device_inline float smoothminf(float a, float b, float k)
+{
+  if (k != 0.0f) {
+    float h = fmaxf(k - fabsf(a - b), 0.0f) / k;
+    return fminf(a, b) - h * h * h * k * (1.0f / 6.0f);
+  }
+  return fminf(a, b);
+}
+
+ccl_device_inline float signf(float f)
+{
+  return (f < 0.0f) ? -1.0f : 1.0f;
+}
+
+ccl_device_inline float nonzerof(float f, float eps)
+{
+  if (fabsf(f) < eps) {
+    return signf(f) * eps;
+  }
+  return f;
+}
+
+/* The behavior of `atan2(0, 0)` is undefined on many platforms, to ensure consistent behavior, we
+ * return 0 in this case. See !126951.
+ * Computes the angle between the positive x axis and the vector pointing from origin to (x, y). */
+ccl_device_inline float compatible_atan2(const float y, const float x)
+{
+  return (x == 0.0f && y == 0.0f) ? 0.0f : atan2f(y, x);
+}
+
+/* `signum` function testing for zero. Matches GLSL and OSL functions. */
+ccl_device_inline float compatible_signf(float f)
+{
+  if (f == 0.0f) {
+    return 0.0f;
+  }
+  return signf(f);
+}
+
+ccl_device_inline float smoothstepf(float f)
+{
+  if (f <= 0.0f) {
+    return 0.0f;
+  }
+  if (f >= 1.0f) {
+    return 1.0f;
+  }
+  float ff = f * f;
+  return (3.0f * ff - 2.0f * ff * f);
+}
+
+ccl_device_inline int mod(int x, int m)
+{
+  return (x % m + m) % m;
+}
+
+ccl_device_inline float inverse_lerp(float a, float b, float x)
+{
+  return (x - a) / (b - a);
+}
+
+/* Cubic interpolation between b and c, a and d are the previous and next point. */
+ccl_device_inline float cubic_interp(float a, float b, float c, float d, float x)
+{
+  return 0.5f *
+             (((d + 3.0f * (b - c) - a) * x + (2.0f * a - 5.0f * b + 4.0f * c - d)) * x +
+              (c - a)) *
+             x +
+         b;
+}
+
+/* NaN-safe math ops */
+
+ccl_device_inline float safe_sqrtf(float f)
+{
+  return sqrtf(max(f, 0.0f));
+}
+
+ccl_device_inline float inversesqrtf(float f)
+{
+#if defined(__KERNEL_METAL__)
+  return (f > 0.0f) ? rsqrt(f) : 0.0f;
+#else
+  return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
+#endif
+}
+
+ccl_device float safe_asinf(float a)
+{
+  return asinf(clamp(a, -1.0f, 1.0f));
+}
+
+ccl_device float safe_acosf(float a)
+{
+  return acosf(clamp(a, -1.0f, 1.0f));
+}
+
+ccl_device float compatible_powf(float x, float y)
+{
+#ifdef __KERNEL_GPU__
+  if (y == 0.0f) /* x^0 -> 1, including 0^0 */
+    return 1.0f;
+
+  /* GPU pow doesn't accept negative x, do manual checks here */
+  if (x < 0.0f) {
+    if (fmodf(-y, 2.0f) == 0.0f)
+      return powf(-x, y);
+    else
+      return -powf(-x, y);
+  }
+  else if (x == 0.0f)
+    return 0.0f;
+#endif
+  return powf(x, y);
+}
+
+ccl_device float safe_powf(float a, float b)
+{
+  if (UNLIKELY(a < 0.0f && b != float_to_int(b))) {
+    return 0.0f;
+  }
+
+  return compatible_powf(a, b);
+}
+
+ccl_device float safe_divide(float a, float b)
+{
+  return (b != 0.0f) ? a / b : 0.0f;
+}
+
+ccl_device float safe_logf(float a, float b)
+{
+  if (UNLIKELY(a <= 0.0f || b <= 0.0f)) {
+    return 0.0f;
+  }
+
+  return safe_divide(logf(a), logf(b));
+}
+
+ccl_device float safe_modulo(float a, float b)
+{
+  return (b != 0.0f) ? fmodf(a, b) : 0.0f;
+}
+
+ccl_device float safe_floored_modulo(float a, float b)
+{
+  return (b != 0.0f) ? a - floorf(a / b) * b : 0.0f;
+}
+
+ccl_device_inline float sqr(float a)
+{
+  return a * a;
+}
+
+ccl_device_inline float sin_from_cos(const float c)
+{
+  return safe_sqrtf(1.0f - sqr(c));
+}
+
+ccl_device_inline float cos_from_sin(const float s)
+{
+  return safe_sqrtf(1.0f - sqr(s));
+}
+
+ccl_device_inline float sin_sqr_to_one_minus_cos(const float s_sq)
+{
+  /* Using second-order Taylor expansion at small angles for better accuracy. */
+  return s_sq > 0.0004f ? 1.0f - safe_sqrtf(1.0f - s_sq) : 0.5f * s_sq;
+}
+
+ccl_device_inline float one_minus_cos(const float angle)
+{
+  /* Using second-order Taylor expansion at small angles for better accuracy. */
+  return angle > 0.02f ? 1.0f - cosf(angle) : 0.5f * sqr(angle);
+}
+
+ccl_device_inline float pow20(float a)
+{
+  return sqr(sqr(sqr(sqr(a)) * a));
+}
+
+ccl_device_inline float pow22(float a)
+{
+  return sqr(a * sqr(sqr(sqr(a)) * a));
+}
+
+#ifdef __KERNEL_METAL__
+ccl_device_inline float lgammaf(float x)
+{
+  /* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik
+   */
+  const float _1_180 = 1.0f / 180.0f;
+  const float log2pi = 1.83787706641f;
+  const float logx = log(x);
+  return (log2pi - logx +
+          x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_180 / pow(x, 6.0f))) - 2.0f)) *
+         0.5f;
+}
+#endif
+
+ccl_device_inline float beta(float x, float y)
+{
+  return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
+}
+
+ccl_device_inline float xor_signmask(float x, int y)
+{
+  return __int_as_float(__float_as_int(x) ^ y);
+}
+
+ccl_device float bits_to_01(uint bits)
+{
+  return bits * (1.0f / (float)0xFFFFFFFF);
+}
+
+#if !defined(__KERNEL_GPU__)
+#  if defined(__GNUC__)
+ccl_device_inline uint popcount(uint x)
+{
+  return __builtin_popcount(x);
+}
+#  else
+ccl_device_inline uint popcount(uint x)
+{
+  /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
+  uint i = x;
+  i = i - ((i >> 1) & 0x55555555);
+  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+  i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+  return i;
+}
+#  endif
+#elif defined(__KERNEL_ONEAPI__)
+#  define popcount(x) sycl::popcount(x)
+#elif defined(__KERNEL_HIP__)
+/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
+#  define popcount(x) __popcll(x)
+#elif !defined(__KERNEL_METAL__)
+#  define popcount(x) __popc(x)
+#endif
+
+ccl_device_inline uint count_leading_zeros(uint x)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
+  return __clz(x);
+#elif defined(__KERNEL_METAL__)
+  return clz(x);
+#elif defined(__KERNEL_ONEAPI__)
+  return sycl::clz(x);
+#else
+  assert(x != 0);
+#  ifdef _MSC_VER
+  unsigned long leading_zero = 0;
+  _BitScanReverse(&leading_zero, x);
+  return (31 - leading_zero);
+#  else
+  return __builtin_clz(x);
+#  endif
+#endif
+}
+
+ccl_device_inline uint count_trailing_zeros(uint x)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
+  return (__ffs(x) - 1);
+#elif defined(__KERNEL_METAL__)
+  return ctz(x);
+#elif defined(__KERNEL_ONEAPI__)
+  return sycl::ctz(x);
+#else
+  assert(x != 0);
+#  ifdef _MSC_VER
+  unsigned long ctz = 0;
+  _BitScanForward(&ctz, x);
+  return ctz;
+#  else
+  return __builtin_ctz(x);
+#  endif
+#endif
+}
+
+ccl_device_inline uint find_first_set(uint x)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
+  return __ffs(x);
+#elif defined(__KERNEL_METAL__)
+  return (x != 0) ? ctz(x) + 1 : 0;
+#else
+#  ifdef _MSC_VER
+  return (x != 0) ? (32 - count_leading_zeros(x & (~x + 1))) : 0;
+#  else
+  return __builtin_ffs(x);
+#  endif
+#endif
+}
+
+/* Compares two floats.
+ * Returns true if their absolute difference is smaller than abs_diff (for numbers near zero)
+ * or their relative difference is less than ulp_diff ULPs.
+ * Based on
+ * https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+ */
+
+ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
+{
+  if (fabsf(a - b) < abs_diff) {
+    return true;
+  }
+
+  if ((a < 0.0f) != (b < 0.0f)) {
+    return false;
+  }
+
+  return (abs(__float_as_int(a) - __float_as_int(b)) < ulp_diff);
+}
+
+/* Return value which is greater than the given one and is a power of two. */
+ccl_device_inline uint next_power_of_two(uint x)
+{
+  return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
+}
+
+/* Return value which is lower than the given one and is a power of two. */
+ccl_device_inline uint prev_power_of_two(uint x)
+{
+  return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
+}
+
+#ifndef __has_builtin
+#  define __has_builtin(v) 0
+#endif
+
+/* Reverses the bits of a 32 bit integer. */
+ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
+{
+  /* Use a native instruction if it exists. */
+#if defined(__KERNEL_CUDA__)
+  return __brev(x);
+#elif defined(__KERNEL_METAL__)
+  return reverse_bits(x);
+#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
+  /* Assume the rbit is always available on 64bit ARM architecture. */
+  __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
+  return x;
+#elif defined(__arm__) && ((__ARM_ARCH > 7) || __ARM_ARCH == 6 && __ARM_ARCH_ISA_THUMB >= 2)
+  /* This ARM instruction is available in ARMv6T2 and above.
+   * This 32-bit Thumb instruction is available in ARMv6T2 and above. */
+  __asm__("rbit %0, %1" : "=r"(x) : "r"(x));
+  return x;
+#elif __has_builtin(__builtin_bitreverse32)
+  return __builtin_bitreverse32(x);
+#else
+  /* Flip pairwise. */
+  x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
+  /* Flip pairs. */
+  x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
+  /* Flip nibbles. */
+  x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
+  /* Flip bytes. CPUs have an instruction for that, pretty fast one. */
+#  ifdef _MSC_VER
+  return _byteswap_ulong(x);
+#  elif defined(__INTEL_COMPILER)
+  return (uint32_t)_bswap((int)x);
+#  else
+  /* Assuming gcc or clang. */
+  return __builtin_bswap32(x);
+#  endif
+#endif
+}
+
+/* Solve quadratic equation a*x^2 + b*x + c = 0, adapted from Mitsuba 3
+ * The solution is ordered so that x1 <= x2.
+ * Returns true if at least one solution is found.  */
+ccl_device_inline bool solve_quadratic(
+    const float a, const float b, const float c, ccl_private float &x1, ccl_private float &x2)
+{
+  /* If the equation is linear, the solution is -c/b, but b has to be non-zero. */
+  const bool valid_linear = (a == 0.0f) && (b != 0.0f);
+  x1 = x2 = -c / b;
+
+  const float discriminant = sqr(b) - 4.0f * a * c;
+  /* Allow slightly negative discriminant in case of numerical precision issues. */
+  const bool valid_quadratic = (a != 0.0f) && (discriminant > -1e-5f);
+
+  if (valid_quadratic) {
+    /* Numerically stable version of (-b ± sqrt(discriminant)) / (2 * a), avoiding catastrophic
+     * cancellation when `b` is very close to `sqrt(discriminant)`, by finding the solution of
+     * greater magnitude which does not suffer from loss of precision, then using the identity
+     * x1 * x2 = c / a. */
+    const float temp = -0.5f * (b + copysignf(safe_sqrtf(discriminant), b));
+    const float r1 = temp / a;
+    const float r2 = c / temp;
+
+    x1 = fminf(r1, r2);
+    x2 = fmaxf(r1, r2);
+  }
+
+  return (valid_linear || valid_quadratic);
+}
+
+/* Defines a closed interval [min, max]. */
+template<typename T> struct Interval {
+  T min;
+  T max;
+
+  ccl_device_inline_method bool is_empty() const
+  {
+    return min >= max;
+  }
+
+  ccl_device_inline_method bool contains(T value) const
+  {
+    return value >= min && value <= max;
+  }
+
+  ccl_device_inline_method T length() const
+  {
+    return max - min;
+  }
+};
+
+/* Computes the intersection of two intervals. */
+template<typename T>
+ccl_device_inline Interval<T> intervals_intersection(ccl_private const Interval<T> &first,
+                                                     ccl_private const Interval<T> &second)
+{
+  return {max(first.min, second.min), min(first.max, second.max)};
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/math_cdf.cpp
+++ b/intern/cycles/util/math_cdf.cpp
@@ -4,8 +4,9 @@

 #include "util/math_cdf.h"

+#include <cassert>
+
 #include "util/algorithm.h"
-#include "util/math.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_cdf.h
+++ b/intern/cycles/util/math_cdf.h
@@ -4,8 +4,7 @@

 #pragma once

-#include "util/algorithm.h"
-#include "util/math.h"
+#include "util/math_base.h"
 #include "util/vector.h"

 CCL_NAMESPACE_BEGIN
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -23,6 +23,13 @@

 #pragma once

+#include "util/math_base.h"
+#include "util/math_float3.h"
+#include "util/math_float4.h"
+#include "util/math_int4.h"
+#include "util/types_float3.h"
+#include "util/types_float4.h"
+
 CCL_NAMESPACE_BEGIN

 ccl_device_inline float madd(const float a, const float b, const float c)
@@ -91,8 +98,9 @@ ccl_device float fast_sinf(float x)
  x = madd(qf, -1.2816720341285448015e-12f * 4, x);
  x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
  float s = x * x;
-  if ((q & 1) != 0)
+  if ((q & 1) != 0) {
    x = -x;
+  }
  /* This polynomial approximation has very low error on [-pi/2,+pi/2]
   * 1.19209e-07 max error in total over [-2pi,+2pi]. */
  float u = 2.6083159809786593541503e-06f;
@@ -468,9 +476,7 @@ ccl_device_inline float fast_expm1f(float x)
    x = 1.0f - (1.0f - x); /* Crush denormals. */
    return madd(0.5f, x * x, x);
  }
-  else {
-    return fast_expf(x) - 1.0f;
-  }
+  return fast_expf(x) - 1.0f;
 }

 ccl_device float fast_sinhf(float x)
@@ -482,17 +488,15 @@ ccl_device float fast_sinhf(float x)
    float e = fast_expf(a);
    return copysignf(0.5f * e - 0.5f / e, x);
  }
-  else {
-    a = 1.0f - (1.0f - a); /* Crush denorms. */
-    float a2 = a * a;
-    /* Degree 7 polynomial generated with sollya. */
-    /* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
-    float r = 2.03945513931e-4f;
-    r = madd(r, a2, 8.32990277558e-3f);
-    r = madd(r, a2, 0.1666673421859f);
-    r = madd(r * a, a2, a);
-    return copysignf(r, x);
-  }
+  a = 1.0f - (1.0f - a); /* Crush denorms. */
+  float a2 = a * a;
+  /* Degree 7 polynomial generated with sollya. */
+  /* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
+  float r = 2.03945513931e-4f;
+  r = madd(r, a2, 8.32990277558e-3f);
+  r = madd(r, a2, 0.1666673421859f);
+  r = madd(r * a, a2, a);
+  return copysignf(r, x);
 }

 ccl_device_inline float fast_coshf(float x)
@@ -516,10 +520,12 @@ ccl_device_inline float fast_tanhf(float x)

 ccl_device float fast_safe_powf(float x, float y)
 {
-  if (y == 0)
+  if (y == 0) {
    return 1.0f; /* x^1=1 */
-  if (x == 0)
+  }
+  if (x == 0) {
    return 0.0f; /* 0^y=0 */
+  }
  float sign = 1.0f;
  if (x < 0.0f) {
    /* if x is negative, only deal with integer powers
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -4,9 +4,9 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/math_base.h"
+#include "util/types_float2.h"
+#include "util/types_float4.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -5,9 +5,10 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/math_base.h"
+#include "util/math_float4.h"
+#include "util/types_float3.h"
+#include "util/types_float4.h"

 CCL_NAMESPACE_BEGIN

@@ -384,10 +385,10 @@ ccl_device_inline float3 reflect(const float3 incident, const float3 unit_normal
 ccl_device_inline float3 refract(const float3 incident, const float3 normal, const float eta)
 {
  float k = 1.0f - eta * eta * (1.0f - dot(normal, incident) * dot(normal, incident));
-  if (k < 0.0f)
+  if (k < 0.0f) {
    return zero_float3();
-  else
-    return eta * incident - (eta * dot(normal, incident) + sqrt(k)) * normal;
+  }
+  return eta * incident - (eta * dot(normal, incident) + sqrt(k)) * normal;
 }

 ccl_device_inline float3 faceforward(const float3 vector,
@@ -498,13 +499,128 @@ ccl_device_inline bool isfinite_safe(float3 v)

 ccl_device_inline float3 ensure_finite(float3 v)
 {
-  if (!isfinite_safe(v.x))
+  if (!isfinite_safe(v.x)) {
    v.x = 0.0f;
-  if (!isfinite_safe(v.y))
+  }
+  if (!isfinite_safe(v.y)) {
    v.y = 0.0f;
-  if (!isfinite_safe(v.z))
+  }
+  if (!isfinite_safe(v.z)) {
    v.z = 0.0f;
+  }
  return v;
 }

+/* Triangle */
+
+ccl_device_inline float triangle_area(ccl_private const float3 &v1,
+                                      ccl_private const float3 &v2,
+                                      ccl_private const float3 &v3)
+{
+  return len(cross(v3 - v2, v1 - v2)) * 0.5f;
+}
+
+/* Orthonormal vectors */
+
+ccl_device_inline void make_orthonormals(const float3 N,
+                                         ccl_private float3 *a,
+                                         ccl_private float3 *b)
+{
+#if 0
+  if (fabsf(N.y) >= 0.999f) {
+    *a = make_float3(1, 0, 0);
+    *b = make_float3(0, 0, 1);
+    return;
+  }
+  if (fabsf(N.z) >= 0.999f) {
+    *a = make_float3(1, 0, 0);
+    *b = make_float3(0, 1, 0);
+    return;
+  }
+#endif
+
+  if (N.x != N.y || N.x != N.z) {
+    *a = make_float3(N.z - N.y, N.x - N.z, N.y - N.x);  //(1,1,1)x N
+  }
+  else {
+    *a = make_float3(N.z - N.y, N.x + N.z, -N.y - N.x);  //(-1,1,1)x N
+  }
+
+  *a = normalize(*a);
+  *b = cross(N, *a);
+}
+
+/* Rotation of point around axis and angle */
+
+ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle)
+{
+  float costheta = cosf(angle);
+  float sintheta = sinf(angle);
+  float3 r;
+
+  r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) +
+        (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
+        (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
+
+  r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) +
+        ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
+        (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
+
+  r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) +
+        (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
+        ((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
+
+  return r;
+}
+
+/* Calculate the angle between the two vectors a and b.
+ * The usual approach `acos(dot(a, b))` has severe precision issues for small angles,
+ * which are avoided by this method.
+ * Based on "Mangled Angles" from https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf
+ */
+ccl_device_inline float precise_angle(float3 a, float3 b)
+{
+  return 2.0f * atan2f(len(a - b), len(a + b));
+}
+
+/* Tangent of the angle between vectors a and b. */
+ccl_device_inline float tan_angle(float3 a, float3 b)
+{
+  return len(cross(a, b)) / dot(a, b);
+}
+
+/* projections */
+ccl_device_inline float2 map_to_tube(const float3 co)
+{
+  float len, u, v;
+  len = sqrtf(co.x * co.x + co.y * co.y);
+  if (len > 0.0f) {
+    u = (1.0f - (atan2f(co.x / len, co.y / len) / M_PI_F)) * 0.5f;
+    v = (co.z + 1.0f) * 0.5f;
+  }
+  else {
+    u = v = 0.0f;
+  }
+  return make_float2(u, v);
+}
+
+ccl_device_inline float2 map_to_sphere(const float3 co)
+{
+  float l = dot(co, co);
+  float u, v;
+  if (l > 0.0f) {
+    if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
+      u = 0.0f; /* Otherwise domain error. */
+    }
+    else {
+      u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F);
+    }
+    v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F;
+  }
+  else {
+    u = v = 0.0f;
+  }
+  return make_float2(u, v);
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -5,9 +5,8 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/math_base.h"
+#include "util/types_float4.h"

 CCL_NAMESPACE_BEGIN

@@ -244,41 +243,40 @@ ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
 }

 #ifdef __KERNEL_SSE__
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const float4 shuffle(const float4 b)
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline float4 shuffle(const float4 a)
 {
 #  ifdef __KERNEL_NEON__
-  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
+  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a.m128));
 #  else
  return float4(
-      _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+      _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0))));
 #  endif
 }

-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
+template<> __forceinline float4 shuffle<0, 1, 0, 1>(const float4 a)
 {
  return float4(_mm_movelh_ps(a, a));
 }

-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
+template<> __forceinline float4 shuffle<2, 3, 2, 3>(const float4 a)
 {
  return float4(_mm_movehl_ps(a, a));
 }

 #  ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
+template<> __forceinline float4 shuffle<0, 0, 2, 2>(const float4 a)
 {
-  return float4(_mm_moveldup_ps(b));
+  return float4(_mm_moveldup_ps(a));
 }

-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
+template<> __forceinline float4 shuffle<1, 1, 3, 3>(const float4 a)
 {
-  return float4(_mm_movehdup_ps(b));
+  return float4(_mm_movehdup_ps(a));
 }
 #  endif /* __KERNEL_SSE3__ */

 template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const float4 shuffle(const float4 a, const float4 b)
+__forceinline float4 shuffle(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_NEON__
  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
@@ -287,11 +285,11 @@ __forceinline const float4 shuffle(const float4 a, const float4 b)
 #  endif
 }

-template<size_t i0> __forceinline const float4 shuffle(const float4 b)
+template<size_t i0> __forceinline float4 shuffle(const float4 a)
 {
-  return shuffle<i0, i0, i0, i0>(b);
+  return shuffle<i0, i0, i0, i0>(a);
 }
-template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
+template<size_t i0> __forceinline float4 shuffle(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_NEON__
  return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
@@ -300,12 +298,12 @@ template<size_t i0> __forceinline const float4 shuffle(const float4 a, const flo
 #  endif
 }

-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
+template<> __forceinline float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
 {
  return float4(_mm_movelh_ps(a, b));
 }

-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
+template<> __forceinline float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
 {
  return float4(_mm_movehl_ps(b, a));
 }
@@ -592,14 +590,18 @@ ccl_device_inline bool isfinite_safe(float4 v)

 ccl_device_inline float4 ensure_finite(float4 v)
 {
-  if (!isfinite_safe(v.x))
+  if (!isfinite_safe(v.x)) {
    v.x = 0.0f;
-  if (!isfinite_safe(v.y))
+  }
+  if (!isfinite_safe(v.y)) {
    v.y = 0.0f;
-  if (!isfinite_safe(v.z))
+  }
+  if (!isfinite_safe(v.z)) {
    v.z = 0.0f;
-  if (!isfinite_safe(v.w))
+  }
+  if (!isfinite_safe(v.w)) {
    v.w = 0.0f;
+  }
  return v;
 }

@@ -609,4 +611,27 @@ ccl_device_inline float4 power(float4 v, float e)
  return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.w, e));
 }

+#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
+/* Int/Float conversion */
+ccl_device_inline int4 __float4_as_int4(float4 f)
+{
+#  ifdef __KERNEL_SSE__
+  return int4(_mm_castps_si128(f.m128));
+#  else
+  return make_int4(
+      __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
+#  endif
+}
+
+ccl_device_inline float4 __int4_as_float4(int4 i)
+{
+#  ifdef __KERNEL_SSE__
+  return float4(_mm_castsi128_ps(i.m128));
+#  else
+  return make_float4(
+      __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
+#  endif
+}
+#endif /* !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__) */
+
 CCL_NAMESPACE_END
--- a/intern/cycles/util/math_float8.h
+++ b/intern/cycles/util/math_float8.h
@@ -5,9 +5,9 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/math_base.h"
+#include "util/types_float8.h"
+#include "util/types_int8.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -4,9 +4,8 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/defines.h"
+#include "util/types_int2.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_int3.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_int4.h
+++ b/intern/cycles/util/math_int4.h
@@ -5,9 +5,8 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_float4.h"
+#include "util/types_int4.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_int8.h
+++ b/intern/cycles/util/math_int8.h
@@ -5,9 +5,9 @@

 #pragma once

-#ifndef __UTIL_MATH_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/math_base.h"
+#include "util/types_float8.h"
+#include "util/types_int8.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -4,6 +4,10 @@

 #pragma once

+#include "util/math_float2.h"
+#include "util/math_float3.h"
+#include "util/math_float4.h"
+
 CCL_NAMESPACE_BEGIN

 /* Ray Intersection */
@@ -287,16 +291,20 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
  }
  /* Store the result. */
  /* TODO(sergey): Check whether we can avoid some checks here. */
-  if (isect_P != nullptr)
+  if (isect_P != nullptr) {
    *isect_P = hit;
-  if (isect_t != nullptr)
+  }
+  if (isect_t != nullptr) {
    *isect_t = t;
+  }

  /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
-  if (isect_u != nullptr)
+  if (isect_u != nullptr) {
    *isect_u = v + 0.5f;
-  if (isect_v != nullptr)
+  }
+  if (isect_v != nullptr) {
    *isect_v = -u - v;
+  }

  return true;
 }
--- a/intern/cycles/util/math_matrix.h
+++ b/intern/cycles/util/math_matrix.h
@@ -4,6 +4,11 @@

 #pragma once

+#include "util/atomic.h"
+#include "util/math_base.h"
+#include "util/math_float3.h"
+#include "util/math_float4.h"
+
 CCL_NAMESPACE_BEGIN

 #define MAT(A, size, row, col) A[(row) * (size) + (col)]
@@ -92,7 +97,7 @@ ccl_device_inline void math_vec3_add(ccl_private float3 *v, int n, ccl_private f
 }

 ccl_device_inline void math_vec3_add_strided(
-    ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
+    ccl_global float3 *v, int n, ccl_private const float *x, float3 w, int stride)
 {
  for (int i = 0; i < n; i++) {
    ccl_global float *elem = (ccl_global float *)(v + i * stride);
@@ -215,16 +220,18 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A,
  /* Use forward substitution to solve L*b = y, replacing y by b. */
  for (int row = 0; row < n; row++) {
    float3 sum = VECS(y, row, stride);
-    for (int col = 0; col < row; col++)
+    for (int col = 0; col < row; col++) {
      sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+    }
    VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
  }

  /* Use backward substitution to solve Lt*S = b, replacing b by S. */
  for (int row = n - 1; row >= 0; row--) {
    float3 sum = VECS(y, row, stride);
-    for (int col = row + 1; col < n; col++)
+    for (int col = row + 1; col < n; col++) {
      sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+    }
    VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
  }
 }
@@ -430,7 +437,7 @@ ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restr
 {
  for (int row = 0; row < n; row++) {
    for (int col = 0; col <= row; col++) {
-      MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
+      MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col));
    }
  }
 }
--- a/intern/cycles/util/projection_inverse.h
+++ b/intern/cycles/util/projection_inverse.h
@@ -4,6 +4,8 @@

 #pragma once

+#include "util/defines.h"
+
 CCL_NAMESPACE_BEGIN

 ccl_device_forceinline bool projection_inverse_impl(ccl_private float R[4][4],
--- a/intern/cycles/util/rect.h
+++ b/intern/cycles/util/rect.h
@@ -4,7 +4,8 @@

 #pragma once

-#include "util/types.h"
+#include "util/math_base.h"
+#include "util/types_int4.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/util/stack_allocator.h
+++ b/intern/cycles/util/stack_allocator.h
@@ -10,7 +10,8 @@
 CCL_NAMESPACE_BEGIN

 /* Stack allocator for the use with STL. */
-template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {
+template<int SIZE, typename T> class ccl_try_align(16) StackAllocator
+{
 public:
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
@@ -58,7 +59,7 @@ template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {
    return mem;
  }

-  void deallocate(T *p, size_t n)
+  void deallocate(T * p, size_t n)
  {
    if (p == nullptr) {
      return;
@@ -77,7 +78,7 @@ template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {

  /* Address of an reference. */

-  T *address(T &x) const
+  T *address(T & x) const
  {
    return &x;
  }
@@ -89,14 +90,14 @@ template<int SIZE, typename T> class ccl_try_align(16) StackAllocator {

  /* Object construction/destruction. */

-  void construct(T *p, const T &val)
+  void construct(T * p, const T &val)
  {
    if (p != nullptr) {
      new ((T *)p) T(val);
    }
  }

-  void destroy(T *p)
+  void destroy(T * p)
  {
    p->~T();
  }
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -2,134 +2,28 @@
 *
 * SPDX-License-Identifier: Apache-2.0 */

-#ifndef __UTIL_TYPES_H__
-#define __UTIL_TYPES_H__
+#pragma once

-#if !defined(__KERNEL_METAL__)
-#  include <stdlib.h>
-#endif
+#include "util/types_base.h"  // IWYU pragma: export

-/* Standard Integer Types */
+#include "util/types_uchar2.h"  // IWYU pragma: export
+#include "util/types_uchar3.h"  // IWYU pragma: export
+#include "util/types_uchar4.h"  // IWYU pragma: export

-#if !defined(__KERNEL_GPU__)
-#  include <stdint.h>
-#  include <stdio.h>
-#endif
+#include "util/types_int2.h"  // IWYU pragma: export
+#include "util/types_int3.h"  // IWYU pragma: export
+#include "util/types_int4.h"  // IWYU pragma: export
+#include "util/types_int8.h"  // IWYU pragma: export

-#include "util/defines.h"
+#include "util/types_uint2.h"  // IWYU pragma: export
+#include "util/types_uint3.h"  // IWYU pragma: export
+#include "util/types_uint4.h"  // IWYU pragma: export

-#ifndef __KERNEL_GPU__
-#  include "util/optimization.h"
-#  include "util/simd.h"
-#endif
+#include "util/types_ushort4.h"  // IWYU pragma: export

-CCL_NAMESPACE_BEGIN
+#include "util/types_float2.h"  // IWYU pragma: export
+#include "util/types_float3.h"  // IWYU pragma: export
+#include "util/types_float4.h"  // IWYU pragma: export
+#include "util/types_float8.h"  // IWYU pragma: export

-/* Types
- *
- * Define simpler unsigned type names, and integer with defined number of bits.
- * Also vector types, named to be compatible with OpenCL builtin types, while
- * working for CUDA and C++ too. */
-
-/* Shorter Unsigned Names */
-
-typedef unsigned char uchar;
-typedef unsigned int uint;
-typedef unsigned short ushort;
-
-/* Fixed Bits Types */
-
-#ifndef __KERNEL_GPU__
-/* Generic Memory Pointer */
-
-typedef uint64_t device_ptr;
-#endif /* __KERNEL_GPU__ */
-
-ccl_device_inline size_t align_up(size_t offset, size_t alignment)
-{
-  return (offset + alignment - 1) & ~(alignment - 1);
-}
-
-ccl_device_inline size_t divide_up(size_t x, size_t y)
-{
-  return (x + y - 1) / y;
-}
-
-ccl_device_inline size_t round_up(size_t x, size_t multiple)
-{
-  return ((x + multiple - 1) / multiple) * multiple;
-}
-
-ccl_device_inline size_t round_down(size_t x, size_t multiple)
-{
-  return (x / multiple) * multiple;
-}
-
-ccl_device_inline bool is_power_of_two(size_t x)
-{
-  return (x & (x - 1)) == 0;
-}
-
-CCL_NAMESPACE_END
-
-/* Device side printf only tested on CUDA, may work on more GPU devices. */
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__)
-#  define __KERNEL_PRINTF__
-#endif
-
-ccl_device_inline void print_float(ccl_private const char *label, const float a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %.8f\n", label, (double)a);
-#endif
-}
-
-/* Most GPU APIs matching native vector types, so we only need to implement them for
- * CPU and oneAPI. */
-#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
-#  define __KERNEL_NATIVE_VECTOR_TYPES__
-#endif
-
-/* Vectorized types declaration. */
-#include "util/types_uchar2.h"
-#include "util/types_uchar3.h"
-#include "util/types_uchar4.h"
-
-#include "util/types_int2.h"
-#include "util/types_int3.h"
-#include "util/types_int4.h"
-#include "util/types_int8.h"
-
-#include "util/types_uint2.h"
-#include "util/types_uint3.h"
-#include "util/types_uint4.h"
-
-#include "util/types_ushort4.h"
-
-#include "util/types_float2.h"
-#include "util/types_float3.h"
-#include "util/types_float4.h"
-#include "util/types_float8.h"
-
-#include "util/types_spectrum.h"
-
-/* Vectorized types implementation. */
-#include "util/types_uchar2_impl.h"
-#include "util/types_uchar3_impl.h"
-#include "util/types_uchar4_impl.h"
-
-#include "util/types_int2_impl.h"
-#include "util/types_int3_impl.h"
-#include "util/types_int4_impl.h"
-#include "util/types_int8_impl.h"
-
-#include "util/types_uint2_impl.h"
-#include "util/types_uint3_impl.h"
-#include "util/types_uint4_impl.h"
-
-#include "util/types_float2_impl.h"
-#include "util/types_float3_impl.h"
-#include "util/types_float4_impl.h"
-#include "util/types_float8_impl.h"
-
-#endif /* __UTIL_TYPES_H__ */
+#include "util/types_spectrum.h"  // IWYU pragma: export
--- a/intern/cycles/util/types_base.h
+++ b/intern/cycles/util/types_base.h
@@ -0,0 +1,90 @@
+/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+#if !defined(__KERNEL_METAL__)
+#  include <cstdlib>
+#endif
+
+/* Standard Integer Types */
+
+#if !defined(__KERNEL_GPU__)
+#  include <cstdint>
+#  include <cstdio>
+#endif
+
+#include "util/defines.h"
+
+#ifndef __KERNEL_GPU__
+#  include "util/optimization.h"
+#  include "util/simd.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Types
+ *
+ * Define simpler unsigned type names, and integer with defined number of bits.
+ * Also vector types, named to be compatible with OpenCL builtin types, while
+ * working for CUDA and C++ too. */
+
+/* Shorter Unsigned Names */
+
+using uchar = unsigned char;
+using uint = unsigned int;
+using ushort = unsigned short;
+
+/* Fixed Bits Types */
+
+#ifndef __KERNEL_GPU__
+/* Generic Memory Pointer */
+
+using device_ptr = uint64_t;
+#endif /* __KERNEL_GPU__ */
+
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)
+{
+  return (offset + alignment - 1) & ~(alignment - 1);
+}
+
+ccl_device_inline size_t divide_up(size_t x, size_t y)
+{
+  return (x + y - 1) / y;
+}
+
+ccl_device_inline size_t round_up(size_t x, size_t multiple)
+{
+  return ((x + multiple - 1) / multiple) * multiple;
+}
+
+ccl_device_inline size_t round_down(size_t x, size_t multiple)
+{
+  return (x / multiple) * multiple;
+}
+
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+  return (x & (x - 1)) == 0;
+}
+
+CCL_NAMESPACE_END
+
+/* Device side printf only tested on CUDA, may work on more GPU devices. */
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__)
+#  define __KERNEL_PRINTF__
+#endif
+
+ccl_device_inline void print_float(ccl_private const char *label, const float a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %.8f\n", label, (double)a);
+#endif
+}
+
+/* Most GPU APIs matching native vector types, so we only need to implement them for
+ * CPU and oneAPI. */
+#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
+#  define __KERNEL_NATIVE_VECTOR_TYPES__
+#endif
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -20,10 +18,34 @@ struct float2 {
 #  endif
 };

-ccl_device_inline float2 make_float2(float x, float y);
+#  ifndef __KERNEL_GPU__
+__forceinline float float2::operator[](int i) const
+{
+  util_assert(i >= 0);
+  util_assert(i < 2);
+  return *(&x + i);
+}
+
+__forceinline float &float2::operator[](int i)
+{
+  util_assert(i >= 0);
+  util_assert(i < 2);
+  return *(&x + i);
+}
+#  endif
+
+ccl_device_inline float2 make_float2(float x, float y)
+{
+  float2 a = {x, y};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

-ccl_device_inline float2 make_float2(float3 a);
-ccl_device_inline void print_float2(ccl_private const char *label, const float2 a);
+ccl_device_inline void print_float2(ccl_private const char *label, const float2 a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
+#endif
+}

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_float2_impl.h
+++ b/intern/cycles/util/types_float2_impl.h
@@ -1,49 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-__forceinline float float2::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 2);
-  return *(&x + i);
-}
-
-__forceinline float &float2::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 2);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline float2 make_float2(float x, float y)
-{
-  float2 a = {x, y};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-ccl_device_inline float2 make_float2(float3 a)
-{
-  return make_float2(a.x, a.y);
-}
-
-ccl_device_inline void print_float2(ccl_private const char *label, const float2 a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -4,9 +4,9 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"
+#include "util/types_float2.h"
+#include "util/types_int4.h"

 CCL_NAMESPACE_BEGIN

@@ -37,33 +37,107 @@ struct ccl_try_align(16) float3

 #  ifdef __KERNEL_SSE__
  /* Convenient constructors and operators for SIMD, otherwise default is enough. */
-  __forceinline float3();
-  __forceinline float3(const float3 &a);
-  __forceinline explicit float3(const __m128 &a);
+  __forceinline float3() = default;
+  __forceinline float3(const float3 &a) = default;
+  __forceinline explicit float3(const __m128 &a) : m128(a) {}

-  __forceinline operator const __m128 &() const;
-  __forceinline operator __m128 &();
+  __forceinline operator const __m128 &() const
+  {
+    return m128;
+  }
+  __forceinline operator __m128 &()
+  {
+    return m128;
+  }

-  __forceinline float3 &operator=(const float3 &a);
+  __forceinline float3 &operator=(const float3 &a)
+  {
+    m128 = a.m128;
+    return *this;
+  }
 #  endif

 #  ifndef __KERNEL_GPU__
-  __forceinline float operator[](int i) const;
-  __forceinline float &operator[](int i);
+  __forceinline float operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+  __forceinline float &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline float3 make_float3(float x, float y, float z);
+ccl_device_inline float3 make_float3(float x, float y, float z)
+{
+#  if defined(__KERNEL_GPU__)
+  return {x, y, z};
+#  elif defined(__KERNEL_SSE__)
+  return float3(_mm_set_ps(0.0f, z, y, x));
+#  else
+  return {x, y, z, 0.0f};
+#  endif
+}
+
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

-ccl_device_inline float3 make_float3(float f);
-ccl_device_inline float3 make_float3(float4 a);
-ccl_device_inline float3 make_float3(float2 a);
-ccl_device_inline float3 make_float3(float2 a, float b);
-ccl_device_inline void print_float3(ccl_private const char *label, const float3 a);
+ccl_device_inline float3 make_float3(float f)
+{
+#if defined(__KERNEL_GPU__)
+  return make_float3(f, f, f);
+#elif defined(__KERNEL_SSE__)
+  return float3(_mm_set1_ps(f));
+#else
+  return {f, f, f, f};
+#endif
+}

-/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
+ccl_device_inline float3 make_float3(float2 a)
+{
+  return make_float3(a.x, a.y, 0.0f);
+}
+
+ccl_device_inline float3 make_float3(float2 a, float b)
+{
+  return make_float3(a.x, a.y, b);
+}
+
+ccl_device_inline void print_float3(ccl_private const char *label, const float3 a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
+#else
+  (void)label;
+  (void)a;
+#endif
+}
+
+ccl_device_inline float2 make_float2(float3 a)
+{
+  return make_float2(a.x, a.y);
+}
+
+ccl_device_inline int4 make_int4(const float3 f)
+{
+#if defined(__KERNEL_GPU__)
+  return make_int4((int)f.x, (int)f.y, (int)f.z, 0);
+#elif defined(__KERNEL_SSE__)
+  return int4(_mm_cvtps_epi32(f.m128));
+#else
+  return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
+#endif
+}
+
+/* Packed float3
+ *
+ * Smaller float3 for storage. For math operations this must be converted to float3, so that on the
 * CPU SIMD instructions can be used. */
+
 #if defined(__KERNEL_METAL__)
 /* Metal has native packed_float3. */
 #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) || defined(__KERNEL_ONEAPI__)
@@ -71,7 +145,7 @@ ccl_device_inline void print_float3(ccl_private const char *label, const float3
 typedef float3 packed_float3;
 #else
 struct packed_float3 {
-  ccl_device_inline_method packed_float3(){};
+  ccl_device_inline_method packed_float3() = default;

  ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z) {}

--- a/intern/cycles/util/types_float3_impl.h
+++ b/intern/cycles/util/types_float3_impl.h
@@ -1,100 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifdef __KERNEL_SSE__
-__forceinline float3::float3() {}
-
-__forceinline float3::float3(const float3 &a) : m128(a.m128) {}
-
-__forceinline float3::float3(const __m128 &a) : m128(a) {}
-
-__forceinline float3::operator const __m128 &() const
-{
-  return m128;
-}
-
-__forceinline float3::operator __m128 &()
-{
-  return m128;
-}
-
-__forceinline float3 &float3::operator=(const float3 &a)
-{
-  m128 = a.m128;
-  return *this;
-}
-#  endif /* __KERNEL_SSE__ */
-
-#  ifndef __KERNEL_GPU__
-__forceinline float float3::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline float &float3::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 3);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline float3 make_float3(float x, float y, float z)
-{
-#  if defined(__KERNEL_GPU__)
-  return {x, y, z};
-#  elif defined(__KERNEL_SSE__)
-  return float3(_mm_set_ps(0.0f, z, y, x));
-#  else
-  return {x, y, z, 0.0f};
-#  endif
-}
-
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-ccl_device_inline float3 make_float3(float f)
-{
-#if defined(__KERNEL_GPU__)
-  return make_float3(f, f, f);
-#elif defined(__KERNEL_SSE__)
-  return float3(_mm_set1_ps(f));
-#else
-  return {f, f, f, f};
-#endif
-}
-
-ccl_device_inline float3 make_float3(float4 a)
-{
-  return make_float3(a.x, a.y, a.z);
-}
-
-ccl_device_inline float3 make_float3(float2 a)
-{
-  return make_float3(a.x, a.y, 0.0f);
-}
-
-ccl_device_inline float3 make_float3(float2 a, float b)
-{
-  return make_float3(a.x, a.y, b);
-}
-
-ccl_device_inline void print_float3(ccl_private const char *label, const float3 a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_float4.h
+++ b/intern/cycles/util/types_float4.h
@@ -4,9 +4,9 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"
+#include "util/types_float3.h"
+#include "util/types_int4.h"

 CCL_NAMESPACE_BEGIN

@@ -23,30 +23,103 @@ struct ccl_try_align(16) float4
    };
  };

-  __forceinline float4();
-  __forceinline explicit float4(const __m128 &a);
+  __forceinline float4() = default;
+  __forceinline float4(const float4 &a) = default;
+  __forceinline explicit float4(const __m128 &a) : m128(a) {}

-  __forceinline operator const __m128 &() const;
-  __forceinline operator __m128 &();
+  __forceinline operator const __m128 &() const
+  {
+    return m128;
+  }
+  __forceinline operator __m128 &()
+  {
+    return m128;
+  }

-  __forceinline float4 &operator=(const float4 &a);
+  __forceinline float4 &operator=(const float4 &a)
+  {
+    m128 = a.m128;
+    return *this;
+  }

 #  else  /* __KERNEL_SSE__ */
  float x, y, z, w;
 #  endif /* __KERNEL_SSE__ */

 #  ifndef __KERNEL_GPU__
-  __forceinline float operator[](int i) const;
-  __forceinline float &operator[](int i);
+  __forceinline float operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 4);
+    return *(&x + i);
+  }
+  __forceinline float &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 4);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline float4 make_float4(float x, float y, float z, float w);
+ccl_device_inline float4 make_float4(float x, float y, float z, float w)
+{
+#  ifdef __KERNEL_SSE__
+  return float4(_mm_set_ps(w, z, y, x));
+#  else
+  return {x, y, z, w};
+#  endif
+}
+
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

-ccl_device_inline float4 make_float4(float f);
-ccl_device_inline float4 make_float4(float3 a, float b);
-ccl_device_inline float4 make_float4(const int4 i);
-ccl_device_inline void print_float4(ccl_private const char *label, const float4 a);
+ccl_device_inline float4 make_float4(float f)
+{
+#ifdef __KERNEL_SSE__
+  return float4(_mm_set1_ps(f));
+#else
+  return make_float4(f, f, f, f);
+#endif
+}
+
+ccl_device_inline float4 make_float4(float3 a, float b)
+{
+  return make_float4(a.x, a.y, a.z, b);
+}
+
+ccl_device_inline float4 make_float4(float3 a)
+{
+  return make_float4(a.x, a.y, a.z, 1.0f);
+}
+
+ccl_device_inline float4 make_float4(const int4 i)
+{
+#ifdef __KERNEL_SSE__
+  return float4(_mm_cvtepi32_ps(i.m128));
+#else
+  return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w);
+#endif
+}
+
+ccl_device_inline float3 make_float3(float4 a)
+{
+  return make_float3(a.x, a.y, a.z);
+}
+
+ccl_device_inline int4 make_int4(const float4 f)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_cvtps_epi32(f.m128));
+#else
+  return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
+#endif
+}
+
+ccl_device_inline void print_float4(ccl_private const char *label, const float4 a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
+#endif
+}

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_float4_impl.h
+++ b/intern/cycles/util/types_float4_impl.h
@@ -1,98 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifdef __KERNEL_SSE__
-__forceinline float4::float4() {}
-
-__forceinline float4::float4(const __m128 &a) : m128(a) {}
-
-__forceinline float4::operator const __m128 &() const
-{
-  return m128;
-}
-
-__forceinline float4::operator __m128 &()
-{
-  return m128;
-}
-
-__forceinline float4 &float4::operator=(const float4 &a)
-{
-  m128 = a.m128;
-  return *this;
-}
-#  endif /* __KERNEL_SSE__ */
-
-#  ifndef __KERNEL_GPU__
-__forceinline float float4::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 4);
-  return *(&x + i);
-}
-
-__forceinline float &float4::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 4);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline float4 make_float4(float x, float y, float z, float w)
-{
-#  ifdef __KERNEL_SSE__
-  return float4(_mm_set_ps(w, z, y, x));
-#  else
-  return {x, y, z, w};
-#  endif
-}
-
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-ccl_device_inline float4 make_float4(float f)
-{
-#ifdef __KERNEL_SSE__
-  return float4(_mm_set1_ps(f));
-#else
-  return make_float4(f, f, f, f);
-#endif
-}
-
-ccl_device_inline float4 make_float4(float3 a, float b)
-{
-  return make_float4(a.x, a.y, a.z, b);
-}
-
-ccl_device_inline float4 make_float4(float3 a)
-{
-  return make_float4(a.x, a.y, a.z, 1.0f);
-}
-
-ccl_device_inline float4 make_float4(const int4 i)
-{
-#ifdef __KERNEL_SSE__
-  return float4(_mm_cvtepi32_ps(i.m128));
-#else
-  return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w);
-#endif
-}
-
-ccl_device_inline void print_float4(ccl_private const char *label, const float4 a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -7,9 +7,9 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"
+#include "util/types_float4.h"
+#include "util/types_int8.h"

 CCL_NAMESPACE_BEGIN

@@ -30,30 +30,99 @@ struct ccl_try_align(32) vfloat8
    };
  };

-  __forceinline vfloat8();
-  __forceinline vfloat8(const vfloat8 &a);
-  __forceinline explicit vfloat8(const __m256 &a);
+  __forceinline vfloat8() = default;
+  __forceinline vfloat8(const vfloat8 &a) = default;
+  __forceinline explicit vfloat8(const __m256 &a) : m256(a) {}

-  __forceinline operator const __m256 &() const;
-  __forceinline operator __m256 &();
+  __forceinline operator const __m256 &() const
+  {
+    return m256;
+  }
+  __forceinline operator __m256 &()
+  {
+    return m256;
+  }

-  __forceinline vfloat8 &operator=(const vfloat8 &a);
+  __forceinline vfloat8 &operator=(const vfloat8 &a)
+  {
+    m256 = a.m256;
+    return *this;
+  }

 #else  /* __KERNEL_AVX__ */
  float a, b, c, d, e, f, g, h;
 #endif /* __KERNEL_AVX__ */

 #ifndef __KERNEL_GPU__
-  __forceinline float operator[](int i) const;
-  __forceinline float &operator[](int i);
+  __forceinline float operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 8);
+    return *(&a + i);
+  }
+  __forceinline float &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 8);
+    return *(&a + i);
+  }
 #endif
 };

-ccl_device_inline vfloat8 make_vfloat8(float f);
-ccl_device_inline vfloat8
-make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h);
-ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b);
+ccl_device_inline vfloat8 make_vfloat8(float f)
+{
+#ifdef __KERNEL_AVX__
+  vfloat8 r(_mm256_set1_ps(f));
+#else
+  vfloat8 r = {f, f, f, f, f, f, f, f};
+#endif
+  return r;
+}

-ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a);
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
+{
+#ifdef __KERNEL_AVX__
+  vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+  vfloat8 r = {a, b, c, d, e, f, g, h};
+#endif
+  return r;
+}
+
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_AVX__
+  return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
+#else
+  return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
+         label,
+         (double)a.a,
+         (double)a.b,
+         (double)a.c,
+         (double)a.d,
+         (double)a.e,
+         (double)a.f,
+         (double)a.g,
+         (double)a.h);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const vfloat8 f)
+{
+#ifdef __KERNEL_AVX__
+  return vint8(_mm256_cvtps_epi32(f.m256));
+#else
+  return make_vint8(
+      (int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
+#endif
+}

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -1,102 +0,0 @@
-/* SPDX-FileCopyrightText: 2017 Intel Corporation
- * SPDX-FileCopyrightText: 2018-2022 Blender Foundation
- *
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Originally by Intel Corporation, modified by the Blender Foundation. */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_AVX__
-__forceinline vfloat8::vfloat8() {}
-
-__forceinline vfloat8::vfloat8(const vfloat8 &f) : m256(f.m256) {}
-
-__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f) {}
-
-__forceinline vfloat8::operator const __m256 &() const
-{
-  return m256;
-}
-
-__forceinline vfloat8::operator __m256 &()
-{
-  return m256;
-}
-
-__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f)
-{
-  m256 = f.m256;
-  return *this;
-}
-#endif /* __KERNEL_AVX__ */
-
-#ifndef __KERNEL_GPU__
-__forceinline float vfloat8::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 8);
-  return *(&a + i);
-}
-
-__forceinline float &vfloat8::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 8);
-  return *(&a + i);
-}
-#endif
-
-ccl_device_inline vfloat8 make_vfloat8(float f)
-{
-#ifdef __KERNEL_AVX__
-  vfloat8 r(_mm256_set1_ps(f));
-#else
-  vfloat8 r = {f, f, f, f, f, f, f, f};
-#endif
-  return r;
-}
-
-ccl_device_inline vfloat8
-make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
-{
-#ifdef __KERNEL_AVX__
-  vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
-#else
-  vfloat8 r = {a, b, c, d, e, f, g, h};
-#endif
-  return r;
-}
-
-ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
-{
-#ifdef __KERNEL_AVX__
-  return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
-#else
-  return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
-#endif
-}
-
-ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
-         label,
-         (double)a.a,
-         (double)a.b,
-         (double)a.c,
-         (double)a.d,
-         (double)a.e,
-         (double)a.f,
-         (double)a.g,
-         (double)a.h);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int2.h
+++ b/intern/cycles/util/types_int2.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,27 @@ struct int2 {
  int x, y;

 #  ifndef __KERNEL_GPU__
-  __forceinline int operator[](int i) const;
-  __forceinline int &operator[](int i);
+  __forceinline int operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 2);
+    return *(&x + i);
+  }
+
+  __forceinline int &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 2);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline int2 make_int2(int x, int y);
+ccl_device_inline int2 make_int2(int x, int y)
+{
+  int2 a = {x, y};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int2_impl.h
+++ b/intern/cycles/util/types_int2_impl.h
@@ -1,37 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-int int2::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 2);
-  return *(&x + i);
-}
-
-int &int2::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 2);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline int2 make_int2(int x, int y)
-{
-  int2 a = {x, y};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int3.h
+++ b/intern/cycles/util/types_int3.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -26,30 +24,77 @@ struct ccl_try_align(16) int3
    };
  };

-  __forceinline int3();
-  __forceinline int3(const int3 &a);
-  __forceinline explicit int3(const __m128i &a);
+  __forceinline int3() = default;
+  __forceinline int3(const int3 &a) = default;
+  __forceinline explicit int3(const __m128i &a) : m128(a) {}

-  __forceinline operator const __m128i &() const;
-  __forceinline operator __m128i &();
+  __forceinline operator const __m128i &() const
+  {
+    return m128;
+  }

-  __forceinline int3 &operator=(const int3 &a);
+  __forceinline operator __m128i &()
+  {
+    return m128;
+  }
+
+  __forceinline int3 &operator=(const int3 &a)
+  {
+    m128 = a.m128;
+    return *this;
+  }
 #    else  /* __KERNEL_SSE__ */
  int x, y, z, w;
 #    endif /* __KERNEL_SSE__ */
 #  endif

 #  ifndef __KERNEL_GPU__
-  __forceinline int operator[](int i) const;
-  __forceinline int &operator[](int i);
+  __forceinline int operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+
+  __forceinline int &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline int3 make_int3(int x, int y, int z);
+ccl_device_inline int3 make_int3(int x, int y, int z)
+{
+#  if defined(__KERNEL_GPU__)
+  return {x, y, z};
+#  elif defined(__KERNEL_SSE__)
+  return int3(_mm_set_epi32(0, z, y, x));
+#  else
+  return {x, y, z, 0};
+#  endif
+}
+
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

-ccl_device_inline int3 make_int3(int i);
-ccl_device_inline void print_int3(ccl_private const char *label, const int3 a);
+ccl_device_inline int3 make_int3(int i)
+{
+#if defined(__KERNEL_GPU__)
+  return make_int3(i, i, i);
+#elif defined(__KERNEL_SSE__)
+  return int3(_mm_set1_epi32(i));
+#else
+  return {i, i, i, i};
+#endif
+}
+
+ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
+#endif
+}

 #if defined(__KERNEL_METAL__)
 /* Metal has native packed_int3. */
@@ -61,7 +106,7 @@ typedef int3 packed_int3;
 struct packed_int3 {
  int x, y, z;

-  ccl_device_inline_method packed_int3(){};
+  ccl_device_inline_method packed_int3() = default;

  ccl_device_inline_method packed_int3(const int px, const int py, const int pz)
      : x(px), y(py), z(pz){};
@@ -82,12 +127,27 @@ struct packed_int3 {
  }

 #  ifndef __KERNEL_GPU__
-  __forceinline int operator[](int i) const;
-  __forceinline int &operator[](int i);
+  __forceinline int operator[](int i) const
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+
+  __forceinline int &operator[](int i)
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

 static_assert(sizeof(packed_int3) == 12, "packed_int3 expected to be exactly 12 bytes");
 #endif

+ccl_device_inline packed_int3 make_packed_int3(int x, int y, int z)
+{
+  packed_int3 a = {x, y, z};
+  return a;
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int3_impl.h
+++ b/intern/cycles/util/types_int3_impl.h
@@ -1,103 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifdef __KERNEL_SSE__
-__forceinline int3::int3() {}
-
-__forceinline int3::int3(const __m128i &a) : m128(a) {}
-
-__forceinline int3::int3(const int3 &a) : m128(a.m128) {}
-
-__forceinline int3::operator const __m128i &() const
-{
-  return m128;
-}
-
-__forceinline int3::operator __m128i &()
-{
-  return m128;
-}
-
-__forceinline int3 &int3::operator=(const int3 &a)
-{
-  m128 = a.m128;
-  return *this;
-}
-#  endif /* __KERNEL_SSE__ */
-
-#  ifndef __KERNEL_GPU__
-__forceinline int int3::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline int &int3::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline int packed_int3::operator[](int i) const
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline int &packed_int3::operator[](int i)
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline int3 make_int3(int x, int y, int z)
-{
-#  if defined(__KERNEL_GPU__)
-  return {x, y, z};
-#  elif defined(__KERNEL_SSE__)
-  return int3(_mm_set_epi32(0, z, y, x));
-#  else
-  return {x, y, z, 0};
-#  endif
-}
-
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-ccl_device_inline int3 make_int3(int i)
-{
-#if defined(__KERNEL_GPU__)
-  return make_int3(i, i, i);
-#elif defined(__KERNEL_SSE__)
-  return int3(_mm_set1_epi32(i));
-#else
-  return {i, i, i, i};
-#endif
-}
-
-ccl_device_inline packed_int3 make_packed_int3(int x, int y, int z)
-{
-  packed_int3 a = {x, y, z};
-  return a;
-}
-
-ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int4.h
+++ b/intern/cycles/util/types_int4.h
@@ -4,17 +4,12 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

 #ifndef __KERNEL_NATIVE_VECTOR_TYPES__

-struct float3;
-struct float4;
-
 struct ccl_try_align(16) int4
 {
 #  ifdef __KERNEL_SSE__
@@ -25,30 +20,68 @@ struct ccl_try_align(16) int4
    };
  };

-  __forceinline int4();
-  __forceinline int4(const int4 &a);
-  __forceinline explicit int4(const __m128i &a);
+  __forceinline int4() = default;
+  __forceinline int4(const int4 &a) = default;
+  __forceinline explicit int4(const __m128i &a) : m128(a) {}

-  __forceinline operator const __m128i &() const;
-  __forceinline operator __m128i &();
+  __forceinline operator const __m128i &() const
+  {
+    return m128;
+  }
+  __forceinline operator __m128i &()
+  {
+    return m128;
+  }

-  __forceinline int4 &operator=(const int4 &a);
+  __forceinline int4 &operator=(const int4 &a)
+  {
+    m128 = a.m128;
+    return *this;
+  }
 #  else  /* __KERNEL_SSE__ */
  int x, y, z, w;
 #  endif /* __KERNEL_SSE__ */

 #  ifndef __KERNEL_GPU__
-  __forceinline int operator[](int i) const;
-  __forceinline int &operator[](int i);
+  __forceinline int operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 4);
+    return *(&x + i);
+  }
+  __forceinline int &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 4);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline int4 make_int4(int x, int y, int z, int w);
+ccl_device_inline int4 make_int4(int x, int y, int z, int w)
+{
+#  ifdef __KERNEL_SSE__
+  return int4(_mm_set_epi32(w, z, y, x));
+#  else
+  return {x, y, z, w};
+#  endif
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

-ccl_device_inline int4 make_int4(int i);
-ccl_device_inline int4 make_int4(const float3 f);
-ccl_device_inline int4 make_int4(const float4 f);
-ccl_device_inline void print_int4(ccl_private const char *label, const int4 a);
+ccl_device_inline int4 make_int4(int i)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_set1_epi32(i));
+#else
+  return make_int4(i, i, i, i);
+#endif
+}
+
+ccl_device_inline void print_int4(ccl_private const char *label, const int4 a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
+#endif
+}

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int4_impl.h
+++ b/intern/cycles/util/types_int4_impl.h
@@ -1,101 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifdef __KERNEL_SSE__
-__forceinline int4::int4() {}
-
-__forceinline int4::int4(const int4 &a) : m128(a.m128) {}
-
-__forceinline int4::int4(const __m128i &a) : m128(a) {}
-
-__forceinline int4::operator const __m128i &() const
-{
-  return m128;
-}
-
-__forceinline int4::operator __m128i &()
-{
-  return m128;
-}
-
-__forceinline int4 &int4::operator=(const int4 &a)
-{
-  m128 = a.m128;
-  return *this;
-}
-#  endif /* __KERNEL_SSE__ */
-
-#  ifndef __KERNEL_GPU__
-__forceinline int int4::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 4);
-  return *(&x + i);
-}
-
-__forceinline int &int4::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 4);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline int4 make_int4(int x, int y, int z, int w)
-{
-#  ifdef __KERNEL_SSE__
-  return int4(_mm_set_epi32(w, z, y, x));
-#  else
-  return {x, y, z, w};
-#  endif
-}
-
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-ccl_device_inline int4 make_int4(int i)
-{
-#ifdef __KERNEL_SSE__
-  return int4(_mm_set1_epi32(i));
-#else
-  return make_int4(i, i, i, i);
-#endif
-}
-
-ccl_device_inline int4 make_int4(const float3 f)
-{
-#if defined(__KERNEL_GPU__)
-  return make_int4((int)f.x, (int)f.y, (int)f.z, 0);
-#elif defined(__KERNEL_SSE__)
-  return int4(_mm_cvtps_epi32(f.m128));
-#else
-  return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
-#endif
-}
-
-ccl_device_inline int4 make_int4(const float4 f)
-{
-#ifdef __KERNEL_SSE__
-  return int4(_mm_cvtps_epi32(f.m128));
-#else
-  return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
-#endif
-}
-
-ccl_device_inline void print_int4(ccl_private const char *label, const int4 a)
-{
-#ifdef __KERNEL_PRINTF__
-  printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int8.h
+++ b/intern/cycles/util/types_int8.h
@@ -4,9 +4,8 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"
+#include "util/types_int4.h"

 CCL_NAMESPACE_BEGIN

@@ -26,27 +25,69 @@ struct ccl_try_align(32) vint8
    };
  };

-  __forceinline vint8();
-  __forceinline vint8(const vint8 &a);
-  __forceinline explicit vint8(const __m256i &a);
+  __forceinline vint8() = default;
+  __forceinline vint8(const vint8 &a) = default;
+  __forceinline explicit vint8(const __m256i &a) : m256(a) {}

-  __forceinline operator const __m256i &() const;
-  __forceinline operator __m256i &();
+  __forceinline operator const __m256i &() const
+  {
+    return m256;
+  }
+  __forceinline operator __m256i &()
+  {
+    return m256;
+  }

-  __forceinline vint8 &operator=(const vint8 &a);
+  __forceinline vint8 &operator=(const vint8 &a)
+  {
+    m256 = a.m256;
+    return *this;
+  }
 #else  /* __KERNEL_AVX__ */
  int a, b, c, d, e, f, g, h;
 #endif /* __KERNEL_AVX__ */

 #ifndef __KERNEL_GPU__
-  __forceinline int operator[](int i) const;
-  __forceinline int &operator[](int i);
+  __forceinline int operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 8);
+    return *(&a + i);
+  }
+  __forceinline int &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 8);
+    return *(&a + i);
+  }
 #endif
 };

-ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h);
-ccl_device_inline vint8 make_vint8(int i);
-ccl_device_inline vint8 make_vint8(const vfloat8 f);
-ccl_device_inline vint8 make_vint8(const int4 a, const int4 b);
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
+{
+#ifdef __KERNEL_AVX__
+  return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
+#else
+  return {a, b, c, d, e, f, g, h};
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(int i)
+{
+#ifdef __KERNEL_AVX__
+  return vint8(_mm256_set1_epi32(i));
+#else
+  return make_vint8(i, i, i, i, i, i, i, i);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
+{
+#ifdef __KERNEL_AVX__
+  return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
+#else
+  return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int8_impl.h
+++ b/intern/cycles/util/types_int8_impl.h
@@ -1,90 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_AVX__
-__forceinline vint8::vint8() {}
-
-__forceinline vint8::vint8(const vint8 &a) : m256(a.m256) {}
-
-__forceinline vint8::vint8(const __m256i &a) : m256(a) {}
-
-__forceinline vint8::operator const __m256i &() const
-{
-  return m256;
-}
-
-__forceinline vint8::operator __m256i &()
-{
-  return m256;
-}
-
-__forceinline vint8 &vint8::operator=(const vint8 &a)
-{
-  m256 = a.m256;
-  return *this;
-}
-#endif /* __KERNEL_AVX__ */
-
-#ifndef __KERNEL_GPU__
-__forceinline int vint8::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 8);
-  return *(&a + i);
-}
-
-__forceinline int &vint8::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 8);
-  return *(&a + i);
-}
-#endif
-
-ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
-{
-#ifdef __KERNEL_AVX__
-  return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
-#else
-  return {a, b, c, d, e, f, g, h};
-#endif
-}
-
-ccl_device_inline vint8 make_vint8(int i)
-{
-#ifdef __KERNEL_AVX__
-  return vint8(_mm256_set1_epi32(i));
-#else
-  return make_vint8(i, i, i, i, i, i, i, i);
-#endif
-}
-
-ccl_device_inline vint8 make_vint8(const vfloat8 f)
-{
-#ifdef __KERNEL_AVX__
-  return vint8(_mm256_cvtps_epi32(f.m256));
-#else
-  return make_vint8(
-      (int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
-#endif
-}
-
-ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
-{
-#ifdef __KERNEL_AVX__
-  return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
-#else
-  return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
-#endif
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_spectrum.h
+++ b/intern/cycles/util/types_spectrum.h
@@ -4,25 +4,21 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_float3.h"

 CCL_NAMESPACE_BEGIN

 #define SPECTRUM_CHANNELS 3
-#define SPECTRUM_DATA_TYPE float3
-#define PACKED_SPECTRUM_DATA_TYPE packed_float3

-using Spectrum = SPECTRUM_DATA_TYPE;
-using PackedSpectrum = PACKED_SPECTRUM_DATA_TYPE;
+using Spectrum = float3;
+using PackedSpectrum = packed_float3;

-#define make_spectrum(f) CONCAT(make_, SPECTRUM_DATA_TYPE(f))
-#define load_spectrum(f) CONCAT(load_, SPECTRUM_DATA_TYPE(f))
-#define store_spectrum(s, f) CONCAT(store_, SPECTRUM_DATA_TYPE((s), (f)))
+#define make_spectrum(f) make_float3(f)
+#define load_spectrum(f) load_float3(f)
+#define store_spectrum(s, f) store_float3(f)

-#define zero_spectrum CONCAT(zero_, SPECTRUM_DATA_TYPE)
-#define one_spectrum CONCAT(one_, SPECTRUM_DATA_TYPE)
+#define zero_spectrum zero_float3
+#define one_spectrum one_float3

 #define FOREACH_SPECTRUM_CHANNEL(counter) \
  for (int counter = 0; counter < SPECTRUM_CHANNELS; counter++)
--- a/intern/cycles/util/types_uchar2.h
+++ b/intern/cycles/util/types_uchar2.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,27 @@ struct uchar2 {
  uchar x, y;

 #  ifndef __KERNEL_GPU__
-  __forceinline uchar operator[](int i) const;
-  __forceinline uchar &operator[](int i);
+  __forceinline uchar operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 2);
+    return *(&x + i);
+  }
+
+  __forceinline uchar &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 2);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);
+ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
+{
+  uchar2 a = {x, y};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uchar2_impl.h
+++ b/intern/cycles/util/types_uchar2_impl.h
@@ -1,37 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-uchar uchar2::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 2);
-  return *(&x + i);
-}
-
-uchar &uchar2::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 2);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
-{
-  uchar2 a = {x, y};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uchar3.h
+++ b/intern/cycles/util/types_uchar3.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,27 @@ struct uchar3 {
  uchar x, y, z;

 #  ifndef __KERNEL_GPU__
-  __forceinline uchar operator[](int i) const;
-  __forceinline uchar &operator[](int i);
+  __forceinline uchar operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+
+  __forceinline uchar &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);
+ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
+{
+  uchar3 a = {x, y, z};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uchar3_impl.h
+++ b/intern/cycles/util/types_uchar3_impl.h
@@ -1,37 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-uchar uchar3::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-uchar &uchar3::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 3);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
-{
-  uchar3 a = {x, y, z};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uchar4.h
+++ b/intern/cycles/util/types_uchar4.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,27 @@ struct uchar4 {
  uchar x, y, z, w;

 #  ifndef __KERNEL_GPU__
-  __forceinline uchar operator[](int i) const;
-  __forceinline uchar &operator[](int i);
+  __forceinline uchar operator[](int i) const
+  {
+    util_assert(i >= 0);
+    util_assert(i < 4);
+    return *(&x + i);
+  }
+
+  __forceinline uchar &operator[](int i)
+  {
+    util_assert(i >= 0);
+    util_assert(i < 4);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);
+ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
+{
+  uchar4 a = {x, y, z, w};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uchar4_impl.h
+++ b/intern/cycles/util/types_uchar4_impl.h
@@ -1,37 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-uchar uchar4::operator[](int i) const
-{
-  util_assert(i >= 0);
-  util_assert(i < 4);
-  return *(&x + i);
-}
-
-uchar &uchar4::operator[](int i)
-{
-  util_assert(i >= 0);
-  util_assert(i < 4);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
-{
-  uchar4 a = {x, y, z, w};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint2.h
+++ b/intern/cycles/util/types_uint2.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,25 @@ struct uint2 {
  uint x, y;

 #  ifndef __KERNEL_GPU__
-  __forceinline uint operator[](uint i) const;
-  __forceinline uint &operator[](uint i);
+  __forceinline uint operator[](uint i) const
+  {
+    util_assert(i < 2);
+    return *(&x + i);
+  }
+
+  __forceinline uint &operator[](uint i)
+  {
+    util_assert(i < 2);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline uint2 make_uint2(uint x, uint y);
+ccl_device_inline uint2 make_uint2(uint x, uint y)
+{
+  uint2 a = {x, y};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint2_impl.h
+++ b/intern/cycles/util/types_uint2_impl.h
@@ -1,35 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-__forceinline uint uint2::operator[](uint i) const
-{
-  util_assert(i < 2);
-  return *(&x + i);
-}
-
-__forceinline uint &uint2::operator[](uint i)
-{
-  util_assert(i < 2);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline uint2 make_uint2(uint x, uint y)
-{
-  uint2 a = {x, y};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint3.h
+++ b/intern/cycles/util/types_uint3.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,25 @@ struct uint3 {
  uint x, y, z;

 #  ifndef __KERNEL_GPU__
-  __forceinline uint operator[](uint i) const;
-  __forceinline uint &operator[](uint i);
+  __forceinline uint operator[](uint i) const
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+
+  __forceinline uint &operator[](uint i)
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
+ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
+{
+  uint3 a = {x, y, z};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 #if defined(__KERNEL_METAL__)
@@ -33,7 +44,7 @@ typedef uint3 packed_uint3;
 struct packed_uint3 {
  uint x, y, z;

-  ccl_device_inline_method packed_uint3(){};
+  ccl_device_inline_method packed_uint3() = default;

  ccl_device_inline_method packed_uint3(const uint px, const uint py, const uint pz)
      : x(px), y(py), z(pz){};
@@ -54,11 +65,27 @@ struct packed_uint3 {
  }

 #  ifndef __KERNEL_GPU__
-  __forceinline uint operator[](uint i) const;
-  __forceinline uint &operator[](uint i);
+  __forceinline uint operator[](uint i) const
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+
+  __forceinline uint &operator[](uint i)
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

 static_assert(sizeof(packed_uint3) == 12, "packed_uint3 expected to be exactly 12 bytes");
 #endif
+
+ccl_device_inline packed_uint3 make_packed_uint3(uint x, uint y, uint z)
+{
+  packed_uint3 a = {x, y, z};
+  return a;
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint3_impl.h
+++ b/intern/cycles/util/types_uint3_impl.h
@@ -1,53 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-__forceinline uint uint3::operator[](uint i) const
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline uint &uint3::operator[](uint i)
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline uint packed_uint3::operator[](uint i) const
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline uint &packed_uint3::operator[](uint i)
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
-{
-  uint3 a = {x, y, z};
-  return a;
-}
-
-ccl_device_inline packed_uint3 make_packed_uint3(uint x, uint y, uint z)
-{
-  packed_uint3 a = {x, y, z};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint4.h
+++ b/intern/cycles/util/types_uint4.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/types_base.h"

 CCL_NAMESPACE_BEGIN

@@ -15,12 +13,25 @@ struct uint4 {
  uint x, y, z, w;

 #  ifndef __KERNEL_GPU__
-  __forceinline uint operator[](uint i) const;
-  __forceinline uint &operator[](uint i);
+  __forceinline uint operator[](uint i) const
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
+
+  __forceinline uint &operator[](uint i)
+  {
+    util_assert(i < 3);
+    return *(&x + i);
+  }
 #  endif
 };

-ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);
+ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
+{
+  uint4 a = {x, y, z, w};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint4_impl.h
+++ b/intern/cycles/util/types_uint4_impl.h
@@ -1,35 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
-#  ifndef __KERNEL_GPU__
-__forceinline uint uint4::operator[](uint i) const
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-
-__forceinline uint &uint4::operator[](uint i)
-{
-  util_assert(i < 3);
-  return *(&x + i);
-}
-#  endif
-
-ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
-{
-  uint4 a = {x, y, z, w};
-  return a;
-}
-#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/util/types_ushort4.h
+++ b/intern/cycles/util/types_ushort4.h
@@ -4,9 +4,7 @@

 #pragma once

-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
+#include "util/defines.h"

 CCL_NAMESPACE_BEGIN