Cycles: several small fixes and additions for MSL

This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263
2021-11-18 14:25:05 +01:00
parent c0d52db783
commit d19e35873f
28 changed files with 414 additions and 215 deletions
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
    swapped = false;
    for (int j = 0; j < num_hits - 1; ++j) {
      if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp_hit = hits[j];
+        Intersection tmp_hit = hits[j];
        float3 tmp_Ng = Ng[j];
        hits[j] = hits[j + 1];
        Ng[j] = Ng[j + 1];
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -86,7 +86,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */

--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
+    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
  }
 }

@@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  const auto can_split_mask = ccl_gpu_ballot(can_split);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
+    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
  }
 }
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

    /* For each thread within a warp compute how many other active states precede it. */
-    const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
-                                            ccl_gpu_thread_mask(thread_warp));
+    const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                        ccl_gpu_thread_mask(thread_warp));

    /* Last thread in warp stores number of active states for each warp. */
    if (thread_warp == ccl_gpu_warp_size - 1) {
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -85,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;

 #pragma clang diagnostic ignored "-Wunused-variable"
 #pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"

 /* Qualifiers */

@@ -65,7 +66,7 @@ using namespace metal;
 #define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)

 #define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
-#define ccl_gpu_popc(x) popcount(x)
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);

 // clang-format off

@@ -124,7 +125,6 @@ kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
-  INIT_DEBUG_BUFFER \
  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
 } \
 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
@@ -230,6 +230,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 #define sinhf(x) sinh(float(x))
 #define coshf(x) cosh(float(x))
 #define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))

 /* Use native functions with possibly lower precision for performance,
 * no issues found so far. */
@@ -243,6 +244,8 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \

 #define NULL 0

+#define __device__
+
 /* texture bindings and sampler setup */

 struct Texture2DParamsMetal {
@@ -257,6 +260,9 @@ struct MetalAncillaries {
  device Texture3DParamsMetal *textures_3d;
 };

+#include "util/half.h"
+#include "util/types.h"
+
 enum SamplerType {
  SamplerFilterNearest_AddressRepeat,
  SamplerFilterNearest_AddressClampEdge,
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN

 typedef struct KernelParamsMetal {

-#define KERNEL_TEX(type, name) ccl_constant type *name;
+#define KERNEL_TEX(type, name) ccl_global const type *name;
 #include "kernel/textures.h"
 #undef KERNEL_TEX

--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -87,7 +87,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */

--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -160,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);

-  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+  return atomic_fetch_and_add_uint32(
+             (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
         sample_offset;
 }

--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
 * Lookup of attributes is different between OSL and SVM, as OSL is ustring
 * based while for SVM we use integer ids. */

-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, -1 if not subdivision triangle */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}

 ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@

 CCL_NAMESPACE_BEGIN

-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
 /* UV coords of triangle within patch */

 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN

 /* Linear Congruential Generator */

-ccl_device uint lcg_step_uint(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device uint lcg_step_uint(T rng)
 {
  /* implicit mod 2^32 */
  *rng = (1103515245 * (*rng) + 12345);
  return *rng;
 }

-ccl_device float lcg_step_float(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device float lcg_step_float(T rng)
 {
  /* implicit mod 2^32 */
  *rng = (1103515245 * (*rng) + 12345);
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
    /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
     * We can use this to get divide sample sequence into two classes for easier variance
     * estimation. */
-#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
-    return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
-#elif defined(__NVCC__)
-    return __popc(sample & 0xaaaaaaaa) & 1;
-#else
-    /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
-    int i = sample & 0xaaaaaaaa;
-    i = i - ((i >> 1) & 0x55555555);
-    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-    i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
-    return i & 1;
-#endif
+    return popcount(uint(sample) & 0xaaaaaaaa) & 1;
  }
  else {
    /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN
 template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
 ccl_device void svm_eval_nodes(KernelGlobals kg,
                               ConstIntegratorGenericState state,
-                               ShaderData *sd,
+                               ccl_private ShaderData *sd,
                               ccl_global float *render_buffer,
                               uint32_t path_flag)
 {
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,

 #  endif /* __KERNEL_CUDA__ */

+#  ifdef __KERNEL_METAL__
+
+// global address space versions
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
+                                                   const float operand)
+{
+  volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
+  union {
+    int int_value;
+    float float_value;
+  } new_value, prev_value;
+  prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
+  do {
+    new_value.float_value = prev_value.float_value + operand;
+  } while (!atomic_compare_exchange_weak_explicit(source,
+                                                  &prev_value.int_value,
+                                                  new_value.int_value,
+                                                  memory_order_relaxed,
+                                                  memory_order_relaxed));
+
+  return new_value.float_value;
+}
+
+#    define atomic_fetch_and_add_uint32(p, x) \
+      atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+#    define atomic_fetch_and_sub_uint32(p, x) \
+      atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+#    define atomic_fetch_and_inc_uint32(p) \
+      atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
+#    define atomic_fetch_and_dec_uint32(p) \
+      atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
+#    define atomic_fetch_and_or_uint32(p, x) \
+      atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+
+ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
+                                                      const float old_val,
+                                                      const float new_val)
+{
+  int prev_value;
+  prev_value = __float_as_int(old_val);
+  atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest,
+                                        &prev_value,
+                                        __float_as_int(new_val),
+                                        memory_order_relaxed,
+                                        memory_order_relaxed);
+  return __int_as_float(prev_value);
+}
+
+#    define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
+#    define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
+
+#    define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
+#    define ccl_barrier(flags) threadgroup_barrier(flags)
+
+#  endif /* __KERNEL_METAL__ */
+
 #endif /* __KERNEL_GPU__ */

 #endif /* __UTIL_ATOMIC_H__ */
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false)
  reset();
 }

+DebugFlags::Metal::Metal() : adaptive_compile(false)
+{
+  reset();
+}
+
 void DebugFlags::CUDA::reset()
 {
  if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
@@ -76,6 +81,12 @@ void DebugFlags::HIP::reset()
    adaptive_compile = true;
 }

+void DebugFlags::Metal::reset()
+{
+  if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
+    adaptive_compile = true;
+}
+
 DebugFlags::OptiX::OptiX()
 {
  reset();
@@ -97,6 +108,7 @@ void DebugFlags::reset()
  cpu.reset();
  cuda.reset();
  optix.reset();
+  metal.reset();
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -116,6 +116,17 @@ class DebugFlags {
    bool use_debug;
  };

+  /* Descriptor of Metal feature-set to be used. */
+  struct Metal {
+    Metal();
+
+    /* Reset flags to their defaults. */
+    void reset();
+
+    /* Whether adaptive feature based runtime compile is enabled or not.*/
+    bool adaptive_compile;
+  };
+
  /* Get instance of debug flags registry. */
  static DebugFlags &get()
  {
@@ -138,6 +149,9 @@ class DebugFlags {
  /* Requested HIP flags. */
  HIP hip;

+  /* Requested Metal flags. */
+  Metal metal;
+
 private:
  DebugFlags();

--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN

 /* Half Floats */

+#if defined(__KERNEL_METAL__)
+
+ccl_device_inline float half_to_float(half h_in)
+{
+  float f;
+  union {
+    half h;
+    uint16_t s;
+  } val;
+  val.h = h_in;
+
+  *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) |
+                             ((val.s & 0x03FF) << 13);
+
+  return f;
+}
+
+#else
+
 /* CUDA has its own half data type, no need to define then */
-#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
+#  if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
 /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
 * unsigned shorts. */
 class half {
@@ -53,11 +72,12 @@ class half {
 private:
  unsigned short v;
 };
-#endif
+#  endif

 struct half4 {
  half x, y, z, w;
 };
+#endif

 /* Conversion to/from half float for image textures
 *
@@ -66,7 +86,9 @@ struct half4 {

 ccl_device_inline half float_to_half_image(float f)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
  return __float2half(f);
 #else
  const uint u = __float_as_uint(f);
@@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f)

 ccl_device_inline float half_to_float_image(half h)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half_to_float(h);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
  return __half2float(h);
 #else
  const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
@@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)

 ccl_device_inline half float_to_half_display(const float f)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
  return __float2half(f);
 #else
  const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -30,9 +30,11 @@
 #  include <hip/hip_vector_types.h>
 #endif

-#include <float.h>
-#include <math.h>
-#include <stdio.h>
+#if !defined(__KERNEL_METAL__)
+#  include <float.h>
+#  include <math.h>
+#  include <stdio.h>
+#endif /* !defined(__KERNEL_METAL__) */

 #include "util/types.h"

@@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d)
  return max(max(a, b), max(c, d));
 }

+#if !defined(__KERNEL_METAL__)
 /* Int/Float conversion */

 ccl_device_inline int as_int(uint i)
@@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f)
  return u.i;
 }

-#ifndef __HIP__
+#  ifndef __HIP__
 ccl_device_inline int __float_as_int(float f)
 {
  union {
@@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i)
  u.i = i;
  return u.f;
 }
-#endif
+#  endif

 ccl_device_inline int4 __float4_as_int4(float4 f)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(f.m128));
-#else
+#  else
  return make_int4(
      __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-#endif
+#  endif
 }

 ccl_device_inline float4 __int4_as_float4(int4 i)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_castsi128_ps(i.m128));
-#else
+#  else
  return make_float4(
      __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-#endif
+#  endif
 }
+#endif /* !defined(__KERNEL_METAL__) */

+#if defined(__KERNEL_METAL__)
+#  define isnan_safe(v) isnan(v)
+#  define isfinite_safe(v) isfinite(v)
+#else
 template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
 {
  return ((uint64_t)ptr) & 0xFFFFFFFF;
@@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f)
  unsigned int x = __float_as_uint(f);
  return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
 }
+#endif

 ccl_device_inline float ensure_finite(float v)
 {
  return isfinite_safe(v) ? v : 0.0f;
 }

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int clamp(int a, int mn, int mx)
 {
  return min(max(a, mn), mx);
@@ -346,16 +356,18 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
  return result;
 }

-#ifndef __KERNEL_CUDA__
-ccl_device_inline float saturatef(float a)
-{
-  return clamp(a, 0.0f, 1.0f);
-}
-#else
+#endif /* !defined(__KERNEL_METAL__) */
+
+#if defined(__KERNEL_CUDA__)
 ccl_device_inline float saturatef(float a)
 {
  return __saturatef(a);
 }
+#elif !defined(__KERNEL_METAL__)
+ccl_device_inline float saturatef(float a)
+{
+  return clamp(a, 0.0f, 1.0f);
+}
 #endif /* __KERNEL_CUDA__ */

 ccl_device_inline int float_to_int(float f)
@@ -491,6 +503,7 @@ CCL_NAMESPACE_END

 CCL_NAMESPACE_BEGIN

+#if !defined(__KERNEL_METAL__)
 /* Interpolation */

 template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -498,6 +511,8 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
  return (A)(a * ((B)1 - t) + b * t);
 }

+#endif /* __KERNEL_METAL__ */
+
 /* Triangle */

 ccl_device_inline float triangle_area(ccl_private const float3 &v1,
@@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f)

 ccl_device_inline float inversesqrtf(float f)
 {
+#if defined(__KERNEL_METAL__)
+  return (f > 0.0f) ? rsqrt(f) : 0.0f;
+#else
  return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
+#endif
 }

 ccl_device float safe_asinf(float a)
@@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits)
  return bits * (1.0f / (float)0xFFFFFFFF);
 }

+#if !defined(__KERNEL_GPU__)
+#  if defined(__GNUC__)
+#    define popcount(x) __builtin_popcount(x)
+#  else
+ccl_device_inline uint popcount(uint x)
+{
+  /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
+  uint i = x & 0xaaaaaaaa;
+  i = i - ((i >> 1) & 0x55555555);
+  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+  i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+  return i & 1;
+}
+#  endif
+#elif !defined(__KERNEL_METAL__)
+#  define popcount(x) __popc(x)
+#endif
+
 ccl_device_inline uint count_leading_zeros(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
  return __clz(x);
+#elif defined(__KERNEL_METAL__)
+  return clz(x);
 #else
  assert(x != 0);
 #  ifdef _MSC_VER
@@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
  return (__ffs(x) - 1);
+#elif defined(__KERNEL_METAL__)
+  return ctz(x);
 #else
  assert(x != 0);
 #  ifdef _MSC_VER
@@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
  return __ffs(x);
+#elif defined(__KERNEL_METAL__)
+  return (x != 0) ? ctz(x) + 1 : 0;
 #else
 #  ifdef _MSC_VER
  return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
  return x;
 #elif defined(__KERNEL_CUDA__)
  return __brev(x);
+#elif defined(__KERNEL_METAL__)
+  return reverse_bits(x);
 #elif __has_builtin(__builtin_bitreverse32)
  return __builtin_bitreverse32(x);
 #else
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
 * Declaration.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float2 operator-(const float2 &a);
 ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
 ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a);
 ccl_device_inline float2 as_float2(const float4 &a);
 ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
 ccl_device_inline float2 floor(const float2 &a);
+#endif /* !__KERNEL_METAL__ */

 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);

@@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2()
  return make_float2(1.0f, 1.0f);
 }

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float2 operator-(const float2 &a)
 {
  return make_float2(-a.x, -a.y);
@@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a)
  return make_float2(floorf(a.x), floorf(a.y));
 }

+#endif /* !__KERNEL_METAL__ */
+
 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
 {
  return (b != 0.0f) ? a / b : zero_float2();
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
 * Declaration.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 operator-(const float3 &a);
 ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
 ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a);
 ccl_device_inline float3 sqrt(const float3 &a);
 ccl_device_inline float3 floor(const float3 &a);
 ccl_device_inline float3 ceil(const float3 &a);
+ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
+#endif /* !defined(__KERNEL_METAL__) */

 ccl_device_inline float min3(float3 a);
 ccl_device_inline float max3(float3 a);
 ccl_device_inline float len(const float3 a);
 ccl_device_inline float len_squared(const float3 a);

-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
 ccl_device_inline float3 project(const float3 v, const float3 v_proj);

 ccl_device_inline float3 saturate3(float3 a);
 ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
 ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
 ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
 ccl_device_inline float3 interp(float3 a, float3 b, float t);
@@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3()
  return make_float3(1.0f, 1.0f, 1.0f);
 }

+#if defined(__KERNEL_METAL__)
+
+ccl_device_inline float3 rcp(float3 a)
+{
+  return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
+}
+
+#else
+
 ccl_device_inline float3 operator-(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#else
+#  else
  return make_float3(-a.x, -a.y, -a.z);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_mul_ps(a.m128, b.m128));
-#else
+#  else
  return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator*(const float3 &a, const float f)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-#else
+#  else
  return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator*(const float f, const float3 &a)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
  return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-#else
+#  else
  return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator/(const float f, const float3 &a)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
  return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-#else
+#  else
  return make_float3(f / a.x, f / a.y, f / a.z);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)

 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
  return float3(_mm_div_ps(a.m128, b.m128));
-#else
+#  else
  return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)

 ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_add_ps(a.m128, b.m128));
-#else
+#  else
  return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)

 ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_sub_ps(a.m128, b.m128));
-#else
+#  else
  return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+#  endif
 }

 ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -250,11 +261,11 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)

 ccl_device_inline bool operator==(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-#else
+#  else
  return (a.x == b.x && a.y == b.y && a.z == b.z);
-#endif
+#  endif
 }

 ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -269,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)

 ccl_device_inline float dot(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-#else
+#  else
  return a.x * b.x + a.y * b.y + a.z * b.z;
-#endif
+#  endif
 }

 ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
  return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-#else
+#  else
  return a.x * b.x + a.y * b.y;
-#endif
+#  endif
 }

 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -293,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)

 ccl_device_inline float3 normalize(const float3 &a)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
  __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
  return float3(_mm_div_ps(a.m128, norm));
-#else
+#  else
  return a / len(a);
-#endif
+#  endif
 }

 ccl_device_inline float3 min(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_min_ps(a.m128, b.m128));
-#else
+#  else
  return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+#  endif
 }

 ccl_device_inline float3 max(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_max_ps(a.m128, b.m128));
-#else
+#  else
  return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+#  endif
 }

 ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -326,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &

 ccl_device_inline float3 fabs(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
-#  ifdef __KERNEL_NEON__
+#  ifdef __KERNEL_SSE__
+#    ifdef __KERNEL_NEON__
  return float3(vabsq_f32(a.m128));
-#  else
+#    else
  __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  return float3(_mm_and_ps(a.m128, mask));
-#  endif
-#else
+#    endif
+#  else
  return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
-#endif
+#  endif
 }

 ccl_device_inline float3 sqrt(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_sqrt_ps(a));
-#else
+#  else
  return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-#endif
+#  endif
 }

 ccl_device_inline float3 floor(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_floor_ps(a));
-#else
+#  else
  return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-#endif
+#  endif
 }

 ccl_device_inline float3 ceil(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float3(_mm_ceil_ps(a));
-#else
+#  else
  return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-#endif
+#  endif
 }

 ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -372,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)

 ccl_device_inline float3 rcp(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  /* Don't use _mm_rcp_ps due to poor precision. */
  return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+#  else
  return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-#endif
+#  endif
 }
+#endif /* !__KERNEL_METAL__ */

 ccl_device_inline float min3(float3 a)
 {
@@ -404,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a)
  return dot(a, a);
 }

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
 {
  float3 unit_normal = normalize(normal);
@@ -425,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector,
 {
  return (dot(reference, incident) < 0.0f) ? vector : -vector;
 }
+#endif

 ccl_device_inline float3 project(const float3 v, const float3 v_proj)
 {
@@ -505,7 +519,11 @@ ccl_device_inline float average(const float3 a)

 ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
 {
+#if defined(__KERNEL_METAL__)
+  return all(a == b);
+#else
  return a == b;
+#endif
 }

 ccl_device_inline float3 pow3(float3 v, float e)
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
 * Declaration.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a);
 ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
 ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 ccl_device_inline float4 fabs(const float4 &a);
 ccl_device_inline float4 floor(const float4 &a);
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
+#endif /* !__KERNEL_METAL__*/

 ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);

@@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4()
  return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
 }

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  return float4(_mm_xor_ps(a.m128, mask));
-#else
+#  else
  return make_float4(-a.x, -a.y, -a.z, -a.w);
-#endif
+#  endif
 }

 ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_mul_ps(a.m128, b.m128));
-#else
+#  else
  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-#endif
+#  endif
 }

 ccl_device_inline float4 operator*(const float4 &a, float f)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
  return a * make_float4(f);
-#else
+#  else
  return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-#endif
+#  endif
 }

 ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)

 ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_div_ps(a.m128, b.m128));
-#else
+#  else
  return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-#endif
+#  endif
 }

 ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -164,11 +167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)

 ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_add_ps(a.m128, b.m128));
-#else
+#  else
  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-#endif
+#  endif
 }

 ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)

 ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_sub_ps(a.m128, b.m128));
-#else
+#  else
  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-#endif
+#  endif
 }

 ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)

 ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-#else
+#  else
  return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-#endif
+#  endif
 }

 ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-#else
+#  else
  return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-#endif
+#  endif
 }

 ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-#else
+#  else
  return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-#endif
+#  endif
 }

 ccl_device_inline bool operator==(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-#else
+#  else
  return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-#endif
+#  endif
 }

 ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)

 ccl_device_inline float dot(const float4 &a, const float4 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
  __m128 t = vmulq_f32(a, b);
  return vaddvq_f32(t);
-#  else
+#    else
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-#  endif
-#else
+#    endif
+#  else
  return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
-#endif
+#  endif
 }

 ccl_device_inline float len_squared(const float4 &a)
@@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a)

 ccl_device_inline float4 rcp(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  /* Don't use _mm_rcp_ps due to poor precision. */
  return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+#  else
  return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-#endif
+#  endif
 }

 ccl_device_inline float4 sqrt(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_sqrt_ps(a.m128));
-#else
+#  else
  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-#endif
+#  endif
 }

 ccl_device_inline float4 sqr(const float4 &a)
@@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a)

 ccl_device_inline float4 cross(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
         (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-#else
+#  else
  return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-#endif
+#  endif
 }

 ccl_device_inline bool is_zero(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return a == make_float4(0.0f);
-#else
+#  else
  return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#endif
+#  endif
 }

 ccl_device_inline float4 reduce_add(const float4 &a)
 {
-#if defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
  return float4(vdupq_n_f32(vaddvq_f32(a)));
-#  elif defined(__KERNEL_SSE3__)
+#    elif defined(__KERNEL_SSE3__)
  float4 h(_mm_hadd_ps(a.m128, a.m128));
  return float4(_mm_hadd_ps(h.m128, h.m128));
-#  else
+#    else
  float4 h(shuffle<1, 0, 3, 2>(a) + a);
  return shuffle<2, 3, 0, 1>(h) + h;
-#  endif
-#else
+#    endif
+#  else
  float sum = (a.x + a.y) + (a.z + a.w);
  return make_float4(sum, sum, sum, sum);
-#endif
+#  endif
 }

 ccl_device_inline float average(const float4 &a)
@@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)

 ccl_device_inline float4 min(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_min_ps(a.m128, b.m128));
-#else
+#  else
  return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
+#  endif
 }

 ccl_device_inline float4 max(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_max_ps(a.m128, b.m128));
-#else
+#  else
  return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
+#  endif
 }

 ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &

 ccl_device_inline float4 fabs(const float4 &a)
 {
-#if defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
  return float4(vabsq_f32(a));
-#  else
+#    else
  return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-#  endif
-#else
+#    endif
+#  else
  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-#endif
+#  endif
 }

 ccl_device_inline float4 floor(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return float4(_mm_floor_ps(a));
-#else
+#  else
  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-#endif
+#  endif
 }

 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
  return a + t * (b - a);
 }

+#endif /* !__KERNEL_METAL__*/
+
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN
 * Declaration.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline bool operator==(const int2 a, const int2 b);
 ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
 ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif /* !__KERNEL_METAL__ */

 /*******************************************************************************
 * Definition.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline bool operator==(const int2 a, const int2 b)
 {
  return (a.x == b.x && a.y == b.y);
@@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
 {
  return make_int2(a.x / b.x, a.y / b.y);
 }
+#endif /* !__KERNEL_METAL__ */

 CCL_NAMESPACE_END

--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN
 * Declaration.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b);
 ccl_device_inline int3 max(int3 a, int3 b);
 ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
 ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
+#endif /*  !defined(__KERNEL_METAL__) */

 /*******************************************************************************
 * Definition.
 */

+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
  return int3(_mm_min_epi32(a.m128, b.m128));
-#else
+#  else
  return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+#  endif
 }

 ccl_device_inline int3 max(int3 a, int3 b)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
  return int3(_mm_max_epi32(a.m128, b.m128));
-#else
+#  else
  return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+#  endif
 }

 ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return min(max(a, make_int3(mn)), make_int3(mx));
-#else
+#  else
  return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-#endif
+#  endif
 }

 ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return min(max(a, mn), make_int3(mx));
-#else
+#  else
  return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-#endif
+#  endif
 }

 ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)

 ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return int3(_mm_add_epi32(a.m128, b.m128));
-#else
+#  else
  return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+#  endif
 }

 ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
  return int3(_mm_sub_epi32(a.m128, b.m128));
-#else
+#  else
  return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+#  endif
 }
+#endif /* !__KERNEL_METAL__ */

 CCL_NAMESPACE_END

--- a/intern/cycles/util/math_matrix.h
+++ b/intern/cycles/util/math_matrix.h
@@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,
 {
  for (int row = 0; row < n; row++) {
    for (int col = 0; col <= row; col++) {
-      MATHS(A, row, col, 1) += v[row] * v[col] * weight;
+      atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
    }
  }
 }
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -313,7 +313,7 @@ static char *path_specials(const string &sub)
  if (env_shader_path != NULL && sub == "shader") {
    return env_shader_path;
  }
-  else if (env_shader_path != NULL && sub == "source") {
+  else if (env_source_path != NULL && sub == "source") {
    return env_source_path;
  }
  return NULL;
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -53,6 +53,15 @@ typedef struct DecomposedTransform {

 /* Functions */

+#ifdef __KERNEL_METAL__
+/* transform_point specialized for ccl_global */
+ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a)
+{
+  ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t);
+  return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
+}
+#endif
+
 ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a)
 {
  /* TODO(sergey): Disabled for now, causes crashes in certain cases. */
@@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
  tmp += w;

  return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+  ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+  return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
 #else
  float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w,
                         a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w,
@@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
  tmp = madd(shuffle<2>(aa), z, tmp);

  return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+  ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+  return (a * b).xyz;
 #else
  float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
                         a.x * t->y.x + a.y * t->y.y + a.z * t->y.z,
@@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm,
 }

 /* Interpolate from array of decomposed transforms. */
-ccl_device void transform_motion_array_interpolate(Transform *tfm,
-                                                   const DecomposedTransform *motion,
+ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
+                                                   ccl_global const DecomposedTransform *motion,
                                                   uint numsteps,
                                                   float time)
 {
@@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
  int step = min((int)(time * maxstep), maxstep - 1);
  float t = time * maxstep - step;

-  const DecomposedTransform *a = motion + step;
-  const DecomposedTransform *b = motion + step + 1;
+  ccl_global const DecomposedTransform *a = motion + step;
+  ccl_global const DecomposedTransform *b = motion + step + 1;

  /* Interpolate rotation, translation and scale. */
  DecomposedTransform decomp;
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -17,7 +17,9 @@
 #ifndef __UTIL_TYPES_H__
 #define __UTIL_TYPES_H__

-#include <stdlib.h>
+#if !defined(__KERNEL_METAL__)
+#  include <stdlib.h>
+#endif

 /* Standard Integer Types */