Cycles: several small fixes and additions for MSL
This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263
This commit is contained in:
committed by
Brecht Van Lommel
parent
c0d52db783
commit
d19e35873f
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
|
||||
swapped = false;
|
||||
for (int j = 0; j < num_hits - 1; ++j) {
|
||||
if (hits[j].t > hits[j + 1].t) {
|
||||
struct Intersection tmp_hit = hits[j];
|
||||
Intersection tmp_hit = hits[j];
|
||||
float3 tmp_Ng = Ng[j];
|
||||
hits[j] = hits[j + 1];
|
||||
Ng[j] = Ng[j + 1];
|
||||
|
||||
@@ -86,7 +86,6 @@ typedef unsigned long long uint64_t;
|
||||
#define ccl_gpu_syncthreads() __syncthreads()
|
||||
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
|
||||
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
|
||||
#define ccl_gpu_popc(x) __popc(x)
|
||||
|
||||
/* GPU texture objects */
|
||||
|
||||
|
||||
@@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
|
||||
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
|
||||
if (lane_id == 0) {
|
||||
atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
|
||||
atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
const auto can_split_mask = ccl_gpu_ballot(can_split);
|
||||
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
|
||||
if (lane_id == 0) {
|
||||
atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
|
||||
atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
|
||||
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
|
||||
|
||||
/* For each thread within a warp compute how many other active states precede it. */
|
||||
const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
|
||||
ccl_gpu_thread_mask(thread_warp));
|
||||
const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
|
||||
ccl_gpu_thread_mask(thread_warp));
|
||||
|
||||
/* Last thread in warp stores number of active states for each warp. */
|
||||
if (thread_warp == ccl_gpu_warp_size - 1) {
|
||||
|
||||
@@ -85,7 +85,6 @@ typedef unsigned long long uint64_t;
|
||||
#define ccl_gpu_syncthreads() __syncthreads()
|
||||
#define ccl_gpu_ballot(predicate) __ballot(predicate)
|
||||
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
|
||||
#define ccl_gpu_popc(x) __popc(x)
|
||||
|
||||
/* GPU texture objects */
|
||||
typedef hipTextureObject_t ccl_gpu_tex_object;
|
||||
|
||||
@@ -34,6 +34,7 @@ using namespace metal;
|
||||
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wsign-compare"
|
||||
#pragma clang diagnostic ignored "-Wuninitialized"
|
||||
|
||||
/* Qualifiers */
|
||||
|
||||
@@ -65,7 +66,7 @@ using namespace metal;
|
||||
#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
|
||||
|
||||
#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
|
||||
#define ccl_gpu_popc(x) popcount(x)
|
||||
#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// clang-format off
|
||||
|
||||
@@ -124,7 +125,6 @@ kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
|
||||
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
|
||||
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
|
||||
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
|
||||
INIT_DEBUG_BUFFER \
|
||||
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
|
||||
} \
|
||||
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
|
||||
@@ -230,6 +230,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
|
||||
#define sinhf(x) sinh(float(x))
|
||||
#define coshf(x) cosh(float(x))
|
||||
#define tanhf(x) tanh(float(x))
|
||||
#define saturatef(x) saturate(float(x))
|
||||
|
||||
/* Use native functions with possibly lower precision for performance,
|
||||
* no issues found so far. */
|
||||
@@ -243,6 +244,8 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
|
||||
|
||||
#define NULL 0
|
||||
|
||||
#define __device__
|
||||
|
||||
/* texture bindings and sampler setup */
|
||||
|
||||
struct Texture2DParamsMetal {
|
||||
@@ -257,6 +260,9 @@ struct MetalAncillaries {
|
||||
device Texture3DParamsMetal *textures_3d;
|
||||
};
|
||||
|
||||
#include "util/half.h"
|
||||
#include "util/types.h"
|
||||
|
||||
enum SamplerType {
|
||||
SamplerFilterNearest_AddressRepeat,
|
||||
SamplerFilterNearest_AddressClampEdge,
|
||||
|
||||
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
typedef struct KernelParamsMetal {
|
||||
|
||||
#define KERNEL_TEX(type, name) ccl_constant type *name;
|
||||
#define KERNEL_TEX(type, name) ccl_global const type *name;
|
||||
#include "kernel/textures.h"
|
||||
#undef KERNEL_TEX
|
||||
|
||||
|
||||
@@ -87,7 +87,6 @@ typedef unsigned long long uint64_t;
|
||||
#define ccl_gpu_syncthreads() __syncthreads()
|
||||
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
|
||||
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
|
||||
#define ccl_gpu_popc(x) __popc(x)
|
||||
|
||||
/* GPU texture objects */
|
||||
|
||||
|
||||
@@ -160,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
|
||||
|
||||
ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
|
||||
|
||||
return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
|
||||
return atomic_fetch_and_add_uint32(
|
||||
(ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
|
||||
sample_offset;
|
||||
}
|
||||
|
||||
|
||||
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
|
||||
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
|
||||
* based while for SVM we use integer ids. */
|
||||
|
||||
ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
|
||||
/* Patch index for triangle, -1 if not subdivision triangle */
|
||||
|
||||
ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
|
||||
{
|
||||
return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
|
||||
}
|
||||
|
||||
ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
|
||||
{
|
||||
|
||||
@@ -20,13 +20,6 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Patch index for triangle, -1 if not subdivision triangle */
|
||||
|
||||
ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
|
||||
{
|
||||
return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
|
||||
}
|
||||
|
||||
/* UV coords of triangle within patch */
|
||||
|
||||
ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
|
||||
|
||||
@@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Linear Congruential Generator */
|
||||
|
||||
ccl_device uint lcg_step_uint(uint *rng)
|
||||
/* This is templated to handle multiple address spaces on Metal. */
|
||||
template<class T> ccl_device uint lcg_step_uint(T rng)
|
||||
{
|
||||
/* implicit mod 2^32 */
|
||||
*rng = (1103515245 * (*rng) + 12345);
|
||||
return *rng;
|
||||
}
|
||||
|
||||
ccl_device float lcg_step_float(uint *rng)
|
||||
/* This is templated to handle multiple address spaces on Metal. */
|
||||
template<class T> ccl_device float lcg_step_float(T rng)
|
||||
{
|
||||
/* implicit mod 2^32 */
|
||||
*rng = (1103515245 * (*rng) + 12345);
|
||||
|
||||
@@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
|
||||
/* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
|
||||
* We can use this to get divide sample sequence into two classes for easier variance
|
||||
* estimation. */
|
||||
#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
|
||||
return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
|
||||
#elif defined(__NVCC__)
|
||||
return __popc(sample & 0xaaaaaaaa) & 1;
|
||||
#else
|
||||
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
|
||||
int i = sample & 0xaaaaaaaa;
|
||||
i = i - ((i >> 1) & 0x55555555);
|
||||
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
|
||||
i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
|
||||
return i & 1;
|
||||
#endif
|
||||
return popcount(uint(sample) & 0xaaaaaaaa) & 1;
|
||||
}
|
||||
else {
|
||||
/* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
|
||||
|
||||
@@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN
|
||||
template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
|
||||
ccl_device void svm_eval_nodes(KernelGlobals kg,
|
||||
ConstIntegratorGenericState state,
|
||||
ShaderData *sd,
|
||||
ccl_private ShaderData *sd,
|
||||
ccl_global float *render_buffer,
|
||||
uint32_t path_flag)
|
||||
{
|
||||
|
||||
@@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
|
||||
|
||||
# endif /* __KERNEL_CUDA__ */
|
||||
|
||||
# ifdef __KERNEL_METAL__
|
||||
|
||||
// global address space versions
|
||||
ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
|
||||
const float operand)
|
||||
{
|
||||
volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
|
||||
union {
|
||||
int int_value;
|
||||
float float_value;
|
||||
} new_value, prev_value;
|
||||
prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
|
||||
do {
|
||||
new_value.float_value = prev_value.float_value + operand;
|
||||
} while (!atomic_compare_exchange_weak_explicit(source,
|
||||
&prev_value.int_value,
|
||||
new_value.int_value,
|
||||
memory_order_relaxed,
|
||||
memory_order_relaxed));
|
||||
|
||||
return new_value.float_value;
|
||||
}
|
||||
|
||||
# define atomic_fetch_and_add_uint32(p, x) \
|
||||
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
|
||||
# define atomic_fetch_and_sub_uint32(p, x) \
|
||||
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
|
||||
# define atomic_fetch_and_inc_uint32(p) \
|
||||
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
|
||||
# define atomic_fetch_and_dec_uint32(p) \
|
||||
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
|
||||
# define atomic_fetch_and_or_uint32(p, x) \
|
||||
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
|
||||
|
||||
ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
|
||||
const float old_val,
|
||||
const float new_val)
|
||||
{
|
||||
int prev_value;
|
||||
prev_value = __float_as_int(old_val);
|
||||
atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest,
|
||||
&prev_value,
|
||||
__float_as_int(new_val),
|
||||
memory_order_relaxed,
|
||||
memory_order_relaxed);
|
||||
return __int_as_float(prev_value);
|
||||
}
|
||||
|
||||
# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
|
||||
# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
|
||||
|
||||
# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
|
||||
# define ccl_barrier(flags) threadgroup_barrier(flags)
|
||||
|
||||
# endif /* __KERNEL_METAL__ */
|
||||
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
|
||||
#endif /* __UTIL_ATOMIC_H__ */
|
||||
|
||||
@@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false)
|
||||
reset();
|
||||
}
|
||||
|
||||
DebugFlags::Metal::Metal() : adaptive_compile(false)
|
||||
{
|
||||
reset();
|
||||
}
|
||||
|
||||
void DebugFlags::CUDA::reset()
|
||||
{
|
||||
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
|
||||
@@ -76,6 +81,12 @@ void DebugFlags::HIP::reset()
|
||||
adaptive_compile = true;
|
||||
}
|
||||
|
||||
void DebugFlags::Metal::reset()
|
||||
{
|
||||
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
|
||||
adaptive_compile = true;
|
||||
}
|
||||
|
||||
DebugFlags::OptiX::OptiX()
|
||||
{
|
||||
reset();
|
||||
@@ -97,6 +108,7 @@ void DebugFlags::reset()
|
||||
cpu.reset();
|
||||
cuda.reset();
|
||||
optix.reset();
|
||||
metal.reset();
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
@@ -116,6 +116,17 @@ class DebugFlags {
|
||||
bool use_debug;
|
||||
};
|
||||
|
||||
/* Descriptor of Metal feature-set to be used. */
|
||||
struct Metal {
|
||||
Metal();
|
||||
|
||||
/* Reset flags to their defaults. */
|
||||
void reset();
|
||||
|
||||
/* Whether adaptive feature based runtime compile is enabled or not.*/
|
||||
bool adaptive_compile;
|
||||
};
|
||||
|
||||
/* Get instance of debug flags registry. */
|
||||
static DebugFlags &get()
|
||||
{
|
||||
@@ -138,6 +149,9 @@ class DebugFlags {
|
||||
/* Requested HIP flags. */
|
||||
HIP hip;
|
||||
|
||||
/* Requested Metal flags. */
|
||||
Metal metal;
|
||||
|
||||
private:
|
||||
DebugFlags();
|
||||
|
||||
|
||||
@@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Half Floats */
|
||||
|
||||
#if defined(__KERNEL_METAL__)
|
||||
|
||||
ccl_device_inline float half_to_float(half h_in)
|
||||
{
|
||||
float f;
|
||||
union {
|
||||
half h;
|
||||
uint16_t s;
|
||||
} val;
|
||||
val.h = h_in;
|
||||
|
||||
*((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) |
|
||||
((val.s & 0x03FF) << 13);
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* CUDA has its own half data type, no need to define then */
|
||||
#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
|
||||
# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
|
||||
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
|
||||
* unsigned shorts. */
|
||||
class half {
|
||||
@@ -53,11 +72,12 @@ class half {
|
||||
private:
|
||||
unsigned short v;
|
||||
};
|
||||
#endif
|
||||
# endif
|
||||
|
||||
struct half4 {
|
||||
half x, y, z, w;
|
||||
};
|
||||
#endif
|
||||
|
||||
/* Conversion to/from half float for image textures
|
||||
*
|
||||
@@ -66,7 +86,9 @@ struct half4 {
|
||||
|
||||
ccl_device_inline half float_to_half_image(float f)
|
||||
{
|
||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
#if defined(__KERNEL_METAL__)
|
||||
return half(f);
|
||||
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
return __float2half(f);
|
||||
#else
|
||||
const uint u = __float_as_uint(f);
|
||||
@@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f)
|
||||
|
||||
ccl_device_inline float half_to_float_image(half h)
|
||||
{
|
||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
#if defined(__KERNEL_METAL__)
|
||||
return half_to_float(h);
|
||||
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
return __half2float(h);
|
||||
#else
|
||||
const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
|
||||
@@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
|
||||
|
||||
ccl_device_inline half float_to_half_display(const float f)
|
||||
{
|
||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
#if defined(__KERNEL_METAL__)
|
||||
return half(f);
|
||||
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
return __float2half(f);
|
||||
#else
|
||||
const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
|
||||
|
||||
@@ -30,9 +30,11 @@
|
||||
# include <hip/hip_vector_types.h>
|
||||
#endif
|
||||
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
# include <float.h>
|
||||
# include <math.h>
|
||||
# include <stdio.h>
|
||||
#endif /* !defined(__KERNEL_METAL__) */
|
||||
|
||||
#include "util/types.h"
|
||||
|
||||
@@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d)
|
||||
return max(max(a, b), max(c, d));
|
||||
}
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
/* Int/Float conversion */
|
||||
|
||||
ccl_device_inline int as_int(uint i)
|
||||
@@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f)
|
||||
return u.i;
|
||||
}
|
||||
|
||||
#ifndef __HIP__
|
||||
# ifndef __HIP__
|
||||
ccl_device_inline int __float_as_int(float f)
|
||||
{
|
||||
union {
|
||||
@@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i)
|
||||
u.i = i;
|
||||
return u.f;
|
||||
}
|
||||
#endif
|
||||
# endif
|
||||
|
||||
ccl_device_inline int4 __float4_as_int4(float4 f)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return int4(_mm_castps_si128(f.m128));
|
||||
#else
|
||||
# else
|
||||
return make_int4(
|
||||
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 __int4_as_float4(int4 i)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_castsi128_ps(i.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(
|
||||
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
#endif /* !defined(__KERNEL_METAL__) */
|
||||
|
||||
#if defined(__KERNEL_METAL__)
|
||||
# define isnan_safe(v) isnan(v)
|
||||
# define isfinite_safe(v) isfinite(v)
|
||||
#else
|
||||
template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
|
||||
{
|
||||
return ((uint64_t)ptr) & 0xFFFFFFFF;
|
||||
@@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f)
|
||||
unsigned int x = __float_as_uint(f);
|
||||
return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
|
||||
}
|
||||
#endif
|
||||
|
||||
ccl_device_inline float ensure_finite(float v)
|
||||
{
|
||||
return isfinite_safe(v) ? v : 0.0f;
|
||||
}
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline int clamp(int a, int mn, int mx)
|
||||
{
|
||||
return min(max(a, mn), mx);
|
||||
@@ -346,16 +356,18 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_CUDA__
|
||||
ccl_device_inline float saturatef(float a)
|
||||
{
|
||||
return clamp(a, 0.0f, 1.0f);
|
||||
}
|
||||
#else
|
||||
#endif /* !defined(__KERNEL_METAL__) */
|
||||
|
||||
#if defined(__KERNEL_CUDA__)
|
||||
ccl_device_inline float saturatef(float a)
|
||||
{
|
||||
return __saturatef(a);
|
||||
}
|
||||
#elif !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float saturatef(float a)
|
||||
{
|
||||
return clamp(a, 0.0f, 1.0f);
|
||||
}
|
||||
#endif /* __KERNEL_CUDA__ */
|
||||
|
||||
ccl_device_inline int float_to_int(float f)
|
||||
@@ -491,6 +503,7 @@ CCL_NAMESPACE_END
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
/* Interpolation */
|
||||
|
||||
template<class A, class B> A lerp(const A &a, const A &b, const B &t)
|
||||
@@ -498,6 +511,8 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
|
||||
return (A)(a * ((B)1 - t) + b * t);
|
||||
}
|
||||
|
||||
#endif /* __KERNEL_METAL__ */
|
||||
|
||||
/* Triangle */
|
||||
|
||||
ccl_device_inline float triangle_area(ccl_private const float3 &v1,
|
||||
@@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f)
|
||||
|
||||
ccl_device_inline float inversesqrtf(float f)
|
||||
{
|
||||
#if defined(__KERNEL_METAL__)
|
||||
return (f > 0.0f) ? rsqrt(f) : 0.0f;
|
||||
#else
|
||||
return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device float safe_asinf(float a)
|
||||
@@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits)
|
||||
return bits * (1.0f / (float)0xFFFFFFFF);
|
||||
}
|
||||
|
||||
#if !defined(__KERNEL_GPU__)
|
||||
# if defined(__GNUC__)
|
||||
# define popcount(x) __builtin_popcount(x)
|
||||
# else
|
||||
ccl_device_inline uint popcount(uint x)
|
||||
{
|
||||
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
|
||||
uint i = x & 0xaaaaaaaa;
|
||||
i = i - ((i >> 1) & 0x55555555);
|
||||
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
|
||||
i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
|
||||
return i & 1;
|
||||
}
|
||||
# endif
|
||||
#elif !defined(__KERNEL_METAL__)
|
||||
# define popcount(x) __popc(x)
|
||||
#endif
|
||||
|
||||
ccl_device_inline uint count_leading_zeros(uint x)
|
||||
{
|
||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
|
||||
return __clz(x);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return clz(x);
|
||||
#else
|
||||
assert(x != 0);
|
||||
# ifdef _MSC_VER
|
||||
@@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
|
||||
{
|
||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
|
||||
return (__ffs(x) - 1);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return ctz(x);
|
||||
#else
|
||||
assert(x != 0);
|
||||
# ifdef _MSC_VER
|
||||
@@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x)
|
||||
{
|
||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
|
||||
return __ffs(x);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return (x != 0) ? ctz(x) + 1 : 0;
|
||||
#else
|
||||
# ifdef _MSC_VER
|
||||
return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
|
||||
@@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
|
||||
return x;
|
||||
#elif defined(__KERNEL_CUDA__)
|
||||
return __brev(x);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return reverse_bits(x);
|
||||
#elif __has_builtin(__builtin_bitreverse32)
|
||||
return __builtin_bitreverse32(x);
|
||||
#else
|
||||
|
||||
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
|
||||
* Declaration.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float2 operator-(const float2 &a);
|
||||
ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
|
||||
ccl_device_inline float2 operator*(const float2 &a, float f);
|
||||
@@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a);
|
||||
ccl_device_inline float2 as_float2(const float4 &a);
|
||||
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
|
||||
ccl_device_inline float2 floor(const float2 &a);
|
||||
#endif /* !__KERNEL_METAL__ */
|
||||
|
||||
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
|
||||
|
||||
@@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2()
|
||||
return make_float2(1.0f, 1.0f);
|
||||
}
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float2 operator-(const float2 &a)
|
||||
{
|
||||
return make_float2(-a.x, -a.y);
|
||||
@@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a)
|
||||
return make_float2(floorf(a.x), floorf(a.y));
|
||||
}
|
||||
|
||||
#endif /* !__KERNEL_METAL__ */
|
||||
|
||||
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
|
||||
{
|
||||
return (b != 0.0f) ? a / b : zero_float2();
|
||||
|
||||
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
|
||||
* Declaration.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float3 operator-(const float3 &a);
|
||||
ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
|
||||
ccl_device_inline float3 operator*(const float3 &a, const float f);
|
||||
@@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a);
|
||||
ccl_device_inline float3 sqrt(const float3 &a);
|
||||
ccl_device_inline float3 floor(const float3 &a);
|
||||
ccl_device_inline float3 ceil(const float3 &a);
|
||||
ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
|
||||
#endif /* !defined(__KERNEL_METAL__) */
|
||||
|
||||
ccl_device_inline float min3(float3 a);
|
||||
ccl_device_inline float max3(float3 a);
|
||||
ccl_device_inline float len(const float3 a);
|
||||
ccl_device_inline float len_squared(const float3 a);
|
||||
|
||||
ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
|
||||
ccl_device_inline float3 project(const float3 v, const float3 v_proj);
|
||||
|
||||
ccl_device_inline float3 saturate3(float3 a);
|
||||
ccl_device_inline float3 safe_normalize(const float3 a);
|
||||
ccl_device_inline float3 normalize_len(const float3 a, float *t);
|
||||
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
|
||||
ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
|
||||
ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
|
||||
ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
|
||||
ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
|
||||
ccl_device_inline float3 interp(float3 a, float3 b, float t);
|
||||
@@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3()
|
||||
return make_float3(1.0f, 1.0f, 1.0f);
|
||||
}
|
||||
|
||||
#if defined(__KERNEL_METAL__)
|
||||
|
||||
ccl_device_inline float3 rcp(float3 a)
|
||||
{
|
||||
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
ccl_device_inline float3 operator-(const float3 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
|
||||
#else
|
||||
# else
|
||||
return make_float3(-a.x, -a.y, -a.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_mul_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*(const float3 &a, const float f)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
|
||||
#else
|
||||
# else
|
||||
return make_float3(a.x * f, a.y * f, a.z * f);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*(const float f, const float3 &a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE__)
|
||||
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(a.x * f, a.y * f, a.z * f);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator/(const float f, const float3 &a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE__)
|
||||
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(f / a.x, f / a.y, f / a.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator/(const float3 &a, const float f)
|
||||
@@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
|
||||
|
||||
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE__)
|
||||
return float3(_mm_div_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator+(const float3 &a, const float f)
|
||||
@@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
|
||||
|
||||
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_add_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator-(const float3 &a, const float f)
|
||||
@@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
|
||||
|
||||
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_sub_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
|
||||
@@ -250,11 +261,11 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
|
||||
|
||||
ccl_device_inline bool operator==(const float3 &a, const float3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
|
||||
#else
|
||||
# else
|
||||
return (a.x == b.x && a.y == b.y && a.z == b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
|
||||
@@ -269,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
|
||||
|
||||
ccl_device_inline float dot(const float3 &a, const float3 &b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
|
||||
#else
|
||||
# else
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
|
||||
#else
|
||||
# else
|
||||
return a.x * b.x + a.y * b.y;
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
|
||||
@@ -293,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
|
||||
|
||||
ccl_device_inline float3 normalize(const float3 &a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
|
||||
return float3(_mm_div_ps(a.m128, norm));
|
||||
#else
|
||||
# else
|
||||
return a / len(a);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 min(const float3 &a, const float3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_min_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 max(const float3 &a, const float3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_max_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
|
||||
@@ -326,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
|
||||
|
||||
ccl_device_inline float3 fabs(const float3 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_NEON__
|
||||
# ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_NEON__
|
||||
return float3(vabsq_f32(a.m128));
|
||||
# else
|
||||
# else
|
||||
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
|
||||
return float3(_mm_and_ps(a.m128, mask));
|
||||
# endif
|
||||
#else
|
||||
# endif
|
||||
# else
|
||||
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 sqrt(const float3 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_sqrt_ps(a));
|
||||
#else
|
||||
# else
|
||||
return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 floor(const float3 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_floor_ps(a));
|
||||
#else
|
||||
# else
|
||||
return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 ceil(const float3 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float3(_mm_ceil_ps(a));
|
||||
#else
|
||||
# else
|
||||
return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
|
||||
@@ -372,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
|
||||
|
||||
ccl_device_inline float3 rcp(const float3 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
/* Don't use _mm_rcp_ps due to poor precision. */
|
||||
return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
#endif /* !__KERNEL_METAL__ */
|
||||
|
||||
ccl_device_inline float min3(float3 a)
|
||||
{
|
||||
@@ -404,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a)
|
||||
return dot(a, a);
|
||||
}
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
|
||||
{
|
||||
float3 unit_normal = normalize(normal);
|
||||
@@ -425,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector,
|
||||
{
|
||||
return (dot(reference, incident) < 0.0f) ? vector : -vector;
|
||||
}
|
||||
#endif
|
||||
|
||||
ccl_device_inline float3 project(const float3 v, const float3 v_proj)
|
||||
{
|
||||
@@ -505,7 +519,11 @@ ccl_device_inline float average(const float3 a)
|
||||
|
||||
ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
|
||||
{
|
||||
#if defined(__KERNEL_METAL__)
|
||||
return all(a == b);
|
||||
#else
|
||||
return a == b;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 pow3(float3 v, float e)
|
||||
|
||||
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
|
||||
* Declaration.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float4 operator-(const float4 &a);
|
||||
ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
|
||||
ccl_device_inline float4 operator*(const float4 &a, float f);
|
||||
@@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
|
||||
ccl_device_inline float4 fabs(const float4 &a);
|
||||
ccl_device_inline float4 floor(const float4 &a);
|
||||
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
|
||||
#endif /* !__KERNEL_METAL__*/
|
||||
|
||||
ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
|
||||
|
||||
@@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4()
|
||||
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
|
||||
}
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline float4 operator-(const float4 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
|
||||
return float4(_mm_xor_ps(a.m128, mask));
|
||||
#else
|
||||
# else
|
||||
return make_float4(-a.x, -a.y, -a.z, -a.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_mul_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 operator*(const float4 &a, float f)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_SSE__)
|
||||
return a * make_float4(f);
|
||||
#else
|
||||
# else
|
||||
return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 operator*(float f, const float4 &a)
|
||||
@@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
|
||||
|
||||
ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_div_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 operator+(const float4 &a, const float f)
|
||||
@@ -164,11 +167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
|
||||
|
||||
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_add_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 operator-(const float4 &a, const float f)
|
||||
@@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
|
||||
|
||||
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_sub_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
|
||||
@@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
|
||||
|
||||
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
|
||||
#else
|
||||
# else
|
||||
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
|
||||
#else
|
||||
# else
|
||||
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
|
||||
#else
|
||||
# else
|
||||
return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
|
||||
#else
|
||||
# else
|
||||
return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float distance(const float4 &a, const float4 &b)
|
||||
@@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
|
||||
|
||||
ccl_device_inline float dot(const float4 &a, const float4 &b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
__m128 t = vmulq_f32(a, b);
|
||||
return vaddvq_f32(t);
|
||||
# else
|
||||
# else
|
||||
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
|
||||
# endif
|
||||
#else
|
||||
# endif
|
||||
# else
|
||||
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float len_squared(const float4 &a)
|
||||
@@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a)
|
||||
|
||||
ccl_device_inline float4 rcp(const float4 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
/* Don't use _mm_rcp_ps due to poor precision. */
|
||||
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 sqrt(const float4 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_sqrt_ps(a.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 sqr(const float4 &a)
|
||||
@@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a)
|
||||
|
||||
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
|
||||
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
|
||||
#else
|
||||
# else
|
||||
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline bool is_zero(const float4 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return a == make_float4(0.0f);
|
||||
#else
|
||||
# else
|
||||
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 reduce_add(const float4 &a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
# if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(vdupq_n_f32(vaddvq_f32(a)));
|
||||
# elif defined(__KERNEL_SSE3__)
|
||||
# elif defined(__KERNEL_SSE3__)
|
||||
float4 h(_mm_hadd_ps(a.m128, a.m128));
|
||||
return float4(_mm_hadd_ps(h.m128, h.m128));
|
||||
# else
|
||||
# else
|
||||
float4 h(shuffle<1, 0, 3, 2>(a) + a);
|
||||
return shuffle<2, 3, 0, 1>(h) + h;
|
||||
# endif
|
||||
#else
|
||||
# endif
|
||||
# else
|
||||
float sum = (a.x + a.y) + (a.z + a.w);
|
||||
return make_float4(sum, sum, sum, sum);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float average(const float4 &a)
|
||||
@@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
|
||||
|
||||
ccl_device_inline float4 min(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_min_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 max(const float4 &a, const float4 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_max_ps(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
|
||||
@@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
|
||||
|
||||
ccl_device_inline float4 fabs(const float4 &a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
# if defined(__KERNEL_SSE__)
|
||||
# if defined(__KERNEL_NEON__)
|
||||
return float4(vabsq_f32(a));
|
||||
# else
|
||||
# else
|
||||
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
|
||||
# endif
|
||||
#else
|
||||
# endif
|
||||
# else
|
||||
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 floor(const float4 &a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return float4(_mm_floor_ps(a));
|
||||
#else
|
||||
# else
|
||||
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
|
||||
@@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
|
||||
return a + t * (b - a);
|
||||
}
|
||||
|
||||
#endif /* !__KERNEL_METAL__*/
|
||||
|
||||
#ifdef __KERNEL_SSE__
|
||||
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
|
||||
__forceinline const float4 shuffle(const float4 &b)
|
||||
|
||||
@@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN
|
||||
* Declaration.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline bool operator==(const int2 a, const int2 b);
|
||||
ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
|
||||
ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
|
||||
ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
|
||||
ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
|
||||
ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
|
||||
#endif /* !__KERNEL_METAL__ */
|
||||
|
||||
/*******************************************************************************
|
||||
* Definition.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline bool operator==(const int2 a, const int2 b)
|
||||
{
|
||||
return (a.x == b.x && a.y == b.y);
|
||||
@@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
|
||||
{
|
||||
return make_int2(a.x / b.x, a.y / b.y);
|
||||
}
|
||||
#endif /* !__KERNEL_METAL__ */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
||||
@@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN
|
||||
* Declaration.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline int3 min(int3 a, int3 b);
|
||||
ccl_device_inline int3 max(int3 a, int3 b);
|
||||
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
|
||||
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
|
||||
#endif /* !defined(__KERNEL_METAL__) */
|
||||
|
||||
/*******************************************************************************
|
||||
* Definition.
|
||||
*/
|
||||
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
ccl_device_inline int3 min(int3 a, int3 b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
|
||||
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
|
||||
return int3(_mm_min_epi32(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline int3 max(int3 a, int3 b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
|
||||
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
|
||||
return int3(_mm_max_epi32(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return min(max(a, make_int3(mn)), make_int3(mx));
|
||||
#else
|
||||
# else
|
||||
return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return min(max(a, mn), make_int3(mx));
|
||||
#else
|
||||
# else
|
||||
return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator==(const int3 &a, const int3 &b)
|
||||
@@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
|
||||
|
||||
ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return int3(_mm_add_epi32(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
# ifdef __KERNEL_SSE__
|
||||
return int3(_mm_sub_epi32(a.m128, b.m128));
|
||||
#else
|
||||
# else
|
||||
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
#endif /* !__KERNEL_METAL__ */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
||||
@@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,
|
||||
{
|
||||
for (int row = 0; row < n; row++) {
|
||||
for (int col = 0; col <= row; col++) {
|
||||
MATHS(A, row, col, 1) += v[row] * v[col] * weight;
|
||||
atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -313,7 +313,7 @@ static char *path_specials(const string &sub)
|
||||
if (env_shader_path != NULL && sub == "shader") {
|
||||
return env_shader_path;
|
||||
}
|
||||
else if (env_shader_path != NULL && sub == "source") {
|
||||
else if (env_source_path != NULL && sub == "source") {
|
||||
return env_source_path;
|
||||
}
|
||||
return NULL;
|
||||
|
||||
@@ -53,6 +53,15 @@ typedef struct DecomposedTransform {
|
||||
|
||||
/* Functions */
|
||||
|
||||
#ifdef __KERNEL_METAL__
|
||||
/* transform_point specialized for ccl_global */
|
||||
ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a)
|
||||
{
|
||||
ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t);
|
||||
return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
|
||||
}
|
||||
#endif
|
||||
|
||||
ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a)
|
||||
{
|
||||
/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
|
||||
@@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
|
||||
tmp += w;
|
||||
|
||||
return float3(tmp.m128);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
|
||||
return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
|
||||
#else
|
||||
float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w,
|
||||
a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w,
|
||||
@@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
|
||||
tmp = madd(shuffle<2>(aa), z, tmp);
|
||||
|
||||
return float3(tmp.m128);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
|
||||
return (a * b).xyz;
|
||||
#else
|
||||
float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
|
||||
a.x * t->y.x + a.y * t->y.y + a.z * t->y.z,
|
||||
@@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm,
|
||||
}
|
||||
|
||||
/* Interpolate from array of decomposed transforms. */
|
||||
ccl_device void transform_motion_array_interpolate(Transform *tfm,
|
||||
const DecomposedTransform *motion,
|
||||
ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
|
||||
ccl_global const DecomposedTransform *motion,
|
||||
uint numsteps,
|
||||
float time)
|
||||
{
|
||||
@@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
|
||||
int step = min((int)(time * maxstep), maxstep - 1);
|
||||
float t = time * maxstep - step;
|
||||
|
||||
const DecomposedTransform *a = motion + step;
|
||||
const DecomposedTransform *b = motion + step + 1;
|
||||
ccl_global const DecomposedTransform *a = motion + step;
|
||||
ccl_global const DecomposedTransform *b = motion + step + 1;
|
||||
|
||||
/* Interpolate rotation, translation and scale. */
|
||||
DecomposedTransform decomp;
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
#ifndef __UTIL_TYPES_H__
|
||||
#define __UTIL_TYPES_H__
|
||||
|
||||
#include <stdlib.h>
|
||||
#if !defined(__KERNEL_METAL__)
|
||||
# include <stdlib.h>
|
||||
#endif
|
||||
|
||||
/* Standard Integer Types */
|
||||
|
||||
|
||||
Reference in New Issue
Block a user