diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index 4fc60703ba5..3fb2f4e3ad6 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -25,8 +25,21 @@ static size_t estimate_single_state_size(const uint kernel_features) #define KERNEL_STRUCT_BEGIN(name) \ for (int array_index = 0;; array_index++) { -#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ - state_size += (kernel_features & (feature)) ? sizeof(type) : 0; + +#ifdef __INTEGRATOR_GPU_PACKED_STATE__ +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + state_size += (kernel_features & (feature)) ? sizeof(type) : 0; +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \ + KERNEL_STRUCT_BEGIN(parent_struct) \ + KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature) +#else +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + state_size += (kernel_features & (feature)) ? sizeof(type) : 0; +# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct) +#endif + #define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ state_size += (kernel_features & (feature)) ? sizeof(type) : 0; #define KERNEL_STRUCT_END(name) \ @@ -50,7 +63,9 @@ static size_t estimate_single_state_size(const uint kernel_features) #include "kernel/integrator/shadow_state_template.h" #undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_BEGIN_PACKED #undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_MEMBER_PACKED #undef KERNEL_STRUCT_ARRAY_MEMBER #undef KERNEL_STRUCT_END #undef KERNEL_STRUCT_END_ARRAY @@ -128,12 +143,31 @@ void PathTraceWorkGPU::alloc_integrator_soa() for (int array_index = 0;; array_index++) { #define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \ - string name_str = string_printf("%sintegrator_state_" #name, shadow ? "shadow_" : ""); \ + string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \ + shadow ? "shadow_" : ""); \ device_only_memory *array = new device_only_memory(device_, name_str.c_str()); \ array->alloc_to_device(max_num_paths_); \ integrator_state_soa_.emplace_back(array); \ - integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \ + memcpy(&integrator_state_gpu_.parent_struct.name, \ + &array->device_pointer, \ + sizeof(array->device_pointer)); \ } +#ifdef __INTEGRATOR_GPU_PACKED_STATE__ +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \ + if ((kernel_features & (feature))) { \ + string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \ + shadow ? "shadow_" : ""); \ + VLOG_DEBUG << "Skipping " << name_str \ + << " -- data is packed inside integrator_state_" #parent_struct "_packed"; \ + } +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \ + KERNEL_STRUCT_BEGIN(parent_struct) \ + KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature) +#else +# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct) +#endif + #define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ if ((kernel_features & (feature)) && \ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \ @@ -143,7 +177,9 @@ void PathTraceWorkGPU::alloc_integrator_soa() device_only_memory *array = new device_only_memory(device_, name_str.c_str()); \ array->alloc_to_device(max_num_paths_); \ integrator_state_soa_.emplace_back(array); \ - integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \ + memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \ + &array->device_pointer, \ + sizeof(array->device_pointer)); \ } #define KERNEL_STRUCT_END(name) \ (void)array_index; \ @@ -162,7 +198,9 @@ void PathTraceWorkGPU::alloc_integrator_soa() #include "kernel/integrator/shadow_state_template.h" #undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_BEGIN_PACKED #undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_MEMBER_PACKED #undef KERNEL_STRUCT_ARRAY_MEMBER #undef KERNEL_STRUCT_END #undef KERNEL_STRUCT_END_ARRAY diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 6370dfe6aa2..011d616bdba 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -66,6 +66,8 @@ using namespace metal::raytracing; #define kernel_assert(cond) +#define offsetof(t, d) __builtin_offsetof(t, d) + #define ccl_gpu_global_id_x() metal_global_id #define ccl_gpu_warp_size simdgroup_size #define ccl_gpu_thread_idx_x simd_group_index diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h index 94ffd8a069f..cd16d27dea0 100644 --- a/intern/cycles/kernel/integrator/shadow_state_template.h +++ b/intern/cycles/kernel/integrator/shadow_state_template.h @@ -56,14 +56,14 @@ KERNEL_STRUCT_END(shadow_path) /********************************** Shadow Ray *******************************/ -KERNEL_STRUCT_BEGIN(shadow_ray) -KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, int, self_light, KERNEL_FEATURE_SHADOW_LINKING) +KERNEL_STRUCT_BEGIN_PACKED(shadow_ray, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, int, self_light, KERNEL_FEATURE_SHADOW_LINKING) KERNEL_STRUCT_END(shadow_ray) /*********************** Shadow Intersection result **************************/ diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h index 8950fd01837..b6f127ed266 100644 --- a/intern/cycles/kernel/integrator/state.h +++ b/intern/cycles/kernel/integrator/state.h @@ -47,7 +47,9 @@ CCL_NAMESPACE_BEGIN * CPU rendering path state with AoS layout. */ typedef struct IntegratorShadowStateCPU { #define KERNEL_STRUCT_BEGIN(name) struct { +#define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct { #define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name; +#define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER #define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER #define KERNEL_STRUCT_END(name) \ } \ @@ -58,7 +60,9 @@ typedef struct IntegratorShadowStateCPU { #define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE #include "kernel/integrator/shadow_state_template.h" #undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_BEGIN_PACKED #undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_MEMBER_PACKED #undef KERNEL_STRUCT_ARRAY_MEMBER #undef KERNEL_STRUCT_END #undef KERNEL_STRUCT_END_ARRAY @@ -66,7 +70,9 @@ typedef struct IntegratorShadowStateCPU { typedef struct IntegratorStateCPU { #define KERNEL_STRUCT_BEGIN(name) struct { +#define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct { #define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name; +#define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER #define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER #define KERNEL_STRUCT_END(name) \ } \ @@ -77,7 +83,9 @@ typedef struct IntegratorStateCPU { #define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE #include "kernel/integrator/state_template.h" #undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_BEGIN_PACKED #undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_MEMBER_PACKED #undef KERNEL_STRUCT_ARRAY_MEMBER #undef KERNEL_STRUCT_END #undef KERNEL_STRUCT_END_ARRAY @@ -95,12 +103,75 @@ typedef struct IntegratorQueueCounter { int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM]; } IntegratorQueueCounter; +#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__) + +/* Generate wrapper structs for all integrator state fields. This allows us to access state + * uniformly, regardless of whether it stored in a packed struct or separate arrays. */ +# define KERNEL_STRUCT_BEGIN(name) +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + struct Wrapped_##parent_struct##_##name { \ + type name; \ + }; +# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \ + KERNEL_STRUCT_BEGIN(parent_struct) \ + KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature) +# define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_END(name) +# define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) +# define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE + +# include "kernel/integrator/shadow_state_template.h" +# include "kernel/integrator/state_template.h" + +# undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_BEGIN_PACKED +# undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_MEMBER_PACKED +# undef KERNEL_STRUCT_ARRAY_MEMBER +# undef KERNEL_STRUCT_END +# undef KERNEL_STRUCT_END_ARRAY +# undef KERNEL_STRUCT_VOLUME_STACK_SIZE + +#endif + /* Integrator State GPU * * GPU rendering path state with SoA layout. */ typedef struct IntegratorStateGPU { #define KERNEL_STRUCT_BEGIN(name) struct { -#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name; + +#ifdef __INTEGRATOR_GPU_PACKED_STATE__ + +# ifdef __KERNEL_GPU__ + +/* If we've opted in to packed layouts, generate member functions that return a pointer to a + * wrapper type so we can access state using uniform syntax. */ +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + ccl_global Wrapped_##parent_struct##_##name *name; \ + ccl_device_inline ccl_global Wrapped_##parent_struct##_##name *name##_fn() ccl_constant \ + { \ + return (ccl_global Wrapped_##parent_struct##_##name *)name; \ + } +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \ + ccl_device_inline ccl_global packed_##parent_struct *name##_fn() ccl_constant \ + { \ + return (ccl_global packed_##parent_struct *)packed; \ + } +# else +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name; +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) +# endif + +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \ + KERNEL_STRUCT_BEGIN(parent_struct) \ + KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature) + +#else +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name; +# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct) +#endif #define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER #define KERNEL_STRUCT_END(name) \ } \ @@ -115,7 +186,9 @@ typedef struct IntegratorStateGPU { #include "kernel/integrator/shadow_state_template.h" #undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_BEGIN_PACKED #undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_MEMBER_PACKED #undef KERNEL_STRUCT_ARRAY_MEMBER #undef KERNEL_STRUCT_END #undef KERNEL_STRUCT_END_ARRAY @@ -178,13 +251,23 @@ typedef int ConstIntegratorShadowState; # define INTEGRATOR_STATE_NULL -1 -# define INTEGRATOR_STATE(state, nested_struct, member) \ - kernel_integrator_state.nested_struct.member[state] +# ifdef __INTEGRATOR_GPU_PACKED_STATE__ + +/* If we've opted in to packed layouts, we use the generated accessor functions (member##_fn) to + * resolve different layouts (packed vs separate). */ +# define INTEGRATOR_STATE(state, nested_struct, member) \ + kernel_integrator_state.nested_struct.member##_fn()[state].member +# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \ + kernel_integrator_state.nested_struct[array_index].member##_fn()[state].member +# else +# define INTEGRATOR_STATE(state, nested_struct, member) \ + kernel_integrator_state.nested_struct.member[state] +# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \ + kernel_integrator_state.nested_struct[array_index].member[state] +# endif + # define INTEGRATOR_STATE_WRITE(state, nested_struct, member) \ INTEGRATOR_STATE(state, nested_struct, member) - -# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \ - kernel_integrator_state.nested_struct[array_index].member[state] # define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \ INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h index 0c84416e4e5..87317842f86 100644 --- a/intern/cycles/kernel/integrator/state_template.h +++ b/intern/cycles/kernel/integrator/state_template.h @@ -67,36 +67,36 @@ KERNEL_STRUCT_END(path) /************************************** Ray ***********************************/ -KERNEL_STRUCT_BEGIN(ray) -KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_BEGIN_PACKED(ray, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, float, dD, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, previous_dt, KERNEL_FEATURE_LIGHT_TREE) KERNEL_STRUCT_END(ray) /*************************** Intersection result ******************************/ /* Result from scene intersection. */ -KERNEL_STRUCT_BEGIN(isect) -KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_BEGIN_PACKED(isect, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(isect, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(isect, float, u, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(isect, float, v, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(isect, int, prim, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(isect, int, object, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER_PACKED(isect, int, type, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_END(isect) /*************** Subsurface closure state for subsurface kernel ***************/ -KERNEL_STRUCT_BEGIN(subsurface) -KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, albedo, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, radius, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, packed_float3, N, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_BEGIN_PACKED(subsurface, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER_PACKED(subsurface, PackedSpectrum, albedo, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER_PACKED(subsurface, PackedSpectrum, radius, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER_PACKED(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER_PACKED(subsurface, packed_float3, N, KERNEL_FEATURE_SUBSURFACE) KERNEL_STRUCT_END(subsurface) /********************************** Volume Stack ******************************/ diff --git a/intern/cycles/kernel/integrator/state_util.h b/intern/cycles/kernel/integrator/state_util.h index d24d3967af1..99ffd20f296 100644 --- a/intern/cycles/kernel/integrator/state_util.h +++ b/intern/cycles/kernel/integrator/state_util.h @@ -15,6 +15,31 @@ CCL_NAMESPACE_BEGIN ccl_device_forceinline void integrator_state_write_ray(IntegratorState state, ccl_private const Ray *ccl_restrict ray) { +#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__) + static_assert(sizeof(ray->P) == sizeof(float4), "Bad assumption about float3 padding"); + /* dP and dP are packed based on the assumption that float3 is padded to 16 bytes. + * This assumption hold trues on Metal, but not CUDA. + */ + ((ccl_private float4 &)ray->P).w = ray->dP; + ((ccl_private float4 &)ray->D).w = ray->dD; + INTEGRATOR_STATE_WRITE(state, ray, packed) = (ccl_private packed_ray &)*ray; + + /* Ensure that we can correctly cast between Ray and the generated packed_ray struct. */ + static_assert(offsetof(packed_ray, P) == offsetof(Ray, P), + "Generated packed_ray struct is misaligned with Ray struct"); + static_assert(offsetof(packed_ray, D) == offsetof(Ray, D), + "Generated packed_ray struct is misaligned with Ray struct"); + static_assert(offsetof(packed_ray, tmin) == offsetof(Ray, tmin), + "Generated packed_ray struct is misaligned with Ray struct"); + static_assert(offsetof(packed_ray, tmax) == offsetof(Ray, tmax), + "Generated packed_ray struct is misaligned with Ray struct"); + static_assert(offsetof(packed_ray, time) == offsetof(Ray, time), + "Generated packed_ray struct is misaligned with Ray struct"); + static_assert(offsetof(packed_ray, dP) == 12 + offsetof(Ray, P), + "Generated packed_ray struct is misaligned with Ray struct"); + static_assert(offsetof(packed_ray, dD) == 12 + offsetof(Ray, D), + "Generated packed_ray struct is misaligned with Ray struct"); +#else INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P; INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D; INTEGRATOR_STATE_WRITE(state, ray, tmin) = ray->tmin; @@ -22,11 +47,17 @@ ccl_device_forceinline void integrator_state_write_ray(IntegratorState state, INTEGRATOR_STATE_WRITE(state, ray, time) = ray->time; INTEGRATOR_STATE_WRITE(state, ray, dP) = ray->dP; INTEGRATOR_STATE_WRITE(state, ray, dD) = ray->dD; +#endif } ccl_device_forceinline void integrator_state_read_ray(ConstIntegratorState state, ccl_private Ray *ccl_restrict ray) { +#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__) + *((ccl_private packed_ray *)ray) = INTEGRATOR_STATE(state, ray, packed); + ray->dP = ((ccl_private float4 &)ray->P).w; + ray->dD = ((ccl_private float4 &)ray->D).w; +#else ray->P = INTEGRATOR_STATE(state, ray, P); ray->D = INTEGRATOR_STATE(state, ray, D); ray->tmin = INTEGRATOR_STATE(state, ray, tmin); @@ -34,6 +65,7 @@ ccl_device_forceinline void integrator_state_read_ray(ConstIntegratorState state ray->time = INTEGRATOR_STATE(state, ray, time); ray->dP = INTEGRATOR_STATE(state, ray, dP); ray->dD = INTEGRATOR_STATE(state, ray, dD); +#endif } /* Shadow Ray */ @@ -96,23 +128,46 @@ ccl_device_forceinline void integrator_state_read_shadow_ray_self( ccl_device_forceinline void integrator_state_write_isect( IntegratorState state, ccl_private const Intersection *ccl_restrict isect) { +#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__) + INTEGRATOR_STATE_WRITE(state, isect, packed) = (ccl_private packed_isect &)*isect; + + /* Ensure that we can correctly cast between Intersection and the generated packed_isect struct. + */ + static_assert(offsetof(packed_isect, t) == offsetof(Intersection, t), + "Generated packed_isect struct is misaligned with Intersection struct"); + static_assert(offsetof(packed_isect, u) == offsetof(Intersection, u), + "Generated packed_isect struct is misaligned with Intersection struct"); + static_assert(offsetof(packed_isect, v) == offsetof(Intersection, v), + "Generated packed_isect struct is misaligned with Intersection struct"); + static_assert(offsetof(packed_isect, object) == offsetof(Intersection, object), + "Generated packed_isect struct is misaligned with Intersection struct"); + static_assert(offsetof(packed_isect, prim) == offsetof(Intersection, prim), + "Generated packed_isect struct is misaligned with Intersection struct"); + static_assert(offsetof(packed_isect, type) == offsetof(Intersection, type), + "Generated packed_isect struct is misaligned with Intersection struct"); +#else INTEGRATOR_STATE_WRITE(state, isect, t) = isect->t; INTEGRATOR_STATE_WRITE(state, isect, u) = isect->u; INTEGRATOR_STATE_WRITE(state, isect, v) = isect->v; INTEGRATOR_STATE_WRITE(state, isect, object) = isect->object; INTEGRATOR_STATE_WRITE(state, isect, prim) = isect->prim; INTEGRATOR_STATE_WRITE(state, isect, type) = isect->type; +#endif } ccl_device_forceinline void integrator_state_read_isect( ConstIntegratorState state, ccl_private Intersection *ccl_restrict isect) { +#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__) + *((ccl_private packed_isect *)isect) = INTEGRATOR_STATE(state, isect, packed); +#else isect->prim = INTEGRATOR_STATE(state, isect, prim); isect->object = INTEGRATOR_STATE(state, isect, object); isect->type = INTEGRATOR_STATE(state, isect, type); isect->u = INTEGRATOR_STATE(state, isect, u); isect->v = INTEGRATOR_STATE(state, isect, v); isect->t = INTEGRATOR_STATE(state, isect, t); +#endif } #ifdef __VOLUME__ @@ -250,6 +305,16 @@ ccl_device_inline void integrator_state_copy_only(KernelGlobals kg, kernel_integrator_state.parent_struct.name[state]; \ } +# ifdef __INTEGRATOR_GPU_PACKED_STATE__ +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \ + KERNEL_STRUCT_BEGIN(parent_struct) \ + KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature) +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) +# else +# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct) +# endif + # define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \ kernel_integrator_state.parent_struct[index].name[to_state] = \ @@ -272,7 +337,9 @@ ccl_device_inline void integrator_state_copy_only(KernelGlobals kg, # include "kernel/integrator/state_template.h" # undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_BEGIN_PACKED # undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_MEMBER_PACKED # undef KERNEL_STRUCT_ARRAY_MEMBER # undef KERNEL_STRUCT_END # undef KERNEL_STRUCT_END_ARRAY @@ -306,6 +373,16 @@ ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg, kernel_integrator_state.parent_struct.name[state]; \ } +# ifdef __INTEGRATOR_GPU_PACKED_STATE__ +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \ + KERNEL_STRUCT_BEGIN(parent_struct) \ + KERNEL_STRUCT_MEMBER(parent_struct, type, packed, feature) +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) +# else +# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct) +# endif + # define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \ kernel_integrator_state.parent_struct[index].name[to_state] = \ @@ -328,7 +405,9 @@ ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg, # include "kernel/integrator/shadow_state_template.h" # undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_BEGIN_PACKED # undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_MEMBER_PACKED # undef KERNEL_STRUCT_ARRAY_MEMBER # undef KERNEL_STRUCT_END # undef KERNEL_STRUCT_END_ARRAY diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 62789987b11..7952d384680 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -16,6 +16,10 @@ # define __EMBREE__ #endif +#ifdef __APPLE__ +# include +#endif + #include "util/math.h" #include "util/math_fast.h" #include "util/math_intersect.h" @@ -731,12 +735,12 @@ typedef struct Ray { float tmax; /* end distance */ float time; /* time (for motion blur) */ - RaySelfPrimitives self; - #ifdef __RAY_DIFFERENTIALS__ float dP; float dD; #endif + + RaySelfPrimitives self; } Ray; /* Intersection */ @@ -748,6 +752,67 @@ typedef struct Intersection { int type; } Intersection; +/* On certain GPUs (Apple Silicon), splitting every integrator state field into its own separate + * array can be detrimental for cache utilisation. By enabling __INTEGRATOR_GPU_PACKED_STATE__, we + * specify that certain fields should be packed together. This improves cache hit ratios in cases + * where fields are often accessed together (e.g. "ray" and "isect"). + */ +#if defined(TARGET_CPU_ARM64) || defined(__KERNEL_METAL_APPLE__) +# define __INTEGRATOR_GPU_PACKED_STATE__ + +/* Generate packed layouts for structs declared with KERNEL_STRUCT_BEGIN_PACKED. For example the + * following template... + * + * KERNEL_STRUCT_BEGIN_PACKED(shadow_ray, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) + * KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, int, self_light, KERNEL_FEATURE_SHADOW_LINKING) + * KERNEL_STRUCT_END(shadow_ray) + * + * ...will produce the following packed struct: + * + * struct packed_shadow_ray { + * packed_float3 P; + * packed_float3 D; + * float tmin; + * float tmax; + * float time; + * float dP; + * int self_light; + * }; + */ + +# define KERNEL_STRUCT_BEGIN(name) struct dummy_##name { +# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct packed_##parent_struct { +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) +# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) type name; +# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) type name; +# define KERNEL_STRUCT_END(name) \ + } \ + ; +# define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \ + } \ + ; +# define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE + +# include "kernel/integrator/shadow_state_template.h" +# include "kernel/integrator/state_template.h" + +# undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_BEGIN_PACKED +# undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_MEMBER_PACKED +# undef KERNEL_STRUCT_ARRAY_MEMBER +# undef KERNEL_STRUCT_END +# undef KERNEL_STRUCT_END_ARRAY +# undef KERNEL_STRUCT_VOLUME_STACK_SIZE + +#endif + /* Primitives */ typedef enum PrimitiveType {