Cycles: "Struct-of-array-of-packed-structs" for parts of the integrator state
On a M3 MacBook Pro, this change increases the benchmark score by 8% (with classroom seeing a path-tracing speedup of 15%). The integrator state is currently store using struct-of-arrays, with one array per field. Such fine grained separation can result in poor GPU cache utilisation in cases where multiple fields of the same parent struct are accessed together. This PR changes the layout of the `ray`, `isect`, `subsurface`, and `shadow_ray` structs so that the data is interleaved (per parent struct) instead of separate. To try and keep this change localised, I encapsulated the layout change by extending the integrator state access macros, however maybe we want to do this more explicitly? (e.g. by updating every bit of code that accesses these parts of the state). Feedback welcome. Pull Request: https://projects.blender.org/blender/blender/pulls/122015
This commit is contained in:
committed by
Michael Jones (Apple)
parent
c7807a425a
commit
5be30b7d2b
@@ -25,8 +25,21 @@ static size_t estimate_single_state_size(const uint kernel_features)
|
||||
|
||||
#define KERNEL_STRUCT_BEGIN(name) \
|
||||
for (int array_index = 0;; array_index++) {
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
|
||||
|
||||
#ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
||||
KERNEL_STRUCT_BEGIN(parent_struct) \
|
||||
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
||||
#else
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
|
||||
#endif
|
||||
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
|
||||
state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
@@ -50,7 +63,9 @@ static size_t estimate_single_state_size(const uint kernel_features)
|
||||
#include "kernel/integrator/shadow_state_template.h"
|
||||
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
#undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
#undef KERNEL_STRUCT_END
|
||||
#undef KERNEL_STRUCT_END_ARRAY
|
||||
@@ -128,12 +143,31 @@ void PathTraceWorkGPU::alloc_integrator_soa()
|
||||
for (int array_index = 0;; array_index++) {
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
|
||||
string name_str = string_printf("%sintegrator_state_" #name, shadow ? "shadow_" : ""); \
|
||||
string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
|
||||
shadow ? "shadow_" : ""); \
|
||||
device_only_memory<type> *array = new device_only_memory<type>(device_, name_str.c_str()); \
|
||||
array->alloc_to_device(max_num_paths_); \
|
||||
integrator_state_soa_.emplace_back(array); \
|
||||
integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
|
||||
memcpy(&integrator_state_gpu_.parent_struct.name, \
|
||||
&array->device_pointer, \
|
||||
sizeof(array->device_pointer)); \
|
||||
}
|
||||
#ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
|
||||
if ((kernel_features & (feature))) { \
|
||||
string name_str = string_printf("%sintegrator_state_" #parent_struct "_" #name, \
|
||||
shadow ? "shadow_" : ""); \
|
||||
VLOG_DEBUG << "Skipping " << name_str \
|
||||
<< " -- data is packed inside integrator_state_" #parent_struct "_packed"; \
|
||||
}
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
||||
KERNEL_STRUCT_BEGIN(parent_struct) \
|
||||
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
||||
#else
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
|
||||
#endif
|
||||
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
|
||||
if ((kernel_features & (feature)) && \
|
||||
(integrator_state_gpu_.parent_struct[array_index].name == nullptr)) \
|
||||
@@ -143,7 +177,9 @@ void PathTraceWorkGPU::alloc_integrator_soa()
|
||||
device_only_memory<type> *array = new device_only_memory<type>(device_, name_str.c_str()); \
|
||||
array->alloc_to_device(max_num_paths_); \
|
||||
integrator_state_soa_.emplace_back(array); \
|
||||
integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
|
||||
memcpy(&integrator_state_gpu_.parent_struct[array_index].name, \
|
||||
&array->device_pointer, \
|
||||
sizeof(array->device_pointer)); \
|
||||
}
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
(void)array_index; \
|
||||
@@ -162,7 +198,9 @@ void PathTraceWorkGPU::alloc_integrator_soa()
|
||||
#include "kernel/integrator/shadow_state_template.h"
|
||||
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
#undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
#undef KERNEL_STRUCT_END
|
||||
#undef KERNEL_STRUCT_END_ARRAY
|
||||
|
||||
@@ -66,6 +66,8 @@ using namespace metal::raytracing;
|
||||
|
||||
#define kernel_assert(cond)
|
||||
|
||||
#define offsetof(t, d) __builtin_offsetof(t, d)
|
||||
|
||||
#define ccl_gpu_global_id_x() metal_global_id
|
||||
#define ccl_gpu_warp_size simdgroup_size
|
||||
#define ccl_gpu_thread_idx_x simd_group_index
|
||||
|
||||
@@ -56,14 +56,14 @@ KERNEL_STRUCT_END(shadow_path)
|
||||
|
||||
/********************************** Shadow Ray *******************************/
|
||||
|
||||
KERNEL_STRUCT_BEGIN(shadow_ray)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(shadow_ray, int, self_light, KERNEL_FEATURE_SHADOW_LINKING)
|
||||
KERNEL_STRUCT_BEGIN_PACKED(shadow_ray, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, int, self_light, KERNEL_FEATURE_SHADOW_LINKING)
|
||||
KERNEL_STRUCT_END(shadow_ray)
|
||||
|
||||
/*********************** Shadow Intersection result **************************/
|
||||
|
||||
@@ -47,7 +47,9 @@ CCL_NAMESPACE_BEGIN
|
||||
* CPU rendering path state with AoS layout. */
|
||||
typedef struct IntegratorShadowStateCPU {
|
||||
#define KERNEL_STRUCT_BEGIN(name) struct {
|
||||
#define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct {
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
|
||||
#define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
@@ -58,7 +60,9 @@ typedef struct IntegratorShadowStateCPU {
|
||||
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
||||
#include "kernel/integrator/shadow_state_template.h"
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
#undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
#undef KERNEL_STRUCT_END
|
||||
#undef KERNEL_STRUCT_END_ARRAY
|
||||
@@ -66,7 +70,9 @@ typedef struct IntegratorShadowStateCPU {
|
||||
|
||||
typedef struct IntegratorStateCPU {
|
||||
#define KERNEL_STRUCT_BEGIN(name) struct {
|
||||
#define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct {
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
|
||||
#define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
@@ -77,7 +83,9 @@ typedef struct IntegratorStateCPU {
|
||||
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
||||
#include "kernel/integrator/state_template.h"
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
#undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
#undef KERNEL_STRUCT_END
|
||||
#undef KERNEL_STRUCT_END_ARRAY
|
||||
@@ -95,12 +103,75 @@ typedef struct IntegratorQueueCounter {
|
||||
int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM];
|
||||
} IntegratorQueueCounter;
|
||||
|
||||
#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
|
||||
|
||||
/* Generate wrapper structs for all integrator state fields. This allows us to access state
|
||||
* uniformly, regardless of whether it stored in a packed struct or separate arrays. */
|
||||
# define KERNEL_STRUCT_BEGIN(name)
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
struct Wrapped_##parent_struct##_##name { \
|
||||
type name; \
|
||||
};
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
||||
KERNEL_STRUCT_BEGIN(parent_struct) \
|
||||
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
||||
# define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_END(name)
|
||||
# define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size)
|
||||
# define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
||||
|
||||
# include "kernel/integrator/shadow_state_template.h"
|
||||
# include "kernel/integrator/state_template.h"
|
||||
|
||||
# undef KERNEL_STRUCT_BEGIN
|
||||
# undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
# undef KERNEL_STRUCT_MEMBER
|
||||
# undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
# undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
# undef KERNEL_STRUCT_END
|
||||
# undef KERNEL_STRUCT_END_ARRAY
|
||||
# undef KERNEL_STRUCT_VOLUME_STACK_SIZE
|
||||
|
||||
#endif
|
||||
|
||||
/* Integrator State GPU
|
||||
*
|
||||
* GPU rendering path state with SoA layout. */
|
||||
typedef struct IntegratorStateGPU {
|
||||
#define KERNEL_STRUCT_BEGIN(name) struct {
|
||||
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
|
||||
|
||||
#ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
||||
|
||||
# ifdef __KERNEL_GPU__
|
||||
|
||||
/* If we've opted in to packed layouts, generate member functions that return a pointer to a
|
||||
* wrapper type so we can access state using uniform syntax. */
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
||||
ccl_global Wrapped_##parent_struct##_##name *name; \
|
||||
ccl_device_inline ccl_global Wrapped_##parent_struct##_##name *name##_fn() ccl_constant \
|
||||
{ \
|
||||
return (ccl_global Wrapped_##parent_struct##_##name *)name; \
|
||||
}
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
|
||||
ccl_device_inline ccl_global packed_##parent_struct *name##_fn() ccl_constant \
|
||||
{ \
|
||||
return (ccl_global packed_##parent_struct *)packed; \
|
||||
}
|
||||
# else
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
|
||||
# endif
|
||||
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
||||
KERNEL_STRUCT_BEGIN(parent_struct) \
|
||||
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
||||
|
||||
#else
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
|
||||
#endif
|
||||
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
@@ -115,7 +186,9 @@ typedef struct IntegratorStateGPU {
|
||||
#include "kernel/integrator/shadow_state_template.h"
|
||||
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
#undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
#undef KERNEL_STRUCT_END
|
||||
#undef KERNEL_STRUCT_END_ARRAY
|
||||
@@ -178,13 +251,23 @@ typedef int ConstIntegratorShadowState;
|
||||
|
||||
# define INTEGRATOR_STATE_NULL -1
|
||||
|
||||
# define INTEGRATOR_STATE(state, nested_struct, member) \
|
||||
kernel_integrator_state.nested_struct.member[state]
|
||||
# ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
||||
|
||||
/* If we've opted in to packed layouts, we use the generated accessor functions (member##_fn) to
|
||||
* resolve different layouts (packed vs separate). */
|
||||
# define INTEGRATOR_STATE(state, nested_struct, member) \
|
||||
kernel_integrator_state.nested_struct.member##_fn()[state].member
|
||||
# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
|
||||
kernel_integrator_state.nested_struct[array_index].member##_fn()[state].member
|
||||
# else
|
||||
# define INTEGRATOR_STATE(state, nested_struct, member) \
|
||||
kernel_integrator_state.nested_struct.member[state]
|
||||
# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
|
||||
kernel_integrator_state.nested_struct[array_index].member[state]
|
||||
# endif
|
||||
|
||||
# define INTEGRATOR_STATE_WRITE(state, nested_struct, member) \
|
||||
INTEGRATOR_STATE(state, nested_struct, member)
|
||||
|
||||
# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
|
||||
kernel_integrator_state.nested_struct[array_index].member[state]
|
||||
# define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
|
||||
INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member)
|
||||
|
||||
|
||||
@@ -67,36 +67,36 @@ KERNEL_STRUCT_END(path)
|
||||
|
||||
/************************************** Ray ***********************************/
|
||||
|
||||
KERNEL_STRUCT_BEGIN(ray)
|
||||
KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_BEGIN_PACKED(ray, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(ray, float, previous_dt, KERNEL_FEATURE_LIGHT_TREE)
|
||||
KERNEL_STRUCT_END(ray)
|
||||
|
||||
/*************************** Intersection result ******************************/
|
||||
|
||||
/* Result from scene intersection. */
|
||||
KERNEL_STRUCT_BEGIN(isect)
|
||||
KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_BEGIN_PACKED(isect, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(isect, float, t, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(isect, float, u, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(isect, float, v, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(isect, int, object, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(isect, int, type, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_END(isect)
|
||||
|
||||
/*************** Subsurface closure state for subsurface kernel ***************/
|
||||
|
||||
KERNEL_STRUCT_BEGIN(subsurface)
|
||||
KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, albedo, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, radius, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER(subsurface, packed_float3, N, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_BEGIN_PACKED(subsurface, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(subsurface, PackedSpectrum, albedo, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(subsurface, PackedSpectrum, radius, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_MEMBER_PACKED(subsurface, packed_float3, N, KERNEL_FEATURE_SUBSURFACE)
|
||||
KERNEL_STRUCT_END(subsurface)
|
||||
|
||||
/********************************** Volume Stack ******************************/
|
||||
|
||||
@@ -15,6 +15,31 @@ CCL_NAMESPACE_BEGIN
|
||||
ccl_device_forceinline void integrator_state_write_ray(IntegratorState state,
|
||||
ccl_private const Ray *ccl_restrict ray)
|
||||
{
|
||||
#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
|
||||
static_assert(sizeof(ray->P) == sizeof(float4), "Bad assumption about float3 padding");
|
||||
/* dP and dP are packed based on the assumption that float3 is padded to 16 bytes.
|
||||
* This assumption hold trues on Metal, but not CUDA.
|
||||
*/
|
||||
((ccl_private float4 &)ray->P).w = ray->dP;
|
||||
((ccl_private float4 &)ray->D).w = ray->dD;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, packed) = (ccl_private packed_ray &)*ray;
|
||||
|
||||
/* Ensure that we can correctly cast between Ray and the generated packed_ray struct. */
|
||||
static_assert(offsetof(packed_ray, P) == offsetof(Ray, P),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
static_assert(offsetof(packed_ray, D) == offsetof(Ray, D),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
static_assert(offsetof(packed_ray, tmin) == offsetof(Ray, tmin),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
static_assert(offsetof(packed_ray, tmax) == offsetof(Ray, tmax),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
static_assert(offsetof(packed_ray, time) == offsetof(Ray, time),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
static_assert(offsetof(packed_ray, dP) == 12 + offsetof(Ray, P),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
static_assert(offsetof(packed_ray, dD) == 12 + offsetof(Ray, D),
|
||||
"Generated packed_ray struct is misaligned with Ray struct");
|
||||
#else
|
||||
INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, tmin) = ray->tmin;
|
||||
@@ -22,11 +47,17 @@ ccl_device_forceinline void integrator_state_write_ray(IntegratorState state,
|
||||
INTEGRATOR_STATE_WRITE(state, ray, time) = ray->time;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, dP) = ray->dP;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, dD) = ray->dD;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_forceinline void integrator_state_read_ray(ConstIntegratorState state,
|
||||
ccl_private Ray *ccl_restrict ray)
|
||||
{
|
||||
#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
|
||||
*((ccl_private packed_ray *)ray) = INTEGRATOR_STATE(state, ray, packed);
|
||||
ray->dP = ((ccl_private float4 &)ray->P).w;
|
||||
ray->dD = ((ccl_private float4 &)ray->D).w;
|
||||
#else
|
||||
ray->P = INTEGRATOR_STATE(state, ray, P);
|
||||
ray->D = INTEGRATOR_STATE(state, ray, D);
|
||||
ray->tmin = INTEGRATOR_STATE(state, ray, tmin);
|
||||
@@ -34,6 +65,7 @@ ccl_device_forceinline void integrator_state_read_ray(ConstIntegratorState state
|
||||
ray->time = INTEGRATOR_STATE(state, ray, time);
|
||||
ray->dP = INTEGRATOR_STATE(state, ray, dP);
|
||||
ray->dD = INTEGRATOR_STATE(state, ray, dD);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Shadow Ray */
|
||||
@@ -96,23 +128,46 @@ ccl_device_forceinline void integrator_state_read_shadow_ray_self(
|
||||
ccl_device_forceinline void integrator_state_write_isect(
|
||||
IntegratorState state, ccl_private const Intersection *ccl_restrict isect)
|
||||
{
|
||||
#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
|
||||
INTEGRATOR_STATE_WRITE(state, isect, packed) = (ccl_private packed_isect &)*isect;
|
||||
|
||||
/* Ensure that we can correctly cast between Intersection and the generated packed_isect struct.
|
||||
*/
|
||||
static_assert(offsetof(packed_isect, t) == offsetof(Intersection, t),
|
||||
"Generated packed_isect struct is misaligned with Intersection struct");
|
||||
static_assert(offsetof(packed_isect, u) == offsetof(Intersection, u),
|
||||
"Generated packed_isect struct is misaligned with Intersection struct");
|
||||
static_assert(offsetof(packed_isect, v) == offsetof(Intersection, v),
|
||||
"Generated packed_isect struct is misaligned with Intersection struct");
|
||||
static_assert(offsetof(packed_isect, object) == offsetof(Intersection, object),
|
||||
"Generated packed_isect struct is misaligned with Intersection struct");
|
||||
static_assert(offsetof(packed_isect, prim) == offsetof(Intersection, prim),
|
||||
"Generated packed_isect struct is misaligned with Intersection struct");
|
||||
static_assert(offsetof(packed_isect, type) == offsetof(Intersection, type),
|
||||
"Generated packed_isect struct is misaligned with Intersection struct");
|
||||
#else
|
||||
INTEGRATOR_STATE_WRITE(state, isect, t) = isect->t;
|
||||
INTEGRATOR_STATE_WRITE(state, isect, u) = isect->u;
|
||||
INTEGRATOR_STATE_WRITE(state, isect, v) = isect->v;
|
||||
INTEGRATOR_STATE_WRITE(state, isect, object) = isect->object;
|
||||
INTEGRATOR_STATE_WRITE(state, isect, prim) = isect->prim;
|
||||
INTEGRATOR_STATE_WRITE(state, isect, type) = isect->type;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_forceinline void integrator_state_read_isect(
|
||||
ConstIntegratorState state, ccl_private Intersection *ccl_restrict isect)
|
||||
{
|
||||
#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
|
||||
*((ccl_private packed_isect *)isect) = INTEGRATOR_STATE(state, isect, packed);
|
||||
#else
|
||||
isect->prim = INTEGRATOR_STATE(state, isect, prim);
|
||||
isect->object = INTEGRATOR_STATE(state, isect, object);
|
||||
isect->type = INTEGRATOR_STATE(state, isect, type);
|
||||
isect->u = INTEGRATOR_STATE(state, isect, u);
|
||||
isect->v = INTEGRATOR_STATE(state, isect, v);
|
||||
isect->t = INTEGRATOR_STATE(state, isect, t);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __VOLUME__
|
||||
@@ -250,6 +305,16 @@ ccl_device_inline void integrator_state_copy_only(KernelGlobals kg,
|
||||
kernel_integrator_state.parent_struct.name[state]; \
|
||||
}
|
||||
|
||||
# ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
||||
KERNEL_STRUCT_BEGIN(parent_struct) \
|
||||
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
|
||||
# else
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
|
||||
# endif
|
||||
|
||||
# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
|
||||
if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
|
||||
kernel_integrator_state.parent_struct[index].name[to_state] = \
|
||||
@@ -272,7 +337,9 @@ ccl_device_inline void integrator_state_copy_only(KernelGlobals kg,
|
||||
# include "kernel/integrator/state_template.h"
|
||||
|
||||
# undef KERNEL_STRUCT_BEGIN
|
||||
# undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
# undef KERNEL_STRUCT_MEMBER
|
||||
# undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
# undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
# undef KERNEL_STRUCT_END
|
||||
# undef KERNEL_STRUCT_END_ARRAY
|
||||
@@ -306,6 +373,16 @@ ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
|
||||
kernel_integrator_state.parent_struct.name[state]; \
|
||||
}
|
||||
|
||||
# ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
||||
KERNEL_STRUCT_BEGIN(parent_struct) \
|
||||
KERNEL_STRUCT_MEMBER(parent_struct, type, packed, feature)
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
|
||||
# else
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
|
||||
# endif
|
||||
|
||||
# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
|
||||
if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
|
||||
kernel_integrator_state.parent_struct[index].name[to_state] = \
|
||||
@@ -328,7 +405,9 @@ ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
|
||||
# include "kernel/integrator/shadow_state_template.h"
|
||||
|
||||
# undef KERNEL_STRUCT_BEGIN
|
||||
# undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
# undef KERNEL_STRUCT_MEMBER
|
||||
# undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
# undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
# undef KERNEL_STRUCT_END
|
||||
# undef KERNEL_STRUCT_END_ARRAY
|
||||
|
||||
@@ -16,6 +16,10 @@
|
||||
# define __EMBREE__
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
# include <TargetConditionals.h>
|
||||
#endif
|
||||
|
||||
#include "util/math.h"
|
||||
#include "util/math_fast.h"
|
||||
#include "util/math_intersect.h"
|
||||
@@ -731,12 +735,12 @@ typedef struct Ray {
|
||||
float tmax; /* end distance */
|
||||
float time; /* time (for motion blur) */
|
||||
|
||||
RaySelfPrimitives self;
|
||||
|
||||
#ifdef __RAY_DIFFERENTIALS__
|
||||
float dP;
|
||||
float dD;
|
||||
#endif
|
||||
|
||||
RaySelfPrimitives self;
|
||||
} Ray;
|
||||
|
||||
/* Intersection */
|
||||
@@ -748,6 +752,67 @@ typedef struct Intersection {
|
||||
int type;
|
||||
} Intersection;
|
||||
|
||||
/* On certain GPUs (Apple Silicon), splitting every integrator state field into its own separate
|
||||
* array can be detrimental for cache utilisation. By enabling __INTEGRATOR_GPU_PACKED_STATE__, we
|
||||
* specify that certain fields should be packed together. This improves cache hit ratios in cases
|
||||
* where fields are often accessed together (e.g. "ray" and "isect").
|
||||
*/
|
||||
#if defined(TARGET_CPU_ARM64) || defined(__KERNEL_METAL_APPLE__)
|
||||
# define __INTEGRATOR_GPU_PACKED_STATE__
|
||||
|
||||
/* Generate packed layouts for structs declared with KERNEL_STRUCT_BEGIN_PACKED. For example the
|
||||
* following template...
|
||||
*
|
||||
* KERNEL_STRUCT_BEGIN_PACKED(shadow_ray, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
|
||||
* KERNEL_STRUCT_MEMBER_PACKED(shadow_ray, int, self_light, KERNEL_FEATURE_SHADOW_LINKING)
|
||||
* KERNEL_STRUCT_END(shadow_ray)
|
||||
*
|
||||
* ...will produce the following packed struct:
|
||||
*
|
||||
* struct packed_shadow_ray {
|
||||
* packed_float3 P;
|
||||
* packed_float3 D;
|
||||
* float tmin;
|
||||
* float tmax;
|
||||
* float time;
|
||||
* float dP;
|
||||
* int self_light;
|
||||
* };
|
||||
*/
|
||||
|
||||
# define KERNEL_STRUCT_BEGIN(name) struct dummy_##name {
|
||||
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct packed_##parent_struct {
|
||||
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature)
|
||||
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) type name;
|
||||
# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) type name;
|
||||
# define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
;
|
||||
# define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
|
||||
} \
|
||||
;
|
||||
# define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
||||
|
||||
# include "kernel/integrator/shadow_state_template.h"
|
||||
# include "kernel/integrator/state_template.h"
|
||||
|
||||
# undef KERNEL_STRUCT_BEGIN
|
||||
# undef KERNEL_STRUCT_BEGIN_PACKED
|
||||
# undef KERNEL_STRUCT_MEMBER
|
||||
# undef KERNEL_STRUCT_MEMBER_PACKED
|
||||
# undef KERNEL_STRUCT_ARRAY_MEMBER
|
||||
# undef KERNEL_STRUCT_END
|
||||
# undef KERNEL_STRUCT_END_ARRAY
|
||||
# undef KERNEL_STRUCT_VOLUME_STACK_SIZE
|
||||
|
||||
#endif
|
||||
|
||||
/* Primitives */
|
||||
|
||||
typedef enum PrimitiveType {
|
||||
|
||||
Reference in New Issue
Block a user