On a M3 MacBook Pro, this change increases the benchmark score by 8% (with classroom seeing a path-tracing speedup of 15%). The integrator state is currently store using struct-of-arrays, with one array per field. Such fine grained separation can result in poor GPU cache utilisation in cases where multiple fields of the same parent struct are accessed together. This PR changes the layout of the `ray`, `isect`, `subsurface`, and `shadow_ray` structs so that the data is interleaved (per parent struct) instead of separate. To try and keep this change localised, I encapsulated the layout change by extending the integrator state access macros, however maybe we want to do this more explicitly? (e.g. by updating every bit of code that accesses these parts of the state). Feedback welcome. Pull Request: https://projects.blender.org/blender/blender/pulls/122015
277 lines
10 KiB
C
277 lines
10 KiB
C
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
/* Integrator State
|
|
*
|
|
* This file defines the data structures that define the state of a path. Any state that is
|
|
* preserved and passed between kernel executions is part of this.
|
|
*
|
|
* The size of this state must be kept as small as possible, to reduce cache misses and keep memory
|
|
* usage under control on GPUs that may execute millions of kernels.
|
|
*
|
|
* Memory may be allocated and passed along in different ways depending on the device. There may
|
|
* be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer
|
|
* to every kernel, or the pointer may exist at program scope or in constant memory. To abstract
|
|
* these differences between devices and experiment with different layouts, macros are used.
|
|
*
|
|
* Use IntegratorState to pass a reference to the integrator state for the current path. These are
|
|
* defined differently on the CPU and GPU. Use ConstIntegratorState instead of const
|
|
* IntegratorState for passing state as read-only, to avoid oddities in typedef behavior.
|
|
*
|
|
* INTEGRATOR_STATE(state, x, y): read nested struct member x.y of IntegratorState
|
|
* INTEGRATOR_STATE_WRITE(state, x, y): write to nested struct member x.y of IntegratorState
|
|
*
|
|
* INTEGRATOR_STATE_ARRAY(state, x, index, y): read x[index].y
|
|
* INTEGRATOR_STATE_ARRAY_WRITE(state, x, index, y): write x[index].y
|
|
*
|
|
* INTEGRATOR_STATE_NULL: use to pass empty state to other functions.
|
|
*/
|
|
|
|
#include "kernel/types.h"
|
|
|
|
#include "util/types.h"
|
|
|
|
#ifdef __PATH_GUIDING__
|
|
# include "util/guiding.h"
|
|
#endif
|
|
|
|
#pragma once
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Data structures */
|
|
|
|
/* Integrator State
|
|
*
|
|
* CPU rendering path state with AoS layout. */
|
|
typedef struct IntegratorShadowStateCPU {
|
|
#define KERNEL_STRUCT_BEGIN(name) struct {
|
|
#define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct {
|
|
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
|
|
#define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
|
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
|
#define KERNEL_STRUCT_END(name) \
|
|
} \
|
|
name;
|
|
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
|
|
} \
|
|
name[cpu_size];
|
|
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
|
#include "kernel/integrator/shadow_state_template.h"
|
|
#undef KERNEL_STRUCT_BEGIN
|
|
#undef KERNEL_STRUCT_BEGIN_PACKED
|
|
#undef KERNEL_STRUCT_MEMBER
|
|
#undef KERNEL_STRUCT_MEMBER_PACKED
|
|
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
|
#undef KERNEL_STRUCT_END
|
|
#undef KERNEL_STRUCT_END_ARRAY
|
|
} IntegratorShadowStateCPU;
|
|
|
|
typedef struct IntegratorStateCPU {
|
|
#define KERNEL_STRUCT_BEGIN(name) struct {
|
|
#define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) struct {
|
|
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
|
|
#define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
|
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
|
#define KERNEL_STRUCT_END(name) \
|
|
} \
|
|
name;
|
|
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
|
|
} \
|
|
name[cpu_size];
|
|
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
|
#include "kernel/integrator/state_template.h"
|
|
#undef KERNEL_STRUCT_BEGIN
|
|
#undef KERNEL_STRUCT_BEGIN_PACKED
|
|
#undef KERNEL_STRUCT_MEMBER
|
|
#undef KERNEL_STRUCT_MEMBER_PACKED
|
|
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
|
#undef KERNEL_STRUCT_END
|
|
#undef KERNEL_STRUCT_END_ARRAY
|
|
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
|
|
|
|
IntegratorShadowStateCPU shadow;
|
|
IntegratorShadowStateCPU ao;
|
|
} IntegratorStateCPU;
|
|
|
|
/* Path Queue
|
|
*
|
|
* Keep track of which kernels are queued to be executed next in the path
|
|
* for GPU rendering. */
|
|
typedef struct IntegratorQueueCounter {
|
|
int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM];
|
|
} IntegratorQueueCounter;
|
|
|
|
#if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
|
|
|
|
/* Generate wrapper structs for all integrator state fields. This allows us to access state
|
|
* uniformly, regardless of whether it stored in a packed struct or separate arrays. */
|
|
# define KERNEL_STRUCT_BEGIN(name)
|
|
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
|
struct Wrapped_##parent_struct##_##name { \
|
|
type name; \
|
|
};
|
|
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
|
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
|
KERNEL_STRUCT_BEGIN(parent_struct) \
|
|
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
|
# define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
|
# define KERNEL_STRUCT_END(name)
|
|
# define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size)
|
|
# define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
|
|
|
# include "kernel/integrator/shadow_state_template.h"
|
|
# include "kernel/integrator/state_template.h"
|
|
|
|
# undef KERNEL_STRUCT_BEGIN
|
|
# undef KERNEL_STRUCT_BEGIN_PACKED
|
|
# undef KERNEL_STRUCT_MEMBER
|
|
# undef KERNEL_STRUCT_MEMBER_PACKED
|
|
# undef KERNEL_STRUCT_ARRAY_MEMBER
|
|
# undef KERNEL_STRUCT_END
|
|
# undef KERNEL_STRUCT_END_ARRAY
|
|
# undef KERNEL_STRUCT_VOLUME_STACK_SIZE
|
|
|
|
#endif
|
|
|
|
/* Integrator State GPU
|
|
*
|
|
* GPU rendering path state with SoA layout. */
|
|
typedef struct IntegratorStateGPU {
|
|
#define KERNEL_STRUCT_BEGIN(name) struct {
|
|
|
|
#ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
|
|
|
# ifdef __KERNEL_GPU__
|
|
|
|
/* If we've opted in to packed layouts, generate member functions that return a pointer to a
|
|
* wrapper type so we can access state using uniform syntax. */
|
|
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
|
|
ccl_global Wrapped_##parent_struct##_##name *name; \
|
|
ccl_device_inline ccl_global Wrapped_##parent_struct##_##name *name##_fn() ccl_constant \
|
|
{ \
|
|
return (ccl_global Wrapped_##parent_struct##_##name *)name; \
|
|
}
|
|
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature) \
|
|
ccl_device_inline ccl_global packed_##parent_struct *name##_fn() ccl_constant \
|
|
{ \
|
|
return (ccl_global packed_##parent_struct *)packed; \
|
|
}
|
|
# else
|
|
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
|
|
# define KERNEL_STRUCT_MEMBER_PACKED(parent_struct, type, name, feature)
|
|
# endif
|
|
|
|
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) \
|
|
KERNEL_STRUCT_BEGIN(parent_struct) \
|
|
KERNEL_STRUCT_MEMBER(parent_struct, packed_##parent_struct, packed, feature)
|
|
|
|
#else
|
|
# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
|
|
# define KERNEL_STRUCT_MEMBER_PACKED KERNEL_STRUCT_MEMBER
|
|
# define KERNEL_STRUCT_BEGIN_PACKED(parent_struct, feature) KERNEL_STRUCT_BEGIN(parent_struct)
|
|
#endif
|
|
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
|
|
#define KERNEL_STRUCT_END(name) \
|
|
} \
|
|
name;
|
|
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
|
|
} \
|
|
name[gpu_size];
|
|
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
|
|
|
|
#include "kernel/integrator/state_template.h"
|
|
|
|
#include "kernel/integrator/shadow_state_template.h"
|
|
|
|
#undef KERNEL_STRUCT_BEGIN
|
|
#undef KERNEL_STRUCT_BEGIN_PACKED
|
|
#undef KERNEL_STRUCT_MEMBER
|
|
#undef KERNEL_STRUCT_MEMBER_PACKED
|
|
#undef KERNEL_STRUCT_ARRAY_MEMBER
|
|
#undef KERNEL_STRUCT_END
|
|
#undef KERNEL_STRUCT_END_ARRAY
|
|
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
|
|
|
|
/* Count number of queued kernels. */
|
|
ccl_global IntegratorQueueCounter *queue_counter;
|
|
|
|
/* Count number of kernels queued for specific shaders. */
|
|
ccl_global int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
|
|
|
|
/* Index of shadow path which will be used by a next shadow path. */
|
|
ccl_global int *next_shadow_path_index;
|
|
|
|
/* Index of main path which will be used by a next shadow catcher split. */
|
|
ccl_global int *next_main_path_index;
|
|
|
|
/* Partition/key offsets used when writing sorted active indices. */
|
|
ccl_global int *sort_partition_key_offsets;
|
|
|
|
/* Divisor used to partition active indices by locality when sorting by material. */
|
|
uint sort_partition_divisor;
|
|
} IntegratorStateGPU;
|
|
|
|
/* Abstraction
|
|
*
|
|
* Macros to access data structures on different devices.
|
|
*
|
|
* Note that there is a special access function for the shadow catcher state. This access is to
|
|
* happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors
|
|
* from a kernel which operates on a shadow catcher state will cause bad memory access. */
|
|
|
|
#ifndef __KERNEL_GPU__
|
|
|
|
/* Scalar access on CPU. */
|
|
|
|
typedef IntegratorStateCPU *ccl_restrict IntegratorState;
|
|
typedef const IntegratorStateCPU *ccl_restrict ConstIntegratorState;
|
|
typedef IntegratorShadowStateCPU *ccl_restrict IntegratorShadowState;
|
|
typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
|
|
|
|
# define INTEGRATOR_STATE_NULL nullptr
|
|
|
|
# define INTEGRATOR_STATE(state, nested_struct, member) ((state)->nested_struct.member)
|
|
# define INTEGRATOR_STATE_WRITE(state, nested_struct, member) ((state)->nested_struct.member)
|
|
|
|
# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
|
|
((state)->nested_struct[array_index].member)
|
|
# define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
|
|
((state)->nested_struct[array_index].member)
|
|
|
|
#else /* !__KERNEL_GPU__ */
|
|
|
|
/* Array access on GPU with Structure-of-Arrays. */
|
|
|
|
typedef int IntegratorState;
|
|
typedef int ConstIntegratorState;
|
|
typedef int IntegratorShadowState;
|
|
typedef int ConstIntegratorShadowState;
|
|
|
|
# define INTEGRATOR_STATE_NULL -1
|
|
|
|
# ifdef __INTEGRATOR_GPU_PACKED_STATE__
|
|
|
|
/* If we've opted in to packed layouts, we use the generated accessor functions (member##_fn) to
|
|
* resolve different layouts (packed vs separate). */
|
|
# define INTEGRATOR_STATE(state, nested_struct, member) \
|
|
kernel_integrator_state.nested_struct.member##_fn()[state].member
|
|
# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
|
|
kernel_integrator_state.nested_struct[array_index].member##_fn()[state].member
|
|
# else
|
|
# define INTEGRATOR_STATE(state, nested_struct, member) \
|
|
kernel_integrator_state.nested_struct.member[state]
|
|
# define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
|
|
kernel_integrator_state.nested_struct[array_index].member[state]
|
|
# endif
|
|
|
|
# define INTEGRATOR_STATE_WRITE(state, nested_struct, member) \
|
|
INTEGRATOR_STATE(state, nested_struct, member)
|
|
# define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
|
|
INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member)
|
|
|
|
#endif /* !__KERNEL_GPU__ */
|
|
|
|
CCL_NAMESPACE_END
|