intern/cycles/kernel/integrator/state_flow.h
Brecht Van Lommel 523bbf7065 Cycles: generalize shader sorting / locality heuristic to all GPU devices
This was added for Metal, but also gives good results with CUDA and OptiX.
Also enable it for future Apple GPUs instead of only M1 and M2. Since this has
been shown to help across multiple GPUs, enabling it by default seems the
better bet than disabling it.

This also moves some of the logic outside of the Metal device code, and always
enables the code in the kernel, since other devices don't do dynamic compilation.

Time per sample with OptiX + RTX A6000:
                                         new                  old
barbershop_interior                      0.0730s              0.0727s
bmw27                                    0.0047s              0.0053s
classroom                                0.0428s              0.0464s
fishy_cat                                0.0102s              0.0108s
junkshop                                 0.0366s              0.0395s
koro                                     0.0567s              0.0578s
monster                                  0.0206s              0.0223s
pabellon                                 0.0158s              0.0174s
sponza                                   0.0088s              0.0100s
spring                                   0.1267s              0.1280s
victor                                   0.0524s              0.0531s
wdas_cloud                               0.0817s              0.0816s

Ref D15331, T87836
2022-07-15 13:42:47 +02:00


/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#pragma once

#include "kernel/types.h"
#include "util/atomic.h"

CCL_NAMESPACE_BEGIN

/* Control Flow
 *
 * Utilities for control flow between kernels. The implementation differs between CPU and
 * GPU devices: on the latter, part of the logic is handled on the host side with wavefronts.
 *
 * There is a main path for regular path tracing from the camera. Shadow rays for next
 * event estimation branch off from this into their own path, which may be computed in
 * parallel while the main path continues. Additionally, shading kernels are sorted using
 * a key for coherence.
 *
 * Each kernel on the main path must call exactly one of these functions, and may not call
 * it multiple times from the same kernel:
 *
 *   integrator_path_init(kg, state, next_kernel)
 *   integrator_path_next(kg, state, current_kernel, next_kernel)
 *   integrator_path_terminate(kg, state, current_kernel)
 *
 * For the shadow path similar functions are used, and again each shadow kernel must call
 * one of them, and only once.
 */
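/* Illustrative sketch (not part of this header): a hypothetical intersection kernel
 * advancing the main path. The kernel enum values below exist in kernel/types.h, but
 * this caller and the elided intersection test are assumptions for illustration; the
 * real callers live in the integrator kernel implementations.
 *
 *   ccl_device void example_intersect_kernel(KernelGlobals kg, IntegratorState state)
 *   {
 *     if (/* ray hit a surface *\/) {
 *       /* Hand the state off to the surface shading kernel. *\/
 *       integrator_path_next(kg,
 *                            state,
 *                            DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
 *                            DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
 *     }
 *     else {
 *       /* Nothing hit and no background to shade: end the path. *\/
 *       integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
 *     }
 *   }
 */
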
ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state)
{
  return INTEGRATOR_STATE(state, path, queued_kernel) == 0;
}

ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state)
{
  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
}

#ifdef __KERNEL_GPU__

ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel next_kernel)
{
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel current_kernel,
                                                 const DeviceKernel next_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
                                                      IntegratorState state,
                                                      const DeviceKernel current_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
}

ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
{
  IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32(
      &kernel_integrator_state.next_shadow_path_index[0], 1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
  return shadow_state;
}

ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
                                                        IntegratorShadowState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
                                                             IntegratorShadowState state,
                                                             const DeviceKernel current_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
}

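/* Illustrative sketch (not part of this header): a hypothetical shading kernel branching
 * a shadow path off for next event estimation while the main path continues to the next
 * bounce. The enum values are real, but the call sequence here is an assumption for
 * illustration, not the actual shading kernel code.
 *
 *   IntegratorShadowState shadow_state = integrator_shadow_path_init(
 *       kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false);
 *   /* ... fill in the shadow ray in shadow_state; it is traced independently ... *\/
 *   integrator_path_next(kg,
 *                        state,
 *                        DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
 *                        DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
 */
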
/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
#  define INTEGRATOR_SORT_KEY(key, state) \
    ((key) + kernel_data.max_shaders * ((state) / kernel_integrator_state.sort_partition_divisor))
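
/* Worked example (values are hypothetical): with kernel_data.max_shaders = 64 and
 * sort_partition_divisor = 4096, a state at index 10000 with shader key 3 maps to
 * 3 + 64 * (10000 / 4096) = 3 + 64 * 2 = 131. States thus land in per-partition
 * buckets of contiguous indices, ordered by shader key within each partition. */
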
ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  const int key_ = INTEGRATOR_SORT_KEY(key, state);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  const int key_ = INTEGRATOR_SORT_KEY(key, state);
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

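/* Taken together, these atomics maintain the invariant that, between kernel launches,
 * num_queued[kernel] equals the number of live states whose queued_kernel is that kernel,
 * and, for sorted kernels, summing sort_key_counter[kernel] over all keys gives the same
 * count. A plausible host-side use, sketched here as an assumption rather than the actual
 * scheduler code, is a prefix sum over the key counters to bucket state indices before
 * launching the shading wavefront:
 *
 *   int offset = 0;
 *   for (int key = 0; key < num_keys; key++) {
 *     key_offsets[key] = offset;
 *     offset += sort_key_counter[kernel][key];
 *   }
 */
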
#else

ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel next_kernel)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  (void)key;
}

ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel current_kernel,
                                                 const DeviceKernel next_kernel)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  (void)current_kernel;
}

ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
                                                      IntegratorState state,
                                                      const DeviceKernel current_kernel)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
  (void)current_kernel;
}

ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  (void)key;
  (void)current_kernel;
}

ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
{
  /* On the CPU the shadow state is embedded in the main path state, with separate
   * sub-states for AO and regular shadow rays. */
  IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
  return shadow_state;
}

ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
                                                        IntegratorShadowState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel)
{
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
  (void)current_kernel;
}

ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
                                                             IntegratorShadowState state,
                                                             const DeviceKernel current_kernel)
{
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
  (void)current_kernel;
}

#endif

CCL_NAMESPACE_END