Files
test2/intern/cycles/kernel/svm/ao.h
Michael Jones 5508b41a40 Cycles: MetalRT optimisations (scene_intersect_shadow + random_walk)
This PR contains optimisations and a general tidy-up of the MetalRT backend.

- Currently `scene_intersect` is used for both normal and (opaque) shadow rays, however the usage patterns are different enough to warrant specialisation. Shadow intersection tests (flagged with `PATH_RAY_SHADOW_OPAQUE`) only need a bool result, but need a larger "self" payload in order to exclude hits against target lights. By specialising we can minimise the payload size in each case (which helps performance) and avoid some dynamic branching. This PR introduces a new `scene_intersect_shadow` function which is specialised in Metal, and currently redirects to `scene_intersect` in the other backends.

- Currently `scene_intersect_local` is implemented for worst-case payload requirements as demanded by `subsurface_disk` (where `max_hits` is 4). The random_walk case only demands 1 hit result which we can retrieve directly from the intersector object (rather than stashing it in the payload). By specialising, we significantly reduce the payload size for random_walk queries, which has a big impact on performance. Additionally, we only need to use a custom intersection function for the first ray test in a random walk (for self-primitive filtering), so this PR forces faster `opaque` intersection testing for all but the first random walk test.

- Currently `scene_intersect_volume` has a lot of redundant code to handle non-triangle primitives despite volumes only being enclosed by trimeshes. This PR removes this code.

Additionally, this PR tidies up the convoluted intersection function linking code, removes some redundant intersection handlers, and uses more consistent naming of intersection functions.

On an M3 MacBook Pro, these changes give a 2-3% performance increase on typical scenes with opaque trimesh materials (e.g. barbershop, classroom, junkshop), but can give over 15% performance increase for certain scenes using random walk SSS (e.g. monster).

Pull Request: https://projects.blender.org/blender/blender/pulls/121397
2024-05-10 16:38:02 +02:00

135 lines
3.4 KiB
C++

/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#include "kernel/bvh/bvh.h"
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
/* Estimate ambient occlusion at the shading point `sd->P`.
 *
 * Traces `num_samples` rays of length `max_dist` starting at `sd->P`, with directions built
 * in a frame around `N`, and returns the fraction of rays for which the intersection query
 * reported no hit. Returns 1.0f without tracing when the distance is not positive, fewer than
 * one sample is requested, there is no shading object, or no BVH is available.
 *
 * Bits tested in `flags`:
 * - NODE_AO_GLOBAL_RADIUS: replace `max_dist` with `kernel_data.integrator.ao_bounces_distance`.
 * - NODE_AO_INSIDE: sample around `-N` instead of `N`.
 * - NODE_AO_ONLY_LOCAL: query `scene_intersect_local` restricted to `sd->object` instead of
 *   `scene_intersect_shadow`.
 *
 * With __KERNEL_OPTIX__ defined the same body is emitted under the name
 * `__direct_callable__svm_node_ao`. */
#  ifdef __KERNEL_OPTIX__
extern "C" __device__ float __direct_callable__svm_node_ao(
#  else
ccl_device float svm_ao(
#  endif
    KernelGlobals kg,
    ConstIntegratorState state,
    ccl_private ShaderData *sd,
    float3 N,
    float max_dist,
    int num_samples,
    int flags)
{
  if (flags & NODE_AO_GLOBAL_RADIUS) {
    max_dist = kernel_data.integrator.ao_bounces_distance;
  }

  /* Nothing to sample, or ray-tracing is impossible because no BVH exists yet (e.g. when
   * evaluating shaders like displacement): report fully unoccluded. */
  const bool nothing_to_sample = (max_dist <= 0.0f || num_samples < 1 ||
                                  sd->object == OBJECT_NONE);
  if (nothing_to_sample || kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
    return 1.0f;
  }

  if (flags & NODE_AO_INSIDE) {
    N = -N;
  }

  /* Tangent frame around the (possibly flipped) normal. */
  float3 T, B;
  make_orthonormals(N, &T, &B);

  /* TODO: support ray-tracing in shadow shader evaluation? */
  RNGState rng_state;
  path_state_rng_load(state, &rng_state);

  /* The flag does not change between samples, so test it once. */
  const bool only_local = (flags & NODE_AO_ONLY_LOCAL) != 0;

  int num_clear = 0;
  for (int i = 0; i < num_samples; i++) {
    const float2 rand_disk = path_branched_rng_2D(
        kg, &rng_state, i, num_samples, PRNG_SURFACE_AO);

    /* Lift the 2D disk sample to a 3D direction: z = sqrt(1 - |disk|^2). */
    const float2 disk = sample_uniform_disk(rand_disk);
    const float height = safe_sqrtf(1.0f - dot(disk, disk));

    /* Set up the occlusion ray, excluding the primitive being shaded. */
    Ray ray;
    ray.P = sd->P;
    ray.D = disk.x * T + disk.y * B + height * N;
    ray.tmin = 0.0f;
    ray.tmax = max_dist;
    ray.time = sd->time;
    ray.self.object = sd->object;
    ray.self.prim = sd->prim;
    ray.self.light_object = OBJECT_NONE;
    ray.self.light_prim = PRIM_NONE;
    ray.self.light = LAMP_NONE;
    ray.dP = differential_zero_compact();
    ray.dD = differential_zero_compact();

    bool hit;
    if (only_local) {
      hit = scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0);
    }
    else {
      hit = scene_intersect_shadow(kg, &ray, PATH_RAY_SHADOW_OPAQUE);
    }

    if (!hit) {
      num_clear++;
    }
  }

  return ((float)num_clear) / num_samples;
}
/* SVM node entry point for ambient occlusion.
 *
 * Node layout as unpacked below:
 * - node.y: flags, distance stack offset, normal stack offset, AO output stack offset.
 * - node.z: color stack offset, color output stack offset, sample count.
 * - node.w: default passed to `stack_load_float_default` for the distance.
 *
 * The AO factor starts at 1.0f and is only recomputed inside the
 * IF_KERNEL_NODES_FEATURE(RAYTRACE) block (presumably gated on `node_feature_mask`; the macro
 * is defined elsewhere). The factor is stored to the AO output, and the input color scaled by
 * the factor is stored to the color output, each only when its stack offset is valid. */
template<uint node_feature_mask, typename ConstIntegratorGenericState>
#  if defined(__KERNEL_OPTIX__)
ccl_device_inline
#  else
ccl_device_noinline
#  endif
    void
    svm_node_ao(KernelGlobals kg,
                ConstIntegratorGenericState state,
                ccl_private ShaderData *sd,
                ccl_private float *stack,
                uint4 node)
{
  /* Decode the packed node parameters. */
  uint ao_flags, distance_slot, normal_slot, ao_out_slot;
  svm_unpack_node_uchar4(node.y, &ao_flags, &distance_slot, &normal_slot, &ao_out_slot);

  uint color_slot, color_out_slot, sample_count;
  svm_unpack_node_uchar3(node.z, &color_slot, &color_out_slot, &sample_count);

  float occlusion = 1.0f;

  IF_KERNEL_NODES_FEATURE(RAYTRACE)
  {
    float distance = stack_load_float_default(stack, distance_slot, node.w);

    /* Use the shading normal unless a normal is provided on the stack. */
    float3 normal = sd->N;
    if (stack_valid(normal_slot)) {
      normal = stack_load_float3(stack, normal_slot);
    }

#  ifdef __KERNEL_OPTIX__
    occlusion = optixDirectCall<float>(
        0, kg, state, sd, normal, distance, sample_count, ao_flags);
#  else
    occlusion = svm_ao(kg, state, sd, normal, distance, sample_count, ao_flags);
#  endif
  }

  if (stack_valid(ao_out_slot)) {
    stack_store_float(stack, ao_out_slot, occlusion);
  }

  if (stack_valid(color_out_slot)) {
    const float3 color = stack_load_float3(stack, color_slot);
    stack_store_float3(stack, color_out_slot, occlusion * color);
  }
}
#endif /* __SHADER_RAYTRACE__ */
CCL_NAMESPACE_END