Files
test/intern/cycles/kernel/svm/util.h
Lukas Stockner 793040ad1c Cycles: Improve parameter packing for the Principled BSDF
The Principled BSDF has a ton of inputs, and the previous SVM code just always
allocated stack space for all of them. This results in a ton of additional
NODE_VALUE_x SVM nodes, which slow down execution.

However, this is not really needed for two reasons:
- First, many inputs are only used conditionally. For example, if the
  subsurface weight is zero, none of the other subsurface inputs are used.
- Many of the inputs have a "usual" value that they will have in most
  materials, so if they happen to have that value we can just indicate that
  by not allocating space for them.
  This is a bit similar to the standard "pack the fixed value and provide
  a stack offset if there's a link" pattern, except that the fixed value
  is a constant in the code and we allocate a NODE_VALUE_x if a different
  fixed value is used.

Therefore, this PR re-implements the parameter packing in a more efficient way:
- If we can determine that a component is disabled, all conditional inputs are
  disconnected (to avoid generating upstream nodes).
- If we can determine that a component is disabled, we skip allocating all
  conditional inputs on the stack.
- The inputs for which a reasonable "usual" value exists are changed to
  respect that, and to only be allocated if they differ.
- param1 and param2 (which are fixed-value-packed as on all BSDF nodes) are
  used to store IOR and roughness, which have a decent chance to be fixed
  values.
- The parameter packing is more aggressive about using uchar4, which makes it
  possible to get rid of two SVM nodes while still storing the same inputs.

The result is a considerable speedup in scenes that make heavy use of the
Principled BSDF:

| Scene | CPU speedup | OptiX speedup |
| --- | --- | --- |
| attic | 5% | 9% |
| bistro | 5% | 8% |
| junkshop | 5% | 10% |
| monster | 3% | 4% |
| spring | 1% | 6% |

Pull Request: https://projects.blender.org/blender/blender/pulls/143910
2025-08-04 18:34:58 +02:00

161 lines
4.7 KiB
C

/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#include "kernel/globals.h"
#include "kernel/types.h"
#include "kernel/svm/types.h"
CCL_NAMESPACE_BEGIN
/* Stack */
/* Read the three consecutive floats starting at stack slot `a` and return them as a float3.
 * Asserts that the last of the three slots is still below SVM_STACK_SIZE. */
ccl_device_inline float3 stack_load_float3(const ccl_private float *stack, const uint a)
{
  kernel_assert(a + 2 < SVM_STACK_SIZE);

  const ccl_private float *src = stack + a;
  const float x = src[0];
  const float y = src[1];
  const float z = src[2];

  return make_float3(x, y, z);
}
/* Load a float3 from stack slot `a`, or return `value` unchanged when `a` is
 * SVM_STACK_INVALID. */
ccl_device_inline float3 stack_load_float3_default(const ccl_private float *stack,
                                                   const uint a,
                                                   const float3 value)
{
  if (a == (uint)SVM_STACK_INVALID) {
    /* No stack slot was provided: use the caller-supplied fallback. */
    return value;
  }

  return stack_load_float3(stack, a);
}
/* Write the float3 `f` to the stack starting at slot `a` (via copy_v3_v3).
 * Asserts that the last of the three slots is still below SVM_STACK_SIZE. */
ccl_device_inline void stack_store_float3(ccl_private float *stack, const uint a, const float3 f)
{
  kernel_assert(a + 2 < SVM_STACK_SIZE);

  ccl_private float *dst = stack + a;
  copy_v3_v3(dst, f);
}
/* Return the float stored in stack slot `a`.
 * Asserts that `a` is below SVM_STACK_SIZE. */
ccl_device_inline float stack_load_float(const ccl_private float *stack, const uint a)
{
  kernel_assert(a < SVM_STACK_SIZE);

  const float result = stack[a];
  return result;
}
/* Load a float from stack slot `a`. When `a` is SVM_STACK_INVALID the fallback `value` is
 * used instead, converted to float with __uint_as_float(). */
ccl_device_inline float stack_load_float_default(const ccl_private float *stack,
                                                 const uint a,
                                                 const uint value)
{
  if (a == (uint)SVM_STACK_INVALID) {
    /* No stack slot was provided: decode the packed fallback. */
    return __uint_as_float(value);
  }

  return stack_load_float(stack, a);
}
/* Load a float from stack slot `a`, or return the float fallback `value` unchanged when
 * `a` is SVM_STACK_INVALID. */
ccl_device_inline float stack_load_float_default(const ccl_private float *stack,
                                                 const uint a,
                                                 const float value)
{
  if (a == (uint)SVM_STACK_INVALID) {
    return value;
  }

  return stack_load_float(stack, a);
}
/* Store the float `f` in stack slot `a`.
 * Asserts that `a` is below SVM_STACK_SIZE. */
ccl_device_inline void stack_store_float(ccl_private float *stack, const uint a, const float f)
{
  kernel_assert(a < SVM_STACK_SIZE);

  ccl_private float *slot = stack + a;
  *slot = f;
}
/* Read stack slot `a` and return it converted to an int with __float_as_int().
 * Asserts that `a` is below SVM_STACK_SIZE. */
ccl_device_inline int stack_load_int(const ccl_private float *stack, const uint a)
{
  kernel_assert(a < SVM_STACK_SIZE);

  const float raw = stack[a];
  return __float_as_int(raw);
}
/* Load an integer from stack slot `a`, or fall back to `value` when `a` is SVM_STACK_INVALID.
 *
 * The fallback is a plain value cast of `value` to int; the stack path goes through
 * stack_load_int(). The stack is only read here, so it is taken as a pointer to const,
 * matching the other stack_load_* helpers and allowing calls with a const stack pointer. */
ccl_device_inline int stack_load_int_default(const ccl_private float *stack,
                                             const uint a,
                                             const uint value)
{
  return (a == (uint)SVM_STACK_INVALID) ? (int)value : stack_load_int(stack, a);
}
/* Store the int `i` in stack slot `a`, converted to float with __int_as_float().
 * Asserts that `a` is below SVM_STACK_SIZE. */
ccl_device_inline void stack_store_int(ccl_private float *stack, const uint a, const int i)
{
  kernel_assert(a < SVM_STACK_SIZE);

  const float encoded = __int_as_float(i);
  stack[a] = encoded;
}
/* Return true when the stack offset `a` is anything other than SVM_STACK_INVALID. */
ccl_device_inline bool stack_valid(const uint a)
{
  const bool is_invalid = (a == (uint)SVM_STACK_INVALID);
  return !is_invalid;
}
/* Reading Nodes */
/* Fetch the svm_nodes entry at `*offset`, then advance `*offset` by one so that the next
 * call reads the following entry. */
ccl_device_inline uint4 read_node(KernelGlobals kg, ccl_private int *const offset)
{
  const int index = *offset;
  const uint4 node = kernel_data_fetch(svm_nodes, index);

  *offset = index + 1;

  return node;
}
/* Fetch the svm_nodes entry at `*offset`, convert each of its four components with
 * __uint_as_float() into a float4, and advance `*offset` by one. */
ccl_device_inline float4 read_node_float(KernelGlobals kg, ccl_private int *const offset)
{
  const int index = *offset;
  const uint4 node = kernel_data_fetch(svm_nodes, index);

  const float x = __uint_as_float(node.x);
  const float y = __uint_as_float(node.y);
  const float z = __uint_as_float(node.z);
  const float w = __uint_as_float(node.w);
  const float4 result = make_float4(x, y, z, w);

  *offset = index + 1;

  return result;
}
/* Fetch the svm_nodes entry at `offset` and return its four components converted with
 * __uint_as_float() as a float4. Unlike the read_* helpers this takes the offset by value
 * and does not advance it. */
ccl_device_inline float4 fetch_node_float(KernelGlobals kg, const int offset)
{
  const uint4 node = kernel_data_fetch(svm_nodes, offset);

  const float x = __uint_as_float(node.x);
  const float y = __uint_as_float(node.y);
  const float z = __uint_as_float(node.z);
  const float w = __uint_as_float(node.w);

  return make_float4(x, y, z, w);
}
/* Split the two lowest bytes of `i` into separate values:
 * `*x` receives bits 0-7 and `*y` receives bits 8-15. */
ccl_device_forceinline void svm_unpack_node_uchar2(const uint i,
                                                   ccl_private uint *x,
                                                   ccl_private uint *y)
{
  const uint byte_mask = 0xFF;

  *x = i & byte_mask;
  *y = (i >> 8) & byte_mask;
}
/* Split the three lowest bytes of `i` into separate values:
 * `*x` receives bits 0-7, `*y` bits 8-15 and `*z` bits 16-23. */
ccl_device_forceinline void svm_unpack_node_uchar3(const uint i,
                                                   ccl_private uint *x,
                                                   ccl_private uint *y,
                                                   ccl_private uint *z)
{
  const uint byte_mask = 0xFF;

  *x = i & byte_mask;
  *y = (i >> 8) & byte_mask;
  *z = (i >> 16) & byte_mask;
}
/* Split `i` into four byte-sized values:
 * `*x` receives bits 0-7, `*y` bits 8-15, `*z` bits 16-23 and `*w` bits 24-31. */
ccl_device_forceinline void svm_unpack_node_uchar4(const uint i,
                                                   ccl_private uint *x,
                                                   ccl_private uint *y,
                                                   ccl_private uint *z,
                                                   ccl_private uint *w)
{
  const uint byte_mask = 0xFF;

  *x = i & byte_mask;
  *y = (i >> 8) & byte_mask;
  *z = (i >> 16) & byte_mask;
  *w = (i >> 24) & byte_mask;
}
/* Combine sd->dPdu and sd->dPdv, weighted by the `dx` members of sd->du and sd->dv
 * respectively, and return the sum.
 * NOTE(review): presumably the derivative of the shading position with respect to
 * screen-space x, obtained through the chain rule over (u, v) - confirm against the
 * ShaderData definition. */
ccl_device_forceinline float3 dPdx(const ccl_private ShaderData *sd)
{
return sd->dPdu * sd->du.dx + sd->dPdv * sd->dv.dx;
}
/* Combine sd->dPdu and sd->dPdv, weighted by the `dy` members of sd->du and sd->dv
 * respectively, and return the sum.
 * NOTE(review): presumably the derivative of the shading position with respect to
 * screen-space y, obtained through the chain rule over (u, v) - confirm against the
 * ShaderData definition. */
ccl_device_forceinline float3 dPdy(const ccl_private ShaderData *sd)
{
return sd->dPdu * sd->du.dy + sd->dPdv * sd->dv.dy;
}
CCL_NAMESPACE_END