The Principled BSDF has a ton of inputs, and the previous SVM code just always allocated stack space for all of them. This results in a ton of additional NODE_VALUE_x SVM nodes, which slow down execution. However, this is not really needed for two reasons: - First, many inputs are only used conditionally. For example, if the subsurface weight is zero, none of the other subsurface inputs are used. - Second, many of the inputs have a "usual" value that they will have in most materials, so if they happen to have that value we can just indicate that by not allocating space for them. This is a bit similar to the standard "pack the fixed value and provide a stack offset if there's a link" pattern, except that the fixed value is a constant in the code and we allocate a NODE_VALUE_x if a different fixed value is used. Therefore, this PR re-implements the parameter packing in a more efficient way: - If we can determine that a component is disabled, all conditional inputs are disconnected (to avoid generating upstream nodes). - If we can determine that a component is disabled, we skip allocating all conditional inputs on the stack. - The inputs for which a reasonable "usual" value exists are changed to respect that, and to only be allocated if they differ. - param1 and param2 (which are fixed-value-packed as on all BSDF nodes) are used to store IOR and roughness, which have a decent chance to be fixed values. - The parameter packing is more aggressive about using uchar4, which makes it possible to get rid of two SVM nodes while still storing the same inputs. The result is a considerable speedup in scenes that make heavy use of the Principled BSDF: | Scene | CPU speedup | OptiX speedup | | --- | --- | --- | | attic | 5% | 9% | | bistro | 5% | 8% | | junkshop | 5% | 10% | | monster | 3% | 4% | | spring | 1% | 6% | Pull Request: https://projects.blender.org/blender/blender/pulls/143910
161 lines
4.7 KiB
C
161 lines
4.7 KiB
C
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#pragma once
|
|
|
|
#include "kernel/globals.h"
|
|
#include "kernel/types.h"
|
|
|
|
#include "kernel/svm/types.h"
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Stack */
|
|
|
|
/* Read three consecutive stack slots, starting at offset `a`, and return them as a float3.
 * `kernel_assert` checks that the last slot read (`a + 2`) lies below SVM_STACK_SIZE. */
ccl_device_inline float3 stack_load_float3(const ccl_private float *stack, const uint a)
{
  kernel_assert(a + 2 < SVM_STACK_SIZE);

  const ccl_private float *base = stack + a;
  const float x = base[0];
  const float y = base[1];
  const float z = base[2];
  return make_float3(x, y, z);
}
|
|
|
|
/* Load a float3 from the stack at offset `a`, or return `value` unchanged when `a` is the
 * SVM_STACK_INVALID sentinel (in which case the stack is not accessed at all). */
ccl_device_inline float3 stack_load_float3_default(const ccl_private float *stack,
                                                   const uint a,
                                                   const float3 value)
{
  if (a == (uint)SVM_STACK_INVALID) {
    return value;
  }
  return stack_load_float3(stack, a);
}
|
|
|
|
/* Write the float3 `f` to the stack starting at offset `a`.
 * `kernel_assert` checks that `a + 2` lies below SVM_STACK_SIZE; the actual write is
 * delegated to `copy_v3_v3`, which receives a pointer to the first destination slot. */
ccl_device_inline void stack_store_float3(ccl_private float *stack, const uint a, const float3 f)
{
  kernel_assert(a + 2 < SVM_STACK_SIZE);

  ccl_private float *dst = stack + a;
  copy_v3_v3(dst, f);
}
|
|
|
|
/* Return the float stored in stack slot `a`.
 * `kernel_assert` checks that `a` lies below SVM_STACK_SIZE. */
ccl_device_inline float stack_load_float(const ccl_private float *stack, const uint a)
{
  kernel_assert(a < SVM_STACK_SIZE);

  const float result = stack[a];
  return result;
}
|
|
|
|
/* Load a float from stack slot `a`, falling back to a default when `a` is the
 * SVM_STACK_INVALID sentinel.
 *
 * In this overload the default arrives as a `uint` and is turned into a float with
 * `__uint_as_float` rather than by a numeric conversion. */
ccl_device_inline float stack_load_float_default(const ccl_private float *stack,
                                                 const uint a,
                                                 const uint value)
{
  if (a == (uint)SVM_STACK_INVALID) {
    return __uint_as_float(value);
  }
  return stack_load_float(stack, a);
}
|
|
|
|
/* Load a float from stack slot `a`, or return the float `value` as-is when `a` is the
 * SVM_STACK_INVALID sentinel. */
ccl_device_inline float stack_load_float_default(const ccl_private float *stack,
                                                 const uint a,
                                                 const float value)
{
  if (a == (uint)SVM_STACK_INVALID) {
    return value;
  }
  return stack_load_float(stack, a);
}
|
|
|
|
/* Store the float `f` into stack slot `a`.
 * `kernel_assert` checks that `a` lies below SVM_STACK_SIZE. */
ccl_device_inline void stack_store_float(ccl_private float *stack, const uint a, const float f)
{
  kernel_assert(a < SVM_STACK_SIZE);

  ccl_private float *slot = stack + a;
  *slot = f;
}
|
|
|
|
/* Read stack slot `a` and return it as an int, obtained by passing the stored float
 * through `__float_as_int`.
 * `kernel_assert` checks that `a` lies below SVM_STACK_SIZE. */
ccl_device_inline int stack_load_int(const ccl_private float *stack, const uint a)
{
  kernel_assert(a < SVM_STACK_SIZE);

  const float stored = stack[a];
  return __float_as_int(stored);
}
|
|
|
|
/* Load an int from stack slot `a`, or fall back to `value` when `a` is the
 * SVM_STACK_INVALID sentinel.
 *
 * The fallback is produced with a plain `(int)` cast of `value`; the stack is not accessed
 * in that case. Otherwise the slot is read through `stack_load_int`.
 *
 * The stack is only read here, so it is taken as a pointer to const, matching
 * `stack_load_int` and the other `*_default` loaders. Callers passing a non-const stack
 * are unaffected. */
ccl_device_inline int stack_load_int_default(const ccl_private float *stack,
                                             const uint a,
                                             const uint value)
{
  return (a == (uint)SVM_STACK_INVALID) ? (int)value : stack_load_int(stack, a);
}
|
|
|
|
/* Store the int `i` into stack slot `a`, after passing it through `__int_as_float` so it
 * fits the float-typed stack.
 * `kernel_assert` checks that `a` lies below SVM_STACK_SIZE. */
ccl_device_inline void stack_store_int(ccl_private float *stack, const uint a, const int i)
{
  kernel_assert(a < SVM_STACK_SIZE);

  const float encoded = __int_as_float(i);
  stack[a] = encoded;
}
|
|
|
|
/* Tell whether the stack offset `a` differs from the SVM_STACK_INVALID sentinel. */
ccl_device_inline bool stack_valid(const uint a)
{
  const uint invalid_offset = (uint)SVM_STACK_INVALID;
  return !(a == invalid_offset);
}
|
|
|
|
/* Reading Nodes */
|
|
|
|
/* Fetch the `svm_nodes` entry at `*offset` and advance `*offset` by one, so that
 * consecutive calls walk through consecutive entries. */
ccl_device_inline uint4 read_node(KernelGlobals kg, ccl_private int *const offset)
{
  const int index = *offset;
  const uint4 node = kernel_data_fetch(svm_nodes, index);
  *offset = index + 1;
  return node;
}
|
|
|
|
/* Fetch the `svm_nodes` entry at `*offset`, convert each of its four components with
 * `__uint_as_float`, and advance `*offset` by one. Returns the converted float4. */
ccl_device_inline float4 read_node_float(KernelGlobals kg, ccl_private int *const offset)
{
  const int index = *offset;
  const uint4 raw = kernel_data_fetch(svm_nodes, index);

  const float x = __uint_as_float(raw.x);
  const float y = __uint_as_float(raw.y);
  const float z = __uint_as_float(raw.z);
  const float w = __uint_as_float(raw.w);
  const float4 result = make_float4(x, y, z, w);

  *offset = index + 1;
  return result;
}
|
|
|
|
/* Fetch the `svm_nodes` entry at `offset` and return its four components converted with
 * `__uint_as_float`. Unlike `read_node_float`, the offset is passed by value and is
 * therefore not advanced. */
ccl_device_inline float4 fetch_node_float(KernelGlobals kg, const int offset)
{
  const uint4 raw = kernel_data_fetch(svm_nodes, offset);

  const float x = __uint_as_float(raw.x);
  const float y = __uint_as_float(raw.y);
  const float z = __uint_as_float(raw.z);
  const float w = __uint_as_float(raw.w);
  return make_float4(x, y, z, w);
}
|
|
|
|
/* Split the two lowest bytes of `i` into separate values:
 * `*x` receives bits 0-7 and `*y` receives bits 8-15. Higher bits are ignored. */
ccl_device_forceinline void svm_unpack_node_uchar2(const uint i,
                                                   ccl_private uint *x,
                                                   ccl_private uint *y)
{
  const uint byte_mask = 0xFF;

  *x = i & byte_mask;
  *y = (i >> 8) & byte_mask;
}
|
|
|
|
/* Split the three lowest bytes of `i` into separate values:
 * `*x` receives bits 0-7, `*y` bits 8-15 and `*z` bits 16-23. The top byte is ignored. */
ccl_device_forceinline void svm_unpack_node_uchar3(const uint i,
                                                   ccl_private uint *x,
                                                   ccl_private uint *y,
                                                   ccl_private uint *z)
{
  const uint byte_mask = 0xFF;

  *x = i & byte_mask;
  *y = (i >> 8) & byte_mask;
  *z = (i >> 16) & byte_mask;
}
|
|
|
|
/* Split all four bytes of `i` into separate values:
 * `*x` receives bits 0-7, `*y` bits 8-15, `*z` bits 16-23 and `*w` bits 24-31. */
ccl_device_forceinline void svm_unpack_node_uchar4(const uint i,
                                                   ccl_private uint *x,
                                                   ccl_private uint *y,
                                                   ccl_private uint *z,
                                                   ccl_private uint *w)
{
  const uint byte_mask = 0xFF;

  *x = i & byte_mask;
  *y = (i >> 8) & byte_mask;
  *z = (i >> 16) & byte_mask;
  *w = (i >> 24) & byte_mask;
}
|
|
|
|
/* Return `sd->dPdu * sd->du.dx + sd->dPdv * sd->dv.dx`, i.e. `dPdu` and `dPdv` weighted by
 * the `dx` members of `du` and `dv` and summed.
 * NOTE(review): presumably the derivative of the shading position along screen-space X,
 * obtained via the chain rule over (u, v) - confirm against the ShaderData definition. */
ccl_device_forceinline float3 dPdx(const ccl_private ShaderData *sd)
{
  const float3 u_term = sd->dPdu * sd->du.dx;
  const float3 v_term = sd->dPdv * sd->dv.dx;
  return u_term + v_term;
}
|
|
|
|
/* Return `sd->dPdu * sd->du.dy + sd->dPdv * sd->dv.dy`, i.e. `dPdu` and `dPdv` weighted by
 * the `dy` members of `du` and `dv` and summed.
 * NOTE(review): presumably the derivative of the shading position along screen-space Y,
 * obtained via the chain rule over (u, v) - confirm against the ShaderData definition. */
ccl_device_forceinline float3 dPdy(const ccl_private ShaderData *sd)
{
  const float3 u_term = sd->dPdu * sd->du.dy;
  const float3 v_term = sd->dPdv * sd->dv.dy;
  return u_term + v_term;
}
|
|
|
|
CCL_NAMESPACE_END
|