/* SPDX-FileCopyrightText: 2005 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

/** \file
 * \ingroup gpu
 *
 * Convert material node-trees to GLSL.
 */
#include "MEM_guardedalloc.h"
#include "DNA_material_types.h"
#include "BLI_span.hh"
#include "BLI_string.h"
#include "BLI_vector.hh"
#include "BKE_cryptomatte.hh"
#include "IMB_colormanagement.hh"
#include "GPU_capabilities.hh"
#include "GPU_shader.hh"
#include "GPU_uniform_buffer.hh"
#include "GPU_vertex_format.hh"
#include "gpu_codegen.hh"
#include "gpu_material_library.hh"
#include "gpu_shader_dependency_private.hh"
#include <cstdarg>
#include <cstring>
using namespace blender;
using namespace blender::gpu::shader;

/* -------------------------------------------------------------------- */
/** \name Type > string conversion
 * \{ */

#if 0
#  define SRC_NAME(io, link, list, type) \
    link->node->name << "_" << io << BLI_findindex(&link->node->list, (const void *)link) << "_" \
                     << type
#else
#  define SRC_NAME(io, list, link, type) type
#endif
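
/* Print the GLSL identifier an input resolves to in the generated code, depending on where the
 * data comes from (temporary variable, constant, uniform, attribute, sampler, ...). */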
static std::ostream &operator<<(std::ostream &stream, const GPUInput *input)
{
switch (input->source) {
case GPU_SOURCE_FUNCTION_CALL:
case GPU_SOURCE_OUTPUT:
return stream << SRC_NAME("in", input, inputs, "tmp") << input->id;
case GPU_SOURCE_CONSTANT:
return stream << SRC_NAME("in", input, inputs, "cons") << input->id;
case GPU_SOURCE_UNIFORM:
return stream << "node_tree.u" << input->id;
case GPU_SOURCE_ATTR:
return stream << "var_attrs.v" << input->attr->id;
case GPU_SOURCE_UNIFORM_ATTR:
return stream << "UNI_ATTR(unf_attrs[resource_id].attr" << input->uniform_attr->id << ")";
case GPU_SOURCE_LAYER_ATTR:
return stream << "attr_load_layer(" << input->layer_attr->hash_code << ")";
case GPU_SOURCE_STRUCT:
return stream << "strct" << input->id;
case GPU_SOURCE_TEX:
return stream << input->texture->sampler_name;
case GPU_SOURCE_TEX_TILED_MAPPING:
return stream << input->texture->tiled_mapping_name;
default:
BLI_assert(0);
return stream;
}
}
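
/* Print the name of the temporary variable that stores a node output. */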
static std::ostream &operator<<(std::ostream &stream, const GPUOutput *output)
{
return stream << SRC_NAME("out", output, outputs, "tmp") << output->id;
}

/* Print data constructor (i.e: vec2(1.0f, 1.0f)). */
static std::ostream &operator<<(std::ostream &stream, const Span<float> &span)
{
  stream << (GPUType)span.size() << "(";
  /* Use uint representation to allow exact same bit pattern even if NaN. This is
   * because we can pass UINTs as floats for constants. */
  const Span<uint32_t> uint_span = span.cast<uint32_t>();
  for (const uint32_t &element : uint_span) {
    char formatted_float[32];
    SNPRINTF(formatted_float, "uintBitsToFloat(%uu)", element);
    stream << formatted_float;
    if (&element != &uint_span.last()) {
      stream << ", ";
    }
  }
  stream << ")";
  return stream;
}
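
/* For example, the constructor printer above emits
 * `vec2(uintBitsToFloat(1065353216u), uintBitsToFloat(1056964608u))` for the float pair
 * {1.0f, 0.5f}: the exact bit patterns (0x3F800000 and 0x3F000000) survive the round-trip
 * through the shader source text. */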

/* Trick type to change overload and keep a somewhat nice syntax. */
struct GPUConstant : public GPUInput {};

static std::ostream &operator<<(std::ostream &stream, const GPUConstant *input)
{
  stream << Span<float>(input->vec, input->type);
  return stream;
}

namespace blender::gpu::shader {
/* Needed to use the << operators from nested namespaces. :(
 * https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution */
using ::operator<<;
}  // namespace blender::gpu::shader

/** \} */

/* -------------------------------------------------------------------- */
/** \name GLSL code generation
 * \{ */
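
/* Store a copy of the sampler name in a heap allocation with a stable address, so the
 * create-info entries can keep referencing it. */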
const char *GPUCodegenCreateInfo::NameBuffer::append_sampler_name(const char name[32])
{
  auto index = sampler_names.size();
  sampler_names.append(std::make_unique<NameEntry>());
  char *name_buffer = sampler_names[index]->data();
  memcpy(name_buffer, name, 32);
  return name_buffer;
}
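
/* Seed the material hash with the UUID and flags, and allocate the create-info that the
 * serialized sources get attached to. */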
GPUCodegen::GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_, const char *debug_name)
    : mat(*mat_), graph(*graph_)
{
  BLI_hash_mm2a_init(&hm2a_, GPU_material_uuid_get(&mat));
  BLI_hash_mm2a_add_int(&hm2a_, GPU_material_flag(&mat));
  create_info = MEM_new<GPUCodegenCreateInfo>(__func__, debug_name);
  output.create_info = reinterpret_cast<GPUShaderCreateInfo *>(
      static_cast<ShaderCreateInfo *>(create_info));
}

GPUCodegen::~GPUCodegen()
{
  MEM_SAFE_FREE(cryptomatte_input_);
  MEM_delete(create_info);
  BLI_freelistN(&ubo_inputs_);
}
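
/* Heuristic deciding whether the graph is complex enough to be worth compiling an additional
 * optimized shader variant. */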
bool GPUCodegen::should_optimize_heuristic() const
{
  /* Optimize if any of the maximum complexity thresholds is exceeded, but only when the
   * baseline counts are also met. */
  bool do_optimize = (nodes_total_ >= 60 || textures_total_ >= 4 || uniforms_total_ >= 64) &&
                     (textures_total_ >= 1 && uniforms_total_ >= 8 && nodes_total_ >= 4);
  return do_optimize;
}
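
/* Declare a vertex input and a matching stage interface member for every attribute in the
 * graph, and build the GLSL snippet that loads them into the interface. */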
void GPUCodegen::generate_attribs()
{
  if (BLI_listbase_is_empty(&graph.attributes)) {
    output.attr_load.clear();
    return;
  }

  GPUCodegenCreateInfo &info = *create_info;

  info.interface_generated = MEM_new<StageInterfaceInfo>(__func__, "codegen_iface", "var_attrs");
  StageInterfaceInfo &iface = *info.interface_generated;
  info.vertex_out(iface);

  /* Input declaration, loading / assignment to interface and geometry shader passthrough. */
  std::stringstream load_ss;

  /* Index of the attribute as ordered in graph.attributes. */
  int attr_n = 0;
  int slot = 15;
  LISTBASE_FOREACH (GPUMaterialAttribute *, attr, &graph.attributes) {
    if (slot == -1) {
      BLI_assert_msg(0, "Too many attributes");
      break;
    }
    STRNCPY(info.name_buffer.attr_names[slot], attr->input_name);
    SNPRINTF(info.name_buffer.var_names[slot], "v%d", attr->id);

    StringRefNull attr_name = info.name_buffer.attr_names[slot];
    StringRefNull var_name = info.name_buffer.var_names[slot];

    GPUType input_type, iface_type;

    load_ss << "var_attrs." << var_name;
    if (attr->is_hair_length || attr->is_hair_intercept) {
      iface_type = input_type = GPU_FLOAT;
      load_ss << " = attr_load_" << input_type << "(domain, " << attr_name << ", " << attr_n
              << ");\n";
    }
    else {
      switch (attr->type) {
        case CD_ORCO:
          /* Need vec4 to detect usage of default attribute. */
          input_type = GPU_VEC4;
          iface_type = GPU_VEC3;
          load_ss << " = attr_load_orco(domain, " << attr_name << ", " << attr_n << ");\n";
          break;
        case CD_TANGENT:
          iface_type = input_type = GPU_VEC4;
          load_ss << " = attr_load_tangent(domain, " << attr_name << ", " << attr_n << ");\n";
          break;
        default:
          iface_type = input_type = GPU_VEC4;
          load_ss << " = attr_load_" << input_type << "(domain, " << attr_name << ", " << attr_n
                  << ");\n";
          break;
      }
    }

    attr_n++;
    info.vertex_in(slot--, to_type(input_type), attr_name);
    iface.smooth(to_type(iface_type), var_name);
  }

  output.attr_load = load_ss.str();
}
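
/* Declare the samplers and build the uniform buffer structs (node-tree uniforms and uniform
 * attributes) used by the generated shader. */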
void GPUCodegen::generate_resources()
{
  GPUCodegenCreateInfo &info = *create_info;
  std::stringstream ss;

  /* Textures. */
  int slot = 0;
  LISTBASE_FOREACH (GPUMaterialTexture *, tex, &graph.textures) {
    if (tex->colorband) {
      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
      info.sampler(slot++, ImageType::Float1DArray, name, Frequency::BATCH);
    }
    else if (tex->sky) {
      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
      info.sampler(0, ImageType::Float2DArray, name, Frequency::BATCH);
    }
    else if (tex->tiled_mapping_name[0] != '\0') {
      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
      info.sampler(slot++, ImageType::Float2DArray, name, Frequency::BATCH);

      const char *name_mapping = info.name_buffer.append_sampler_name(tex->tiled_mapping_name);
      info.sampler(slot++, ImageType::Float1DArray, name_mapping, Frequency::BATCH);
    }
    else {
      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
      info.sampler(slot++, ImageType::Float2D, name, Frequency::BATCH);
    }
  }

  /* Increment heuristic. */
  textures_total_ = slot;

  if (!BLI_listbase_is_empty(&ubo_inputs_)) {
    /* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */
    ss << "struct NodeTree {\n";
    LISTBASE_FOREACH (LinkData *, link, &ubo_inputs_) {
      GPUInput *input = (GPUInput *)(link->data);
      if (input->source == GPU_SOURCE_CRYPTOMATTE) {
        ss << input->type << " crypto_hash;\n";
      }
      else {
        ss << input->type << " u" << input->id << ";\n";
      }
    }
    ss << "};\n\n";

    info.uniform_buf(GPU_NODE_TREE_UBO_SLOT, "NodeTree", GPU_UBO_BLOCK_NAME, Frequency::BATCH);
  }

  if (!BLI_listbase_is_empty(&graph.uniform_attrs.list)) {
    ss << "struct UniformAttrs {\n";
    LISTBASE_FOREACH (GPUUniformAttr *, attr, &graph.uniform_attrs.list) {
      ss << "vec4 attr" << attr->id << ";\n";
    }
    ss << "};\n\n";

    /* TODO(fclem): Use the macro for length. Currently not working for EEVEE. */
    /* DRW_RESOURCE_CHUNK_LEN = 512 */
    info.uniform_buf(2, "UniformAttrs", GPU_ATTRIBUTE_UBO_BLOCK_NAME "[512]", Frequency::BATCH);
  }

  if (!BLI_listbase_is_empty(&graph.layer_attrs)) {
    info.additional_info("draw_layer_attributes");
  }

  info.typedef_source_generated = ss.str();
}
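
/* Serialize a single node as GLSL: declare its constants and output variables, then emit the
 * call to its library function. For instance, a hypothetical two-input node named `node_mix`
 * with a constant and an attribute input could serialize to something like:
 *
 *   float cons1 = float(uintBitsToFloat(1056964608u));
 *   vec4 tmp2;
 *   node_mix(cons1, var_attrs.v3, tmp2);
 */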
void GPUCodegen::node_serialize(Set<StringRefNull> &used_libraries,
                                std::stringstream &eval_ss,
                                const GPUNode *node)
{
  gpu_material_library_use_function(used_libraries, node->name);

  /* Declare constants. */
  LISTBASE_FOREACH (GPUInput *, input, &node->inputs) {
    switch (input->source) {
      case GPU_SOURCE_FUNCTION_CALL:
        eval_ss << input->type << " " << input << "; " << input->function_call << input << ");\n";
        break;
      case GPU_SOURCE_STRUCT:
        eval_ss << input->type << " " << input << " = CLOSURE_DEFAULT;\n";
        break;
      case GPU_SOURCE_CONSTANT:
        eval_ss << input->type << " " << input << " = " << (GPUConstant *)input << ";\n";
        break;
      default:
        break;
    }
  }

  /* Declare temporary variables for node output storage. */
  LISTBASE_FOREACH (GPUOutput *, output, &node->outputs) {
    eval_ss << output->type << " " << output << ";\n";
  }

  /* Function call. */
  eval_ss << node->name << "(";

  /* Input arguments. */
  LISTBASE_FOREACH (GPUInput *, input, &node->inputs) {
    switch (input->source) {
      case GPU_SOURCE_OUTPUT:
      case GPU_SOURCE_ATTR: {
        /* These inputs can have non matching types. Do conversion. */
        GPUType to = input->type;
        GPUType from = (input->source == GPU_SOURCE_ATTR) ? input->attr->gputype :
                                                            input->link->output->type;
        if (from != to) {
          /* Use defines declared inside codegen_lib (i.e: vec4_from_float). */
          eval_ss << to << "_from_" << from << "(";
        }

        if (input->source == GPU_SOURCE_ATTR) {
          eval_ss << input;
        }
        else {
          eval_ss << input->link->output;
        }

        if (from != to) {
          /* Special case that needs luminance coefficients as argument. */
          if (from == GPU_VEC4 && to == GPU_FLOAT) {
            float coefficients[3];
            IMB_colormanagement_get_luminance_coefficients(coefficients);
            eval_ss << ", " << Span<float>(coefficients, 3);
          }
          eval_ss << ")";
        }
        break;
      }
      default:
        eval_ss << input;
        break;
    }
    eval_ss << ", ";
  }

  /* Output arguments. */
  LISTBASE_FOREACH (GPUOutput *, output, &node->outputs) {
    eval_ss << output;
    if (output->next) {
      eval_ss << ", ";
    }
  }

  eval_ss << ");\n\n";

  /* Increment heuristic. */
  nodes_total_++;
}

static Vector<StringRefNull> set_to_vector_stable(Set<StringRefNull> &set)
{
  Vector<StringRefNull> source_files;
  for (const StringRefNull &str : set) {
    source_files.append(str);
  }
  /* Sort dependencies to avoid random order causing shader caching to fail (see #108289). */
  std::sort(source_files.begin(), source_files.end());
  return source_files;
}
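
/* Serialize every node tagged with `tree_tag` into a GLSL function body that returns
 * `output_link` (or `output_default` when the graph has no explicit output), recording which
 * library files the body depends on. */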
GPUGraphOutput GPUCodegen::graph_serialize(GPUNodeTag tree_tag,
                                           GPUNodeLink *output_link,
                                           const char *output_default)
{
  if (output_link == nullptr && output_default == nullptr) {
    return {};
  }

  Set<StringRefNull> used_libraries;
  std::stringstream eval_ss;
  bool has_nodes = false;
  /* NOTE: The node order is already top to bottom (or left to right in node editor)
   * because of the evaluation order inside ntreeExecGPUNodes(). */
  LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) {
    if ((node->tag & tree_tag) == 0) {
      continue;
    }
    node_serialize(used_libraries, eval_ss, node);
    has_nodes = true;
  }

  if (!has_nodes) {
    return {};
  }

  if (output_link) {
    eval_ss << "return " << output_link->output << ";\n";
  }
  else {
    /* Default output in case there are only AOVs. */
    eval_ss << "return " << output_default << ";\n";
  }

  std::string str = eval_ss.str();
  BLI_hash_mm2a_add(&hm2a_, reinterpret_cast<const uchar *>(str.c_str()), str.size());
  return {str, set_to_vector_stable(used_libraries)};
}
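
/* Variant that serializes every tagged node without emitting a return statement. */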
GPUGraphOutput GPUCodegen::graph_serialize(GPUNodeTag tree_tag)
{
  std::stringstream eval_ss;
  Set<StringRefNull> used_libraries;
  LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) {
    if (node->tag & tree_tag) {
      node_serialize(used_libraries, eval_ss, node);
    }
  }
  std::string str = eval_ss.str();
  BLI_hash_mm2a_add(&hm2a_, reinterpret_cast<const uchar *>(str.c_str()), str.size());
  return {str, set_to_vector_stable(used_libraries)};
}
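
/* Add a float input carrying the cryptomatte hash of the material name to the node-tree UBO
 * inputs. */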
void GPUCodegen::generate_cryptomatte()
{
  cryptomatte_input_ = MEM_callocN<GPUInput>(__func__);
  cryptomatte_input_->type = GPU_FLOAT;
  cryptomatte_input_->source = GPU_SOURCE_CRYPTOMATTE;

  float material_hash = 0.0f;
  Material *material = GPU_material_get_material(&mat);
  if (material) {
    bke::cryptomatte::CryptomatteHash hash(material->id.name + 2,
                                           BLI_strnlen(material->id.name + 2, MAX_NAME - 2));
    material_hash = hash.float_encoded();
  }
  cryptomatte_input_->vec[0] = material_hash;

  BLI_addtail(&ubo_inputs_, BLI_genericNodeN(cryptomatte_input_));
}
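
/* Gather all unlinked uniform inputs into `ubo_inputs_` and let the material create the
 * node-tree uniform buffer from them. */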
void GPUCodegen::generate_uniform_buffer()
{
  /* Extract uniform inputs. */
  LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) {
    LISTBASE_FOREACH (GPUInput *, input, &node->inputs) {
      if (input->source == GPU_SOURCE_UNIFORM && !input->link) {
        /* We handle the UBO uniforms separately. */
        BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input));
        uniforms_total_++;
      }
    }
  }

  if (!BLI_listbase_is_empty(&ubo_inputs_)) {
    /* This sorts the inputs based on size. */
    GPU_material_uniform_buffer_create(&mat, &ubo_inputs_);
  }
}

/* Sets id for unique names for all inputs, resources and temp variables. */
void GPUCodegen::set_unique_ids()
{
  int id = 1;
  LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) {
    LISTBASE_FOREACH (GPUInput *, input, &node->inputs) {
      input->id = id++;
    }
    LISTBASE_FOREACH (GPUOutput *, output, &node->outputs) {
      output->id = id++;
    }
  }
}
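
/* Serialize each output graph (surface, volume, displacement, thickness, compositor and
 * material functions) separately, so that each stage only pulls in the library includes it
 * actually needs, then finalize the material hash. */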
void GPUCodegen::generate_graphs()
{
  set_unique_ids();

  output.surface = graph_serialize(
      GPU_NODE_TAG_SURFACE | GPU_NODE_TAG_AOV, graph.outlink_surface, "CLOSURE_DEFAULT");
  output.volume = graph_serialize(GPU_NODE_TAG_VOLUME, graph.outlink_volume, "CLOSURE_DEFAULT");
  output.displacement = graph_serialize(
      GPU_NODE_TAG_DISPLACEMENT, graph.outlink_displacement, nullptr);
  output.thickness = graph_serialize(GPU_NODE_TAG_THICKNESS, graph.outlink_thickness, nullptr);
  if (!BLI_listbase_is_empty(&graph.outlink_compositor)) {
    output.composite = graph_serialize(GPU_NODE_TAG_COMPOSITOR);
  }

  if (!BLI_listbase_is_empty(&graph.material_functions)) {
    LISTBASE_FOREACH (GPUNodeGraphFunctionLink *, func_link, &graph.material_functions) {
      std::stringstream eval_ss;
      /* Untag every node in the graph to avoid serializing nodes from other functions. */
      LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) {
        node->tag &= ~GPU_NODE_TAG_FUNCTION;
      }
      /* Tag only the nodes needed for the current function. */
      gpu_nodes_tag(func_link->outlink, GPU_NODE_TAG_FUNCTION);
      GPUGraphOutput graph = graph_serialize(GPU_NODE_TAG_FUNCTION, func_link->outlink);
      eval_ss << "float " << func_link->name << "() {\n" << graph.serialized << "}\n\n";
      output.material_functions.append({eval_ss.str(), graph.dependencies});
    }
    /* Leave the function tags as they were before serialization. */
    LISTBASE_FOREACH (GPUNodeGraphFunctionLink *, funclink, &graph.material_functions) {
      gpu_nodes_tag(funclink->outlink, GPU_NODE_TAG_FUNCTION);
    }
  }

  LISTBASE_FOREACH (GPUMaterialAttribute *, attr, &graph.attributes) {
    BLI_hash_mm2a_add(&hm2a_, (uchar *)attr->name, strlen(attr->name));
  }

  hash_ = BLI_hash_mm2a_end(&hm2a_);
}

/** \} */