Files
test/source/blender/gpu/intern/gpu_pass.cc
Clément Foucault 1c47e31367 GPU: Enable GL multithreaded compilation by default
This allows reducing the waiting time caused by
shader compilation on some GPU-driver combos.

A new setting in the User Preferences makes it
possible to override the default number of worker
threads and optionally use subprocesses.

We still use only one worker thread in cases where
there is no benefit in adding more workers
(like the AMD Pro driver and Intel on Windows).

It doesn't scale as well as subprocesses for material
shader compilation, but that is for other reasons,
explained in #139818.

Add some heuristics to avoid too much memory usage
and/or too many stalls.

Also add a heuristic for the default number of subprocesses
on the platforms that show scaling.

Historically, multithreaded compilation was prevented by the
need for a context per thread inside the `DRWShader` module.
There was also no good scaling at that time. But
nowadays the numbers show different results, with
good scaling for a reasonable number of threads on many
platforms.

Even if we are going for Vulkan in the next release,
most of the legacy hardware will still use OpenGL for
a few more releases, so it is relevant to make this
easy improvement.

See pull request for measurements.

Pull Request: https://projects.blender.org/blender/blender/pulls/139821
2025-06-09 12:36:06 +02:00


/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup gpu
*
* Convert material node-trees to GLSL.
*/
#include "MEM_guardedalloc.h"
#include "BLI_map.hh"
#include "BLI_span.hh"
#include "BLI_time.h"
#include "BLI_vector.hh"
#include "GPU_capabilities.hh"
#include "GPU_context.hh"
#include "GPU_pass.hh"
#include "GPU_vertex_format.hh"
#include "gpu_codegen.hh"
#include <mutex>
#include <string>
using namespace blender;
using namespace blender::gpu::shader;
static bool gpu_pass_validate(GPUCodegenCreateInfo *create_info);
/* -------------------------------------------------------------------- */
/** \name GPUPass
* \{ */
struct GPUPass {
static inline std::atomic<uint64_t> compilation_counts = 0;
GPUCodegenCreateInfo *create_info = nullptr;
BatchHandle compilation_handle = 0;
std::atomic<GPUShader *> shader = nullptr;
std::atomic<eGPUPassStatus> status = GPU_PASS_QUEUED;
/* Orphaned GPUPasses get freed by the garbage collector. */
std::atomic<int> refcount = 1;
/* The last time the refcount was greater than 0. */
double gc_timestamp = 0.0f;
uint64_t compilation_timestamp = 0;
/** Hint that an optimized variant of this pass should be created.
* Based on a complexity heuristic from pass code generation. */
bool should_optimize = false;
bool is_optimization_pass = false;
GPUPass(GPUCodegenCreateInfo *info,
bool deferred_compilation,
bool is_optimization_pass,
bool should_optimize)
: create_info(info),
should_optimize(should_optimize),
is_optimization_pass(is_optimization_pass)
{
BLI_assert(!is_optimization_pass || !should_optimize);
if (is_optimization_pass && deferred_compilation) {
/* Defer until all non-optimization passes are compiled. */
return;
}
GPUShaderCreateInfo *base_info = reinterpret_cast<GPUShaderCreateInfo *>(create_info);
if (deferred_compilation) {
compilation_handle = GPU_shader_batch_create_from_infos(
Span<GPUShaderCreateInfo *>(&base_info, 1), compilation_priority());
}
else {
shader = GPU_shader_create_from_info(base_info);
finalize_compilation();
}
}
~GPUPass()
{
if (compilation_handle) {
GPU_shader_batch_cancel(compilation_handle);
}
else {
BLI_assert(create_info == nullptr || (is_optimization_pass && status == GPU_PASS_QUEUED));
}
MEM_delete(create_info);
GPU_SHADER_FREE_SAFE(shader);
}
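/* Optimization passes get a lower priority: they only replace an already working shader. */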
CompilationPriority compilation_priority()
{
return is_optimization_pass ? CompilationPriority::Low : CompilationPriority::Medium;
}
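/* Fetch the result of a deferred compilation, record the global compilation order,
* report sampler-count overflow on failure, and free the now unneeded create-info. */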
void finalize_compilation()
{
BLI_assert_msg(create_info, "GPUPass::finalize_compilation() called more than once.");
if (compilation_handle) {
shader = GPU_shader_batch_finalize(compilation_handle).first();
}
compilation_timestamp = ++compilation_counts;
if (!shader && !gpu_pass_validate(create_info)) {
fprintf(stderr, "GPUShader: error: too many samplers in shader.\n");
}
status = shader ? GPU_PASS_SUCCESS : GPU_PASS_FAILED;
MEM_delete(create_info);
create_info = nullptr;
}
void update(double timestamp)
{
update_compilation();
update_gc_timestamp(timestamp);
}
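/* Finalize the pass if its deferred compilation has finished. Deferred optimization passes
* are only queued for compilation here, once they are actually referenced. */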
void update_compilation()
{
if (compilation_handle) {
if (GPU_shader_batch_is_ready(compilation_handle)) {
finalize_compilation();
}
}
else if (status == GPU_PASS_QUEUED && refcount > 0) {
BLI_assert(is_optimization_pass);
GPUShaderCreateInfo *base_info = reinterpret_cast<GPUShaderCreateInfo *>(create_info);
compilation_handle = GPU_shader_batch_create_from_infos(
Span<GPUShaderCreateInfo *>(&base_info, 1), compilation_priority());
}
}
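/* Keep the GC timestamp up to date while the pass is still referenced (or not yet stamped). */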
void update_gc_timestamp(double timestamp)
{
if (refcount != 0 || gc_timestamp == 0.0f) {
gc_timestamp = timestamp;
}
}
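/* A pass can be collected once it has been unused for `gc_collect_rate` seconds and is not
* currently compiling. Failed passes are kept so they are not recompiled over and over. */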
bool should_gc(int gc_collect_rate, double timestamp)
{
BLI_assert(gc_timestamp != 0.0f);
return !compilation_handle && status != GPU_PASS_FAILED &&
(timestamp - gc_timestamp) >= gc_collect_rate;
}
};
eGPUPassStatus GPU_pass_status(GPUPass *pass)
{
return pass->status;
}
bool GPU_pass_should_optimize(GPUPass *pass)
{
/* Returns optimization heuristic prepared during
* initial codegen.
* NOTE: Only enabled on Metal, since it doesn't seem to yield any performance improvements for
* other backends. */
return (GPU_backend_get_type() == GPU_BACKEND_METAL) && pass->should_optimize;
#if 0
/* Returns optimization heuristic prepared during initial codegen. */
return pass->should_optimize;
#endif
}
GPUShader *GPU_pass_shader_get(GPUPass *pass)
{
return pass->shader;
}
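/* Reference counting: a pass is shared between materials that generate the same shader code.
* Once the refcount drops to zero, the pass becomes a candidate for garbage collection
* (see GPUPassCache). */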
void GPU_pass_acquire(GPUPass *pass)
{
int previous_refcount = pass->refcount++;
UNUSED_VARS_NDEBUG(previous_refcount);
BLI_assert(previous_refcount > 0);
}
void GPU_pass_release(GPUPass *pass)
{
int previous_refcount = pass->refcount--;
UNUSED_VARS_NDEBUG(previous_refcount);
BLI_assert(previous_refcount > 0);
}
uint64_t GPU_pass_global_compilation_count()
{
return GPUPass::compilation_counts;
}
uint64_t GPU_pass_compilation_timestamp(GPUPass *pass)
{
return pass->compilation_timestamp;
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name GPUPass Cache
*
* Internal shader cache: This prevents shader recompilation / stalls when
* using undo/redo, and also allows GPUPass reuse if the shader code is the
* same for two different materials. Unused GPUPasses are freed by garbage collection.
* \{ */
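/* Sketch of typical usage by an engine (illustrative only; `finalize_cb` and `thunk` are
* placeholders for the engine's codegen callback and its user data):
*
*   GPUPass *pass = GPU_generate_pass(
*       material, graph, "debug_name", engine, true, finalize_cb, thunk, false);
*   if (GPU_pass_status(pass) == GPU_PASS_SUCCESS) {
*     GPUShader *shader = GPU_pass_shader_get(pass);
*     ... bind the shader and draw ...
*   }
*   GPU_pass_release(pass);
*/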
class GPUPassCache {
/* Number of seconds with 0 users required before garbage collecting a pass. */
static constexpr float gc_collect_rate_ = 60.0f;
/* Number of seconds without base compilations required before starting to compile optimization
* passes. */
static constexpr float optimization_delay_ = 10.0f;
double last_base_compilation_timestamp_ = -1.0;
Map<uint32_t, std::unique_ptr<GPUPass>> passes_[GPU_MAT_ENGINE_MAX][2 /*is_optimization_pass*/];
std::mutex mutex_;
public:
void add(eGPUMaterialEngine engine,
GPUCodegen &codegen,
bool deferred_compilation,
bool is_optimization_pass)
{
std::lock_guard lock(mutex_);
passes_[engine][is_optimization_pass].add(
codegen.hash_get(),
std::make_unique<GPUPass>(codegen.create_info,
deferred_compilation,
is_optimization_pass,
codegen.should_optimize_heuristic()));
}
GPUPass *get(eGPUMaterialEngine engine,
size_t hash,
bool allow_deferred,
bool is_optimization_pass)
{
std::lock_guard lock(mutex_);
std::unique_ptr<GPUPass> *pass = passes_[engine][is_optimization_pass].lookup_ptr(hash);
if (!allow_deferred && pass && pass->get()->status == GPU_PASS_QUEUED) {
pass->get()->finalize_compilation();
}
return pass ? pass->get() : nullptr;
}
void update()
{
std::lock_guard lock(mutex_);
double timestamp = BLI_time_now_seconds();
bool base_passes_ready = true;
/* Base Passes. */
for (auto &engine_passes : passes_) {
for (std::unique_ptr<GPUPass> &pass : engine_passes[false].values()) {
pass->update(timestamp);
if (pass->status == GPU_PASS_QUEUED) {
base_passes_ready = false;
}
}
engine_passes[false].remove_if(
[&](auto item) { return item.value->should_gc(gc_collect_rate_, timestamp); });
}
/* Optimization Passes GC. */
for (auto &engine_passes : passes_) {
for (std::unique_ptr<GPUPass> &pass : engine_passes[true].values()) {
pass->update_gc_timestamp(timestamp);
}
engine_passes[true].remove_if(
/* TODO: Use lower rate for optimization passes? */
[&](auto item) { return item.value->should_gc(gc_collect_rate_, timestamp); });
}
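/* Only start compiling optimization passes once every base pass is compiled and the queue
* has been idle for `optimization_delay_` seconds, so they don't compete with interactive
* compilations. */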
if (!base_passes_ready) {
last_base_compilation_timestamp_ = timestamp;
return;
}
if ((timestamp - last_base_compilation_timestamp_) < optimization_delay_) {
return;
}
/* Optimization Passes Compilation. */
for (auto &engine_passes : passes_) {
for (std::unique_ptr<GPUPass> &pass : engine_passes[true].values()) {
pass->update_compilation();
}
}
}
std::mutex &get_mutex()
{
return mutex_;
}
};
static GPUPassCache *g_cache = nullptr;
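/* Force finalization of a pass that is still queued. The status is checked again under the
* cache mutex since another thread may have finalized the pass in the meantime. */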
void GPU_pass_ensure_its_ready(GPUPass *pass)
{
if (pass->status == GPU_PASS_QUEUED) {
std::lock_guard lock(g_cache->get_mutex());
if (pass->status == GPU_PASS_QUEUED) {
pass->finalize_compilation();
}
}
}
void GPU_pass_cache_init()
{
g_cache = MEM_new<GPUPassCache>(__func__);
}
void GPU_pass_cache_update()
{
g_cache->update();
}
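/* Block until every queued shader batch has finished compiling, then finalize the passes. */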
void GPU_pass_cache_wait_for_all()
{
GPU_shader_batch_wait_for_all();
g_cache->update();
}
void GPU_pass_cache_free()
{
MEM_SAFE_DELETE(g_cache);
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name Compilation
* \{ */
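/* Check that the generated shader stays within the sampler limits of the GPU. */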
static bool gpu_pass_validate(GPUCodegenCreateInfo *create_info)
{
int samplers_len = 0;
for (const ShaderCreateInfo::Resource &res : create_info->resources_get_all_()) {
if (res.bind_type == ShaderCreateInfo::Resource::BindType::SAMPLER) {
samplers_len++;
}
}
/* Validate against GPU limit. */
if ((samplers_len > GPU_max_textures_frag()) || (samplers_len > GPU_max_textures_vert())) {
return false;
}
return (samplers_len * 2 <= GPU_max_textures());
}
GPUPass *GPU_generate_pass(GPUMaterial *material,
GPUNodeGraph *graph,
const char *debug_name,
eGPUMaterialEngine engine,
bool deferred_compilation,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk,
bool optimize_graph)
{
gpu_node_graph_prune_unused(graph);
/* If Optimize flag is passed in, we are generating an optimized
* variant of the GPUMaterial's GPUPass. */
if (optimize_graph) {
gpu_node_graph_optimize(graph);
}
/* Extract attributes before compiling so the generated VBOs are ready to accept the future
* shader. */
gpu_node_graph_finalize_uniform_attrs(graph);
GPUCodegen codegen(material, graph, debug_name);
codegen.generate_graphs();
codegen.generate_cryptomatte();
GPUPass *pass = nullptr;
if (!optimize_graph) {
/* The optimized version of the shader should not re-generate a UBO.
* The UBO will not be used for this variant. */
codegen.generate_uniform_buffer();
}
/* Cache lookup: Reuse shaders already compiled. */
pass = g_cache->get(engine, codegen.hash_get(), deferred_compilation, optimize_graph);
if (pass) {
pass->refcount++;
return pass;
}
/* The shader is not compiled, continue generating the shader strings. */
codegen.generate_attribs();
codegen.generate_resources();
codegen.generate_library();
/* Make engine add its own code and implement the generated functions. */
finalize_source_cb(thunk, material, &codegen.output);
codegen.create_info->finalize();
g_cache->add(engine, codegen, deferred_compilation, optimize_graph);
codegen.create_info = nullptr;
return g_cache->get(engine, codegen.hash_get(), deferred_compilation, optimize_graph);
}
/** \} */