GPU: Non-blocking specialization constants compilation

Update the batch specialization-constants compilation so that it can be
used asynchronously.

The implementation has 2 main limitations:
- Only one batch can be processed at a time; extra batches are
  added to a queue.
- Binding a specialization variant that is still being compiled will fail.

Pull Request: https://projects.blender.org/blender/blender/pulls/123015
This commit is contained in:
Miguel Pozo
2024-06-20 18:02:44 +02:00
parent 8cb0b347ae
commit 33005ad716
9 changed files with 208 additions and 88 deletions

View File

@@ -112,6 +112,11 @@ void Instance::init(const int2 &output_res,
volume_probes.init();
volume.init();
lookdev.init(visible_rect);
/* Pre-compile specialization constants in parallel (if supported). */
shaders.precompile_specializations(
render_buffers.data.shadow_id, shadows.get_data().ray_count, shadows.get_data().step_count);
shaders_are_ready_ = shaders.is_ready(is_image_render());
}
void Instance::init_light_bake(Depsgraph *depsgraph, draw::Manager *manager)

View File

@@ -501,30 +501,6 @@ void DeferredLayerBase::gbuffer_pass_sync(Instance &inst)
void DeferredLayer::begin_sync()
{
if (GPU_use_parallel_compilation()) {
/* Pre-compile specialization constants in parallel. */
Vector<ShaderSpecialization> specializations;
for (int i = 0; i < 3; i++) {
GPUShader *sh = inst_.shaders.static_shader_get(eShaderType(DEFERRED_LIGHT_SINGLE + i));
for (bool use_split_indirect : {false, true}) {
for (bool use_lightprobe_eval : {false, true}) {
for (bool use_transmission : {false, true}) {
specializations.append(
{sh,
{{"render_pass_shadow_id", inst_.render_buffers.data.shadow_id},
{"use_split_indirect", use_split_indirect},
{"use_lightprobe_eval", use_lightprobe_eval},
{"use_transmission", use_transmission},
{"shadow_ray_count", inst_.shadows.get_data().ray_count},
{"shadow_ray_step_count", inst_.shadows.get_data().step_count}}});
}
}
}
}
GPU_shaders_precompile_specializations(specializations);
}
{
prepass_ps_.init();
/* Textures. */

View File

@@ -90,20 +90,55 @@ ShaderModule::~ShaderModule()
*
* \{ */
bool ShaderModule::is_ready(bool block)
void ShaderModule::precompile_specializations(int render_buffers_shadow_id,
int shadow_ray_count,
int shadow_ray_step_count)
{
if (compilation_handle_ == 0) {
return true;
BLI_assert(specialization_handle_ == 0);
if (!GPU_use_parallel_compilation()) {
return;
}
if (block || GPU_shader_batch_is_ready(compilation_handle_)) {
Vector<GPUShader *> shaders = GPU_shader_batch_finalize(compilation_handle_);
for (int i : IndexRange(MAX_SHADER_TYPE)) {
shaders_[i] = shaders[i];
Vector<ShaderSpecialization> specializations;
for (int i = 0; i < 3; i++) {
GPUShader *sh = static_shader_get(eShaderType(DEFERRED_LIGHT_SINGLE + i));
for (bool use_split_indirect : {false, true}) {
for (bool use_lightprobe_eval : {false, true}) {
for (bool use_transmission : {false, true}) {
specializations.append({sh,
{{"render_pass_shadow_id", render_buffers_shadow_id},
{"use_split_indirect", use_split_indirect},
{"use_lightprobe_eval", use_lightprobe_eval},
{"use_transmission", use_transmission},
{"shadow_ray_count", shadow_ray_count},
{"shadow_ray_step_count", shadow_ray_step_count}}});
}
}
}
}
return compilation_handle_ == 0;
specialization_handle_ = GPU_shader_batch_specializations(specializations);
}
/* Poll (or, when \a block is true, wait on) the state of the async shader
 * compilation batches owned by this module.
 *
 * \param block: When true, wait until both the static-shader batch and the
 * specialization-constants batch have finished; when false, poll once and
 * return immediately.
 * \return true when no compilation work remains pending.
 *
 * NOTE(review): this relies on `GPU_shader_batch_finalize()` and
 * `GPU_shader_batch_specializations_is_ready()` taking the handle by
 * reference and resetting it to 0 once the batch is consumed — the final
 * return value depends on that side effect; confirm against the GPU API. */
bool ShaderModule::is_ready(bool block)
{
  if (compilation_handle_) {
    /* Finalize when the batch is done, or when we were asked to block
     * (finalize is then presumed to wait internally — TODO confirm). */
    if (GPU_shader_batch_is_ready(compilation_handle_) || block) {
      Vector<GPUShader *> shaders = GPU_shader_batch_finalize(compilation_handle_);
      for (int i : IndexRange(MAX_SHADER_TYPE)) {
        shaders_[i] = shaders[i];
      }
    }
  }
  if (specialization_handle_) {
    /* Repeated polling is required for the specialization compilation to
     * keep progressing, hence the busy-wait loop when blocking. When
     * `block` is false this still polls exactly once. */
    while (!GPU_shader_batch_specializations_is_ready(specialization_handle_) && block) {
      /* Block until ready. */
    }
  }
  /* Handles are zeroed by the calls above once their batch completes. */
  return compilation_handle_ == 0 && specialization_handle_ == 0;
}
const char *ShaderModule::static_shader_create_info_name_get(eShaderType shader_type)

View File

@@ -159,6 +159,7 @@ class ShaderModule {
private:
std::array<GPUShader *, MAX_SHADER_TYPE> shaders_;
BatchHandle compilation_handle_ = 0;
SpecializationBatchHandle specialization_handle_ = 0;
/** Shared shader module across all engine instances. */
static ShaderModule *g_shader_module;
@@ -169,6 +170,10 @@ class ShaderModule {
bool is_ready(bool block = false);
void precompile_specializations(int render_buffers_shadow_id,
int shadow_ray_count,
int shadow_ray_step_count);
GPUShader *static_shader_get(eShaderType shader_type);
GPUMaterial *material_default_shader_get(eMaterialPipeline pipeline_type,
eMaterialGeometry geometry_type);

View File

@@ -220,12 +220,30 @@ void GPU_shader_constant_uint(GPUShader *sh, const char *name, unsigned int valu
void GPU_shader_constant_float(GPUShader *sh, const char *name, float value);
void GPU_shader_constant_bool(GPUShader *sh, const char *name, bool value);
using SpecializationBatchHandle = int64_t;
/* A request to pre-compile one specialization-constant variant of a shader. */
struct ShaderSpecialization {
  /* Shader to specialize (non-owning). */
  GPUShader *shader;
  /* Constant name/value pairs that define the variant to compile. */
  blender::Vector<blender::gpu::shader::SpecializationConstant> constants;
};
void GPU_shaders_precompile_specializations(blender::Span<ShaderSpecialization> specializations);
/**
* Request the compilation of multiple specialization constant variations at once,
* allowing the backend to use multithreaded compilation.
* Returns a handle that can be used to poll if all variations have been compiled.
* NOTE: This function is asynchronous on OpenGL, and a no-op on Vulkan and Metal.
* Batches are processed one by one in FIFO order.
* WARNING: Binding a specialization before the batch finishes will fail.
*/
SpecializationBatchHandle GPU_shader_batch_specializations(
blender::Span<ShaderSpecialization> specializations);
/**
* Returns true if all the specializations from the batch have finished their compilation.
* NOTE: Polling this function is required for the compilation process to keep progressing.
* WARNING: Invalidates the handle if it returns true.
*/
bool GPU_shader_batch_specializations_is_ready(SpecializationBatchHandle &handle);
/** \} */

View File

@@ -515,9 +515,15 @@ void GPU_shader_constant_bool(GPUShader *sh, const char *name, bool value)
GPU_shader_constant_bool_ex(sh, unwrap(sh)->interface->constant_get(name)->location, value);
}
void GPU_shaders_precompile_specializations(Span<ShaderSpecialization> specializations)
SpecializationBatchHandle GPU_shader_batch_specializations(
blender::Span<ShaderSpecialization> specializations)
{
Context::get()->compiler->precompile_specializations(specializations);
return Context::get()->compiler->precompile_specializations(specializations);
}
/* Poll the active context's compiler for completion of a specialization
 * batch. The backend clears the handle once the batch has finished. */
bool GPU_shader_batch_specializations_is_ready(SpecializationBatchHandle &handle)
{
  auto &context = *Context::get();
  return context.compiler->specialization_batch_is_ready(handle);
}
/** \} */

View File

@@ -181,7 +181,18 @@ class ShaderCompiler {
virtual bool batch_is_ready(BatchHandle handle) = 0;
virtual Vector<Shader *> batch_finalize(BatchHandle &handle) = 0;
virtual void precompile_specializations(Span<ShaderSpecialization> /*specializations*/){};
/* Request compilation of specialization-constant variants.
 * Base implementation: backends without their own support do nothing and
 * return a null (0) handle, which reads as "already ready". */
virtual SpecializationBatchHandle precompile_specializations(
    Span<ShaderSpecialization> /*specializations*/)
{
  /* No-op. */
  return 0;
};
/* Poll whether a specialization batch has finished compiling.
 * Base implementation reports immediate completion and invalidates the
 * handle, matching the contract that a true result clears it. */
virtual bool specialization_batch_is_ready(SpecializationBatchHandle &handle)
{
  handle = 0;
  return true;
};
};
/* Generic (fully synchronous) implementation for backends that don't implement their own

View File

@@ -1820,6 +1820,8 @@ BatchHandle GLShaderCompiler::batch_compile(Span<const shader::ShaderCreateInfo
bool GLShaderCompiler::batch_is_ready(BatchHandle handle)
{
std::scoped_lock lock(mutex_);
BLI_assert(batches.contains(handle));
Batch &batch = batches.lookup(handle);
if (batch.is_ready) {
return true;
@@ -1879,6 +1881,8 @@ Vector<Shader *> GLShaderCompiler::batch_finalize(BatchHandle &handle)
BLI_time_sleep_ms(1);
}
std::scoped_lock lock(mutex_);
BLI_assert(batches.contains(handle));
Batch batch = batches.pop(handle);
Vector<Shader *> result;
for (CompilationWork &item : batch.items) {
@@ -1888,24 +1892,33 @@ Vector<Shader *> GLShaderCompiler::batch_finalize(BatchHandle &handle)
return result;
}
void GLShaderCompiler::precompile_specializations(Span<ShaderSpecialization> specializations)
SpecializationBatchHandle GLShaderCompiler::precompile_specializations(
Span<ShaderSpecialization> specializations)
{
BLI_assert(GPU_use_parallel_compilation());
struct SpecializationWork {
GLShader *shader = nullptr;
GLuint program;
GLSourcesBaked sources;
std::scoped_lock lock(mutex_);
GLCompilerWorker *worker = nullptr;
bool do_async_compilation = false;
bool is_ready = false;
};
SpecializationBatchHandle handle = next_batch_handle++;
Vector<SpecializationWork> items;
items.reserve(specializations.size());
specialization_queue.append({handle, specializations});
for (auto &specialization : specializations) {
return handle;
}
void GLShaderCompiler::prepare_next_specialization_batch()
{
BLI_assert(current_specialization_batch.is_ready && !specialization_queue.is_empty());
SpecializationRequest &next = specialization_queue.first();
SpecializationBatch &batch = current_specialization_batch;
batch.handle = next.handle;
batch.is_ready = false;
Vector<SpecializationWork> &items = batch.items;
items.clear();
items.reserve(next.specializations.size());
for (auto &specialization : next.specializations) {
GLShader *sh = static_cast<GLShader *>(unwrap(specialization.shader));
for (const SpecializationConstant &constant : specialization.constants) {
const ShaderInput *input = sh->interface->constant_get(constant.name.c_str());
@@ -1932,53 +1945,72 @@ void GLShaderCompiler::precompile_specializations(Span<ShaderSpecialization> spe
item.do_async_compilation = required_size <= sizeof(ShaderSourceHeader::sources);
}
bool is_ready = false;
while (!is_ready) {
/* Loop until ready, we can't defer the compilation of required specialization constants. */
is_ready = true;
specialization_queue.remove(0);
}
for (SpecializationWork &item : items) {
if (item.is_ready) {
continue;
}
std::scoped_lock lock(mutex_);
bool GLShaderCompiler::specialization_batch_is_ready(SpecializationBatchHandle &handle)
{
std::scoped_lock lock(mutex_);
if (!item.do_async_compilation) {
/* Compilation will happen locally on shader bind. */
glDeleteProgram(item.program);
item.program = 0;
item.shader->program_active_->program_id = 0;
item.shader->constants.is_dirty = true;
SpecializationBatch &batch = current_specialization_batch;
if (handle < batch.handle || (handle == batch.handle && batch.is_ready)) {
handle = 0;
return true;
}
if (batch.is_ready) {
prepare_next_specialization_batch();
}
bool is_ready = true;
for (SpecializationWork &item : batch.items) {
if (item.is_ready) {
continue;
}
if (!item.do_async_compilation) {
/* Compilation will happen locally on shader bind. */
glDeleteProgram(item.program);
item.program = 0;
item.shader->program_active_->program_id = 0;
item.shader->constants.is_dirty = true;
item.is_ready = true;
continue;
}
if (item.worker == nullptr) {
/* Try to acquire an available worker. */
item.worker = get_compiler_worker(item.sources);
}
else if (item.worker->is_ready()) {
/* Retrieve the binary compiled by the worker. */
if (item.worker->load_program_binary(item.program)) {
item.worker->release();
item.worker = nullptr;
item.is_ready = true;
continue;
}
if (item.worker == nullptr) {
/* Try to acquire an available worker. */
item.worker = get_compiler_worker(item.sources);
}
else if (item.worker->is_ready()) {
/* Retrieve the binary compiled by the worker. */
if (item.worker->load_program_binary(item.program)) {
item.worker->release();
item.worker = nullptr;
item.is_ready = true;
}
else {
/* Compilation failed, local compilation will be tried later on shader bind. */
item.do_async_compilation = false;
}
}
else if (worker_is_lost(item.worker)) {
/* We lost the worker, local compilation will be tried later on shader bind. */
else {
/* Compilation failed, local compilation will be tried later on shader bind. */
item.do_async_compilation = false;
}
}
else if (worker_is_lost(item.worker)) {
/* We lost the worker, local compilation will be tried later on shader bind. */
item.do_async_compilation = false;
}
if (!item.is_ready) {
is_ready = false;
}
if (!item.is_ready) {
is_ready = false;
}
}
if (is_ready) {
batch.is_ready = true;
handle = 0;
}
return is_ready;
}
/** \} */

View File

@@ -288,9 +288,38 @@ class GLShaderCompiler : public ShaderCompiler {
bool is_ready = false;
};
BatchHandle next_batch_handle = 1;
Map<BatchHandle, Batch> batches;
struct SpecializationRequest {
BatchHandle handle;
Vector<ShaderSpecialization> specializations;
};
Vector<SpecializationRequest> specialization_queue;
struct SpecializationWork {
GLShader *shader = nullptr;
GLuint program;
GLSourcesBaked sources;
GLCompilerWorker *worker = nullptr;
bool do_async_compilation = false;
bool is_ready = false;
};
struct SpecializationBatch {
SpecializationBatchHandle handle = 0;
Vector<SpecializationWork> items;
bool is_ready = true;
};
SpecializationBatch current_specialization_batch;
void prepare_next_specialization_batch();
/* Shared across regular and specialization batches,
 * to prevent the use of a wrong handle type. */
int64_t next_batch_handle = 1;
GLCompilerWorker *get_compiler_worker(const GLSourcesBaked &sources);
bool worker_is_lost(GLCompilerWorker *&worker);
@@ -301,7 +330,10 @@ class GLShaderCompiler : public ShaderCompiler {
virtual bool batch_is_ready(BatchHandle handle) override;
virtual Vector<Shader *> batch_finalize(BatchHandle &handle) override;
virtual void precompile_specializations(Span<ShaderSpecialization> specializations) override;
virtual SpecializationBatchHandle precompile_specializations(
Span<ShaderSpecialization> specializations) override;
virtual bool specialization_batch_is_ready(SpecializationBatchHandle &handle) override;
};
#else