From 33005ad716aa11ed3e672eccfaced16b3c30734e Mon Sep 17 00:00:00 2001
From: Miguel Pozo <pragma37@gmail.com>
Date: Thu, 20 Jun 2024 18:02:44 +0200
Subject: [PATCH] GPU: Non-blocking specialization constants compilation

Update the batch compilation of specialization constants so that it
can be used asynchronously.

The implementation has 2 main limitations:
- Only one batch at a time can be processed, extra batches will be
  added to a queue.
- Binding a specialization variant that is still being compiled will fail.

Pull Request: https://projects.blender.org/blender/blender/pulls/123015
---
 .../draw/engines/eevee_next/eevee_instance.cc |   5 +
 .../draw/engines/eevee_next/eevee_pipeline.cc |  24 ----
 .../draw/engines/eevee_next/eevee_shader.cc   |  51 +++++--
 .../draw/engines/eevee_next/eevee_shader.hh   |   5 +
 source/blender/gpu/GPU_shader.hh              |  20 ++-
 source/blender/gpu/intern/gpu_shader.cc       |  10 +-
 .../blender/gpu/intern/gpu_shader_private.hh  |  13 +-
 source/blender/gpu/opengl/gl_shader.cc        | 132 +++++++++++-------
 source/blender/gpu/opengl/gl_shader.hh        |  36 ++++-
 9 files changed, 208 insertions(+), 88 deletions(-)

diff --git a/source/blender/draw/engines/eevee_next/eevee_instance.cc b/source/blender/draw/engines/eevee_next/eevee_instance.cc
index 7d7e91f593b..41b68d41e87 100644
--- a/source/blender/draw/engines/eevee_next/eevee_instance.cc
+++ b/source/blender/draw/engines/eevee_next/eevee_instance.cc
@@ -112,6 +112,11 @@ void Instance::init(const int2 &output_res,
   volume_probes.init();
   volume.init();
   lookdev.init(visible_rect);
+
+  /* Pre-compile specialization constants in parallel (if supported). */
+  shaders.precompile_specializations(
+      render_buffers.data.shadow_id, shadows.get_data().ray_count, shadows.get_data().step_count);
+  shaders_are_ready_ = shaders.is_ready(is_image_render());
 }
 
 void Instance::init_light_bake(Depsgraph *depsgraph, draw::Manager *manager)
diff --git a/source/blender/draw/engines/eevee_next/eevee_pipeline.cc b/source/blender/draw/engines/eevee_next/eevee_pipeline.cc
index 98aa6f436e3..871ece23dd7 100644
--- a/source/blender/draw/engines/eevee_next/eevee_pipeline.cc
+++ b/source/blender/draw/engines/eevee_next/eevee_pipeline.cc
@@ -501,30 +501,6 @@ void DeferredLayerBase::gbuffer_pass_sync(Instance &inst)
 
 void DeferredLayer::begin_sync()
 {
-  if (GPU_use_parallel_compilation()) {
-    /* Pre-compile specialization constants in parallel. */
-    Vector<ShaderSpecialization> specializations;
-    for (int i = 0; i < 3; i++) {
-      GPUShader *sh = inst_.shaders.static_shader_get(eShaderType(DEFERRED_LIGHT_SINGLE + i));
-      for (bool use_split_indirect : {false, true}) {
-        for (bool use_lightprobe_eval : {false, true}) {
-          for (bool use_transmission : {false, true}) {
-            specializations.append(
-                {sh,
-                 {{"render_pass_shadow_id", inst_.render_buffers.data.shadow_id},
-                  {"use_split_indirect", use_split_indirect},
-                  {"use_lightprobe_eval", use_lightprobe_eval},
-                  {"use_transmission", use_transmission},
-                  {"shadow_ray_count", inst_.shadows.get_data().ray_count},
-                  {"shadow_ray_step_count", inst_.shadows.get_data().step_count}}});
-          }
-        }
-      }
-    }
-
-    GPU_shaders_precompile_specializations(specializations);
-  }
-
   {
     prepass_ps_.init();
     /* Textures. */
diff --git a/source/blender/draw/engines/eevee_next/eevee_shader.cc b/source/blender/draw/engines/eevee_next/eevee_shader.cc
index aef6ddda05f..c2be8ac7b4e 100644
--- a/source/blender/draw/engines/eevee_next/eevee_shader.cc
+++ b/source/blender/draw/engines/eevee_next/eevee_shader.cc
@@ -90,20 +90,55 @@ ShaderModule::~ShaderModule()
  *
  * \{ */
 
-bool ShaderModule::is_ready(bool block)
+void ShaderModule::precompile_specializations(int render_buffers_shadow_id,
+                                              int shadow_ray_count,
+                                              int shadow_ray_step_count)
 {
-  if (compilation_handle_ == 0) {
-    return true;
+  BLI_assert(specialization_handle_ == 0);
+
+  if (!GPU_use_parallel_compilation()) {
+    return;
   }
 
-  if (block || GPU_shader_batch_is_ready(compilation_handle_)) {
-    Vector<GPUShader *> shaders = GPU_shader_batch_finalize(compilation_handle_);
-    for (int i : IndexRange(MAX_SHADER_TYPE)) {
-      shaders_[i] = shaders[i];
+  Vector<ShaderSpecialization> specializations;
+  for (int i = 0; i < 3; i++) {
+    GPUShader *sh = static_shader_get(eShaderType(DEFERRED_LIGHT_SINGLE + i));
+    for (bool use_split_indirect : {false, true}) {
+      for (bool use_lightprobe_eval : {false, true}) {
+        for (bool use_transmission : {false, true}) {
+          specializations.append({sh,
+                                  {{"render_pass_shadow_id", render_buffers_shadow_id},
+                                   {"use_split_indirect", use_split_indirect},
+                                   {"use_lightprobe_eval", use_lightprobe_eval},
+                                   {"use_transmission", use_transmission},
+                                   {"shadow_ray_count", shadow_ray_count},
+                                   {"shadow_ray_step_count", shadow_ray_step_count}}});
+        }
+      }
     }
   }
 
-  return compilation_handle_ == 0;
+  specialization_handle_ = GPU_shader_batch_specializations(specializations);
+}
+
+bool ShaderModule::is_ready(bool block)
+{
+  if (compilation_handle_) {
+    if (GPU_shader_batch_is_ready(compilation_handle_) || block) {
+      Vector<GPUShader *> shaders = GPU_shader_batch_finalize(compilation_handle_);
+      for (int i : IndexRange(MAX_SHADER_TYPE)) {
+        shaders_[i] = shaders[i];
+      }
+    }
+  }
+
+  if (specialization_handle_) {
+    while (!GPU_shader_batch_specializations_is_ready(specialization_handle_) && block) {
+      /* Block until ready. */
+    }
+  }
+
+  return compilation_handle_ == 0 && specialization_handle_ == 0;
 }
 
 const char *ShaderModule::static_shader_create_info_name_get(eShaderType shader_type)
diff --git a/source/blender/draw/engines/eevee_next/eevee_shader.hh b/source/blender/draw/engines/eevee_next/eevee_shader.hh
index bc886584c76..d83e495922e 100644
--- a/source/blender/draw/engines/eevee_next/eevee_shader.hh
+++ b/source/blender/draw/engines/eevee_next/eevee_shader.hh
@@ -159,6 +159,7 @@ class ShaderModule {
  private:
   std::array<GPUShader *, MAX_SHADER_TYPE> shaders_;
   BatchHandle compilation_handle_ = 0;
+  SpecializationBatchHandle specialization_handle_ = 0;
 
   /** Shared shader module across all engine instances. */
   static ShaderModule *g_shader_module;
@@ -169,6 +170,10 @@ class ShaderModule {
 
   bool is_ready(bool block = false);
 
+  void precompile_specializations(int render_buffers_shadow_id,
+                                  int shadow_ray_count,
+                                  int shadow_ray_step_count);
+
   GPUShader *static_shader_get(eShaderType shader_type);
   GPUMaterial *material_default_shader_get(eMaterialPipeline pipeline_type,
                                            eMaterialGeometry geometry_type);
diff --git a/source/blender/gpu/GPU_shader.hh b/source/blender/gpu/GPU_shader.hh
index ad34231a9c8..d78a0825c30 100644
--- a/source/blender/gpu/GPU_shader.hh
+++ b/source/blender/gpu/GPU_shader.hh
@@ -220,12 +220,30 @@ void GPU_shader_constant_uint(GPUShader *sh, const char *name, unsigned int valu
 void GPU_shader_constant_float(GPUShader *sh, const char *name, float value);
 void GPU_shader_constant_bool(GPUShader *sh, const char *name, bool value);
 
+using SpecializationBatchHandle = int64_t;
+
 struct ShaderSpecialization {
   GPUShader *shader;
   blender::Vector<blender::gpu::shader::SpecializationConstant> constants;
 };
 
-void GPU_shaders_precompile_specializations(blender::Span<ShaderSpecialization> specializations);
+/**
+ * Request the compilation of multiple specialization constant variations at once,
+ * allowing the backend to use multithreaded compilation.
+ * Returns a handle that can be used to poll if all variations have been compiled.
+ * NOTE: This function is asynchronous on OpenGL, and a no-op on Vulkan and Metal.
+ * Batches are processed one by one in FIFO order.
+ * WARNING: Binding a specialization before the batch finishes will fail.
+ */
+SpecializationBatchHandle GPU_shader_batch_specializations(
+    blender::Span<ShaderSpecialization> specializations);
+
+/**
+ * Returns true if all the specializations from the batch have finished their compilation.
+ * NOTE: Polling this function is required for the compilation process to keep progressing.
+ * WARNING: Invalidates the handle if it returns true.
+ */
+bool GPU_shader_batch_specializations_is_ready(SpecializationBatchHandle &handle);
 
 /** \} */
 
diff --git a/source/blender/gpu/intern/gpu_shader.cc b/source/blender/gpu/intern/gpu_shader.cc
index c71942ebb7d..cf69d286fda 100644
--- a/source/blender/gpu/intern/gpu_shader.cc
+++ b/source/blender/gpu/intern/gpu_shader.cc
@@ -515,9 +515,15 @@ void GPU_shader_constant_bool(GPUShader *sh, const char *name, bool value)
   GPU_shader_constant_bool_ex(sh, unwrap(sh)->interface->constant_get(name)->location, value);
 }
 
-void GPU_shaders_precompile_specializations(Span<ShaderSpecialization> specializations)
+SpecializationBatchHandle GPU_shader_batch_specializations(
+    blender::Span<ShaderSpecialization> specializations)
 {
-  Context::get()->compiler->precompile_specializations(specializations);
+  return Context::get()->compiler->precompile_specializations(specializations);
+}
+
+bool GPU_shader_batch_specializations_is_ready(SpecializationBatchHandle &handle)
+{
+  return Context::get()->compiler->specialization_batch_is_ready(handle);
 }
 
 /** \} */
diff --git a/source/blender/gpu/intern/gpu_shader_private.hh b/source/blender/gpu/intern/gpu_shader_private.hh
index 19706d24303..d823ca1d2bf 100644
--- a/source/blender/gpu/intern/gpu_shader_private.hh
+++ b/source/blender/gpu/intern/gpu_shader_private.hh
@@ -181,7 +181,18 @@ class ShaderCompiler {
   virtual bool batch_is_ready(BatchHandle handle) = 0;
   virtual Vector<Shader *> batch_finalize(BatchHandle &handle) = 0;
 
-  virtual void precompile_specializations(Span<ShaderSpecialization> /*specializations*/){};
+  virtual SpecializationBatchHandle precompile_specializations(
+      Span<ShaderSpecialization> /*specializations*/)
+  {
+    /* No-op. */
+    return 0;
+  };
+
+  virtual bool specialization_batch_is_ready(SpecializationBatchHandle &handle)
+  {
+    handle = 0;
+    return true;
+  };
 };
 
 /* Generic (fully synchronous) implementation for backends that don't implement their own
diff --git a/source/blender/gpu/opengl/gl_shader.cc b/source/blender/gpu/opengl/gl_shader.cc
index 23b43096f82..7eacdb7634b 100644
--- a/source/blender/gpu/opengl/gl_shader.cc
+++ b/source/blender/gpu/opengl/gl_shader.cc
@@ -1820,6 +1820,8 @@ BatchHandle GLShaderCompiler::batch_compile(Span<const shader::ShaderCreateInfo
 bool GLShaderCompiler::batch_is_ready(BatchHandle handle)
 {
   std::scoped_lock lock(mutex_);
+
+  BLI_assert(batches.contains(handle));
   Batch &batch = batches.lookup(handle);
   if (batch.is_ready) {
     return true;
@@ -1879,6 +1881,8 @@ Vector<Shader *> GLShaderCompiler::batch_finalize(BatchHandle &handle)
     BLI_time_sleep_ms(1);
   }
   std::scoped_lock lock(mutex_);
+
+  BLI_assert(batches.contains(handle));
   Batch batch = batches.pop(handle);
   Vector<Shader *> result;
   for (CompilationWork &item : batch.items) {
@@ -1888,24 +1892,33 @@ Vector<Shader *> GLShaderCompiler::batch_finalize(BatchHandle &handle)
   return result;
 }
 
-void GLShaderCompiler::precompile_specializations(Span<ShaderSpecialization> specializations)
+SpecializationBatchHandle GLShaderCompiler::precompile_specializations(
+    Span<ShaderSpecialization> specializations)
 {
   BLI_assert(GPU_use_parallel_compilation());
 
-  struct SpecializationWork {
-    GLShader *shader = nullptr;
-    GLuint program;
-    GLSourcesBaked sources;
+  std::scoped_lock lock(mutex_);
 
-    GLCompilerWorker *worker = nullptr;
-    bool do_async_compilation = false;
-    bool is_ready = false;
-  };
+  SpecializationBatchHandle handle = next_batch_handle++;
 
-  Vector<SpecializationWork> items;
-  items.reserve(specializations.size());
+  specialization_queue.append({handle, specializations});
 
-  for (auto &specialization : specializations) {
+  return handle;
+}
+
+void GLShaderCompiler::prepare_next_specialization_batch()
+{
+  BLI_assert(current_specialization_batch.is_ready && !specialization_queue.is_empty());
+
+  SpecializationRequest &next = specialization_queue.first();
+  SpecializationBatch &batch = current_specialization_batch;
+  batch.handle = next.handle;
+  batch.is_ready = false;
+  Vector<SpecializationWork> &items = batch.items;
+  items.clear();
+  items.reserve(next.specializations.size());
+
+  for (auto &specialization : next.specializations) {
     GLShader *sh = static_cast<GLShader *>(unwrap(specialization.shader));
     for (const SpecializationConstant &constant : specialization.constants) {
       const ShaderInput *input = sh->interface->constant_get(constant.name.c_str());
@@ -1932,53 +1945,72 @@ void GLShaderCompiler::precompile_specializations(Span<ShaderSpecialization> spe
     item.do_async_compilation = required_size <= sizeof(ShaderSourceHeader::sources);
   }
 
-  bool is_ready = false;
-  while (!is_ready) {
-    /* Loop until ready, we can't defer the compilation of required specialization constants. */
-    is_ready = true;
+  specialization_queue.remove(0);
+}
 
-    for (SpecializationWork &item : items) {
-      if (item.is_ready) {
-        continue;
-      }
-      std::scoped_lock lock(mutex_);
+bool GLShaderCompiler::specialization_batch_is_ready(SpecializationBatchHandle &handle)
+{
+  std::scoped_lock lock(mutex_);
 
-      if (!item.do_async_compilation) {
-        /* Compilation will happen locally on shader bind. */
-        glDeleteProgram(item.program);
-        item.program = 0;
-        item.shader->program_active_->program_id = 0;
-        item.shader->constants.is_dirty = true;
+  SpecializationBatch &batch = current_specialization_batch;
+
+  if (handle < batch.handle || (handle == batch.handle && batch.is_ready)) {
+    handle = 0;
+    return true;
+  }
+
+  if (batch.is_ready) {
+    prepare_next_specialization_batch();
+  }
+
+  bool is_ready = true;
+  for (SpecializationWork &item : batch.items) {
+    if (item.is_ready) {
+      continue;
+    }
+
+    if (!item.do_async_compilation) {
+      /* Compilation will happen locally on shader bind. */
+      glDeleteProgram(item.program);
+      item.program = 0;
+      item.shader->program_active_->program_id = 0;
+      item.shader->constants.is_dirty = true;
+      item.is_ready = true;
+      continue;
+    }
+
+    if (item.worker == nullptr) {
+      /* Try to acquire an available worker. */
+      item.worker = get_compiler_worker(item.sources);
+    }
+    else if (item.worker->is_ready()) {
+      /* Retrieve the binary compiled by the worker. */
+      if (item.worker->load_program_binary(item.program)) {
+        item.worker->release();
+        item.worker = nullptr;
         item.is_ready = true;
-        continue;
       }
-
-      if (item.worker == nullptr) {
-        /* Try to acquire an available worker. */
-        item.worker = get_compiler_worker(item.sources);
-      }
-      else if (item.worker->is_ready()) {
-        /* Retrieve the binary compiled by the worker. */
-        if (item.worker->load_program_binary(item.program)) {
-          item.worker->release();
-          item.worker = nullptr;
-          item.is_ready = true;
-        }
-        else {
-          /* Compilation failed, local compilation will be tried later on shader bind. */
-          item.do_async_compilation = false;
-        }
-      }
-      else if (worker_is_lost(item.worker)) {
-        /* We lost the worker, local compilation will be tried later on shader bind. */
+      else {
+        /* Compilation failed, local compilation will be tried later on shader bind. */
         item.do_async_compilation = false;
       }
+    }
+    else if (worker_is_lost(item.worker)) {
+      /* We lost the worker, local compilation will be tried later on shader bind. */
+      item.do_async_compilation = false;
+    }
 
-      if (!item.is_ready) {
-        is_ready = false;
-      }
+    if (!item.is_ready) {
+      is_ready = false;
     }
   }
+
+  if (is_ready) {
+    batch.is_ready = true;
+    handle = 0;
+  }
+
+  return is_ready;
 }
 
 /** \} */
diff --git a/source/blender/gpu/opengl/gl_shader.hh b/source/blender/gpu/opengl/gl_shader.hh
index 5688f85835e..0e5f74f2b1c 100644
--- a/source/blender/gpu/opengl/gl_shader.hh
+++ b/source/blender/gpu/opengl/gl_shader.hh
@@ -288,9 +288,38 @@ class GLShaderCompiler : public ShaderCompiler {
     bool is_ready = false;
   };
 
-  BatchHandle next_batch_handle = 1;
   Map<BatchHandle, Batch> batches;
 
+  struct SpecializationRequest {
+    BatchHandle handle;
+    Vector<ShaderSpecialization> specializations;
+  };
+
+  Vector<SpecializationRequest> specialization_queue;
+
+  struct SpecializationWork {
+    GLShader *shader = nullptr;
+    GLuint program;
+    GLSourcesBaked sources;
+
+    GLCompilerWorker *worker = nullptr;
+    bool do_async_compilation = false;
+    bool is_ready = false;
+  };
+
+  struct SpecializationBatch {
+    SpecializationBatchHandle handle = 0;
+    Vector<SpecializationWork> items;
+    bool is_ready = true;
+  };
+
+  SpecializationBatch current_specialization_batch;
+  void prepare_next_specialization_batch();
+
+  /* Shared across regular and specialization batches,
+   * to prevent the use of a wrong handle type. */
+  int64_t next_batch_handle = 1;
+
   GLCompilerWorker *get_compiler_worker(const GLSourcesBaked &sources);
   bool worker_is_lost(GLCompilerWorker *&worker);
 
@@ -301,7 +330,10 @@ class GLShaderCompiler : public ShaderCompiler {
   virtual bool batch_is_ready(BatchHandle handle) override;
   virtual Vector<Shader *> batch_finalize(BatchHandle &handle) override;
 
-  virtual void precompile_specializations(Span<ShaderSpecialization> specializations) override;
+  virtual SpecializationBatchHandle precompile_specializations(
+      Span<ShaderSpecialization> specializations) override;
+
+  virtual bool specialization_batch_is_ready(SpecializationBatchHandle &handle) override;
 };
 
 #else