From eb3fe753925bcc888f352e135b4251b3e657ea34 Mon Sep 17 00:00:00 2001
From: Jason Fielder
Date: Mon, 30 Sep 2024 11:21:28 +0200
Subject: [PATCH] Metal: Add support for parallel compilation and
 precompilation specialisation

This speeds up EEVEE startup and material compilation time.

Authored by Apple: James McCarthy

Pull Request: https://projects.blender.org/blender/blender/pulls/125657
---
 intern/ghost/test/multitest/MultiTest.c       |   1 +
 source/blender/gpu/GPU_context.hh             |   4 +
 source/blender/gpu/GPU_shader.hh              |   5 +-
 source/blender/gpu/intern/gpu_context.cc      |  13 +
 source/blender/gpu/metal/mtl_backend.mm       |  66 +++
 source/blender/gpu/metal/mtl_capabilities.hh  |   4 +
 source/blender/gpu/metal/mtl_context.mm       |  16 +-
 .../gpu/metal/mtl_pso_descriptor_state.hh     |   6 +
 source/blender/gpu/metal/mtl_shader.hh        | 110 ++++-
 source/blender/gpu/metal/mtl_shader.mm        | 447 +++++++++++++++++-
 source/blender/gpu/tests/gpu_testing.cc       |   1 +
 .../windowmanager/intern/wm_playanim.cc       |   1 +
 .../blender/windowmanager/intern/wm_window.cc |   1 +
 13 files changed, 658 insertions(+), 17 deletions(-)

diff --git a/intern/ghost/test/multitest/MultiTest.c b/intern/ghost/test/multitest/MultiTest.c
index 9dd89fc4255..af47501bfb7 100644
--- a/intern/ghost/test/multitest/MultiTest.c
+++ b/intern/ghost/test/multitest/MultiTest.c
@@ -874,6 +874,7 @@ MultiTestApp *multitestapp_new(void)
   if (!app->sys) {
     fatal("Unable to create ghost system");
   }
+  GPU_backend_ghost_system_set(app->sys);
 
   if (!GHOST_AddEventConsumer(app->sys, consumer)) {
     fatal("Unable to add multitest event consumer ");
diff --git a/source/blender/gpu/GPU_context.hh b/source/blender/gpu/GPU_context.hh
index ab8040d7406..7c0e69c2e84 100644
--- a/source/blender/gpu/GPU_context.hh
+++ b/source/blender/gpu/GPU_context.hh
@@ -83,3 +83,7 @@ void GPU_render_end();
 /* For operations which need to run exactly once per frame -- even if there are no render
  * updates. */
 void GPU_render_step();
+
+/* For when we need access to a system context in order to create a GPU context. */
+void GPU_backend_ghost_system_set(void *ghost_system_handle);
+void *GPU_backend_ghost_system_get();
diff --git a/source/blender/gpu/GPU_shader.hh b/source/blender/gpu/GPU_shader.hh
index 7f6dc0eb8d2..990cc41df16 100644
--- a/source/blender/gpu/GPU_shader.hh
+++ b/source/blender/gpu/GPU_shader.hh
@@ -231,7 +231,10 @@ struct ShaderSpecialization {
  * Request the compilation of multiple specialization constant variations at once,
  * allowing the backend to use multithreaded compilation.
  * Returns a handle that can be used to poll if all variations have been compiled.
- * NOTE: This function is asynchronous on OpenGL, and a no-op on Vulkan and Metal.
+ * A NULL handle indicates that no compilation of any variant was possible (likely because
+ * some required state is not currently available) and so no batch was created. Compilation
+ * of the specialized variant will instead occur at draw/dispatch time.
+ * NOTE: This function is asynchronous on OpenGL and Metal, and a no-op on Vulkan.
  * Batches are processed one by one in FIFO order.
  * WARNING: Binding a specialization before the batch finishes will fail.
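+ *
+ * A minimal polling sketch (assuming the `GPU_shader_batch_specializations` and
+ * `GPU_shader_batch_specializations_is_ready` entry points declared alongside this
+ * comment; `"use_lighting"` is a hypothetical constant name):
+ *
+ *   ShaderSpecialization spec = {shader, {{"use_lighting", true}}};
+ *   SpecializationBatchHandle handle = GPU_shader_batch_specializations({spec});
+ *   while (handle && !GPU_shader_batch_specializations_is_ready(handle)) {
+ *     // Do other work; the handle is reset to zero once the batch is ready.
+ *   }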
 */
diff --git a/source/blender/gpu/intern/gpu_context.cc b/source/blender/gpu/intern/gpu_context.cc
index 4c544276a4d..4bfd8e43ea1 100644
--- a/source/blender/gpu/intern/gpu_context.cc
+++ b/source/blender/gpu/intern/gpu_context.cc
@@ -13,6 +13,8 @@
  * - free can be called from any thread
  */
 
+#include "GHOST_C-api.h"
+
 #include "BKE_global.hh"
 
 #include "BLI_assert.h"
@@ -241,6 +243,17 @@ static eGPUBackendType g_backend_type = GPU_BACKEND_OPENGL;
 static std::optional<eGPUBackendType> g_backend_type_override = std::nullopt;
 static std::optional<bool> g_backend_type_supported = std::nullopt;
 static GPUBackend *g_backend = nullptr;
+static GHOST_SystemHandle g_ghost_system = nullptr;
+
+void GPU_backend_ghost_system_set(void *ghost_system_handle)
+{
+  g_ghost_system = reinterpret_cast<GHOST_SystemHandle>(ghost_system_handle);
+}
+
+void *GPU_backend_ghost_system_get()
+{
+  return g_ghost_system;
+}
 
 void GPU_backend_type_selection_set(const eGPUBackendType backend)
 {
diff --git a/source/blender/gpu/metal/mtl_backend.mm b/source/blender/gpu/metal/mtl_backend.mm
index d856e9aefca..925678d92e8 100644
--- a/source/blender/gpu/metal/mtl_backend.mm
+++ b/source/blender/gpu/metal/mtl_backend.mm
@@ -28,6 +28,7 @@
 #include <Cocoa/Cocoa.h>
 #include <Metal/Metal.h>
 #include <QuartzCore/QuartzCore.h>
+#include <sys/sysctl.h>
 
 namespace blender::gpu {
 
@@ -285,6 +286,64 @@ bool supports_barycentric_whitelist(id<MTLDevice> device)
   return supported_gpu && should_support_barycentrics;
 }
 
+bool is_apple_silicon(id<MTLDevice> device)
+{
+  NSString *gpu_name = [device name];
+  BLI_assert([gpu_name length]);
+
+  const char *vendor = [gpu_name UTF8String];
+
+  /* Known good configs. */
+  return (strstr(vendor, "Apple") || strstr(vendor, "APPLE"));
+}
+
+static int get_num_performance_cpu_cores(id<MTLDevice> device)
+{
+  const int SYSCTL_BUF_LENGTH = 16;
+  int num_performance_cores = -1;
+  unsigned char sysctl_buffer[SYSCTL_BUF_LENGTH];
+  size_t sysctl_buffer_length = SYSCTL_BUF_LENGTH;
+
+  if (is_apple_silicon(device)) {
+    /* On Apple Silicon, query the number of performance cores. */
+    if (sysctlbyname("hw.perflevel0.logicalcpu", &sysctl_buffer, &sysctl_buffer_length, NULL, 0) ==
+        0)
+    {
+      num_performance_cores = sysctl_buffer[0];
+    }
+  }
+  else {
+    /* On Intel, just return the logical core count. */
+    if (sysctlbyname("hw.logicalcpu", &sysctl_buffer, &sysctl_buffer_length, NULL, 0) == 0) {
+      num_performance_cores = sysctl_buffer[0];
+    }
+  }
+  BLI_assert(num_performance_cores != -1);
+  return num_performance_cores;
+}
+
+static int get_num_efficiency_cpu_cores(id<MTLDevice> device)
+{
+  if (is_apple_silicon(device)) {
+    /* On Apple Silicon, query the number of efficiency cores. */
+    const int SYSCTL_BUF_LENGTH = 16;
+    int num_efficiency_cores = -1;
+    unsigned char sysctl_buffer[SYSCTL_BUF_LENGTH];
+    size_t sysctl_buffer_length = SYSCTL_BUF_LENGTH;
+    if (sysctlbyname("hw.perflevel1.logicalcpu", &sysctl_buffer, &sysctl_buffer_length, NULL, 0) ==
+        0)
+    {
+      num_efficiency_cores = sysctl_buffer[0];
+    }
+
+    BLI_assert(num_efficiency_cores != -1);
+    return num_efficiency_cores;
+  }
+  else {
+    return 0;
+  }
+}
+
 bool MTLBackend::metal_is_supported()
 {
   /* Device compatibility information using Metal Feature-set tables.
@@ -392,6 +451,10 @@ void MTLBackend::capabilities_init(MTLContext *ctx)
   }
 #endif
 
+  /* CPU Info. */
+  MTLBackend::capabilities.num_performance_cores = get_num_performance_cpu_cores(ctx->device);
+  MTLBackend::capabilities.num_efficiency_cores = get_num_efficiency_cpu_cores(ctx->device);
+
   /* Common Global Capabilities. */
   GCaps.max_texture_size = ([device supportsFamily:MTLGPUFamilyApple3] ||
                             MTLBackend::capabilities.supports_family_mac1) ?
@@ -430,6 +493,9 @@ void MTLBackend::capabilities_init(MTLContext *ctx)
 
   GCaps.geometry_shader_support = false;
 
+  /* Compile shaders on performance cores, but leave one free so the UI stays responsive. */
+  GCaps.max_parallel_compilations = MTLBackend::capabilities.num_performance_cores - 1;
+
   /* Maximum buffer bindings: 31. Consider required slot for uniforms/UBOs/Vertex attributes.
    * Can use argument buffers if a higher limit is required. */
   GCaps.max_shader_storage_buffer_bindings = 14;
diff --git a/source/blender/gpu/metal/mtl_capabilities.hh b/source/blender/gpu/metal/mtl_capabilities.hh
index bc523423bbb..63393ebb255 100644
--- a/source/blender/gpu/metal/mtl_capabilities.hh
+++ b/source/blender/gpu/metal/mtl_capabilities.hh
@@ -57,6 +57,10 @@ struct MTLCapabilities {
   bool supports_family_mac_catalyst1 = false;
   bool supports_family_mac_catalyst2 = false;
   AppleGPUType gpu = APPLE_GPU_UNKNOWN;
+
+  /* CPU Info. */
+  int num_performance_cores = -1;
+  int num_efficiency_cores = -1;
 };
 
 } // namespace gpu
diff --git a/source/blender/gpu/metal/mtl_context.mm b/source/blender/gpu/metal/mtl_context.mm
index 46fa98fd221..325ee7a252c 100644
--- a/source/blender/gpu/metal/mtl_context.mm
+++ b/source/blender/gpu/metal/mtl_context.mm
@@ -268,7 +268,12 @@ MTLContext::MTLContext(void *ghost_window, void *ghost_context)
   /* Initialize samplers. */
   this->sampler_state_cache_init();
 
-  compiler = new ShaderCompilerGeneric();
+  if (GPU_use_parallel_compilation()) {
+    compiler = new MTLShaderCompiler();
+  }
+  else {
+    compiler = new ShaderCompilerGeneric();
+  }
 }
 
 MTLContext::~MTLContext()
@@ -2217,8 +2222,15 @@ const MTLComputePipelineStateInstance *MTLContext::ensure_compute_pipeline_state
     return nullptr;
   }
 
+  MTLShader *active_shader = this->pipeline_state.active_shader;
+
+  /* Set descriptor to default shader constants. */
+  MTLComputePipelineStateDescriptor compute_pipeline_descriptor(active_shader->constants.values);
+
   const MTLComputePipelineStateInstance *compute_pso_inst =
-      this->pipeline_state.active_shader->bake_compute_pipeline_state(this);
+      this->pipeline_state.active_shader->bake_compute_pipeline_state(this,
+                                                                      compute_pipeline_descriptor);
+
   if (compute_pso_inst == nullptr || compute_pso_inst->pso == nil) {
     MTL_LOG_WARNING("No valid compute PSO for compute dispatch!");
     return nullptr;
diff --git a/source/blender/gpu/metal/mtl_pso_descriptor_state.hh b/source/blender/gpu/metal/mtl_pso_descriptor_state.hh
index 066e6252f0e..40828498490 100644
--- a/source/blender/gpu/metal/mtl_pso_descriptor_state.hh
+++ b/source/blender/gpu/metal/mtl_pso_descriptor_state.hh
@@ -347,6 +347,12 @@ struct MTLComputePipelineStateDescriptor {
   /* Specialization constants map. */
   SpecializationStateDescriptor specialization_state;
 
+  MTLComputePipelineStateDescriptor() {}
+  MTLComputePipelineStateDescriptor(Vector<shader::SpecializationConstant::Value> values)
+  {
+    specialization_state.values = values;
+  }
+
   /* Comparison Operator for caching. */
   bool operator==(const MTLComputePipelineStateDescriptor &other) const
   {
diff --git a/source/blender/gpu/metal/mtl_shader.hh b/source/blender/gpu/metal/mtl_shader.hh
index a23226a55a2..5abdb095f9f 100644
--- a/source/blender/gpu/metal/mtl_shader.hh
+++ b/source/blender/gpu/metal/mtl_shader.hh
@@ -20,6 +20,7 @@
 #include <Metal/Metal.h>
 #include <QuartzCore/QuartzCore.h>
+#include <condition_variable>
 #include <functional>
 #include <unordered_map>
 
@@ -264,9 +265,14 @@ class MTLShader : public Shader {
   void *push_constant_data_ = nullptr;
   bool push_constant_modified_ = false;
 
-  /** Special definition for Max TotalThreadsPerThreadgroup tuning. */
+  /* Special definition for Max TotalThreadsPerThreadgroup tuning.
+   */
   uint maxTotalThreadsPerThreadgroup_Tuning_ = 0;
 
+  /* Set to true when batch compiling. */
+  bool async_compilation_ = false;
+
+  bool finalize_shader(const shader::ShaderCreateInfo *info = nullptr);
+
  public:
   MTLShader(MTLContext *ctx, const char *name);
   MTLShader(MTLContext *ctx,
@@ -278,7 +284,7 @@ class MTLShader : public Shader {
             NSString *fragment_function_name_);
   ~MTLShader();
 
-  void init(const shader::ShaderCreateInfo & /*info*/, bool /*is_batch_compilation*/) override {}
+  void init(const shader::ShaderCreateInfo & /*info*/, bool is_batch_compilation) override;
 
   /* Assign GLSL source. */
   void vertex_shader_from_glsl(MutableSpan<StringRefNull> sources) override;
@@ -296,6 +302,14 @@ class MTLShader : public Shader {
   {
     return valid_;
   }
+  bool has_compute_shader_lib()
+  {
+    return (shader_library_compute_ != nil);
+  }
+  bool has_parent_shader()
+  {
+    return (parent_shader_ != nil);
+  }
   MTLRenderPipelineStateDescriptor &get_current_pipeline_state()
   {
     return current_pipeline_state_;
@@ -375,7 +389,9 @@ class MTLShader : public Shader {
                                   MTLPrimitiveTopologyClass prim_type,
                                   const MTLRenderPipelineStateDescriptor &pipeline_descriptor);
 
-  MTLComputePipelineStateInstance *bake_compute_pipeline_state(MTLContext *ctx);
+  MTLComputePipelineStateInstance *bake_compute_pipeline_state(
+      MTLContext *ctx, MTLComputePipelineStateDescriptor &compute_pipeline_descriptor);
+
   const MTLComputePipelineStateCommon &get_compute_common_state()
   {
     return compute_pso_common_state_;
@@ -392,6 +408,94 @@ class MTLShader : public Shader {
   MEM_CXX_CLASS_ALLOC_FUNCS("MTLShader");
 };
 
+class MTLParallelShaderCompiler {
+ private:
+  enum ParallelWorkType {
+    PARALLELWORKTYPE_UNSPECIFIED,
+    PARALLELWORKTYPE_COMPILE_SHADER,
+    PARALLELWORKTYPE_BAKE_PSO,
+  };
+
+  struct ParallelWork {
+    const shader::ShaderCreateInfo *info = nullptr;
+    class MTLShaderCompiler *shader_compiler = nullptr;
+    MTLShader *shader = nullptr;
+    Vector<shader::SpecializationConstant::Value> specialization_values;
+
+    ParallelWorkType work_type = PARALLELWORKTYPE_UNSPECIFIED;
+    bool is_ready = false;
+  };
+
+  struct Batch {
+    Vector<ParallelWork *> items;
+    bool is_ready = false;
+  };
+
+  std::mutex batch_mutex;
+  BatchHandle next_batch_handle = 1;
+  Map<BatchHandle, Batch> batches;
+
+  std::vector<std::thread> compile_threads;
+
+  volatile bool terminate_compile_threads;
+  std::condition_variable cond_var;
+  std::mutex queue_mutex;
+  std::deque<ParallelWork *> parallel_work_queue;
+
+  void parallel_compilation_thread_func(GPUContext *blender_gpu_context);
+  BatchHandle create_batch(size_t batch_size);
+  void add_item_to_batch(ParallelWork *work_item, BatchHandle batch_handle);
+  void add_parallel_item_to_queue(ParallelWork *work_item, BatchHandle batch_handle);
+
+  std::atomic<int> ref_count = 1;
+
+ public:
+  MTLParallelShaderCompiler();
+  ~MTLParallelShaderCompiler();
+
+  void create_compile_threads();
+  BatchHandle batch_compile(MTLShaderCompiler *shader_compiler,
+                            Span<const shader::ShaderCreateInfo *> &infos);
+  bool batch_is_ready(BatchHandle handle);
+  Vector<Shader *> batch_finalize(BatchHandle &handle);
+
+  SpecializationBatchHandle precompile_specializations(Span<ShaderSpecialization> specializations);
+  bool specialization_batch_is_ready(SpecializationBatchHandle &handle);
+
+  void increment_ref_count()
+  {
+    ref_count++;
+  }
+  void decrement_ref_count()
+  {
+    ref_count--;
+  }
+  int get_ref_count()
+  {
+    return ref_count;
+  }
+};
+
+class MTLShaderCompiler : public ShaderCompiler {
+ private:
+  MTLParallelShaderCompiler *parallel_shader_compiler;
+
+ public:
+  MTLShaderCompiler();
+  virtual ~MTLShaderCompiler() override;
+
+  virtual BatchHandle batch_compile(Span<const shader::ShaderCreateInfo *> &infos) override;
+  virtual bool batch_is_ready(BatchHandle handle) override;
+  virtual Vector<Shader *> batch_finalize(BatchHandle &handle) override;
+
+  virtual SpecializationBatchHandle precompile_specializations(
+      Span<ShaderSpecialization> specializations) override;
+  virtual bool specialization_batch_is_ready(SpecializationBatchHandle &handle) override;
+
+  void release_parallel_shader_compiler();
+};
+
 /* Vertex format conversion.
  * Determines whether it is possible to resize a vertex attribute type
  * during input assembly. A conversion is implied by the difference
diff --git a/source/blender/gpu/metal/mtl_shader.mm b/source/blender/gpu/metal/mtl_shader.mm
index fb97e137bb2..376a85ecf11 100644
--- a/source/blender/gpu/metal/mtl_shader.mm
+++ b/source/blender/gpu/metal/mtl_shader.mm
@@ -8,9 +8,11 @@
 
 #include "BKE_global.hh"
 
-#include "BLI_time.h"
+#include "DNA_userdef_types.h"
 
 #include "BLI_string.h"
+#include "BLI_time.h"
+
 #include <algorithm>
 #include <fstream>
 #include <iostream>
@@ -37,7 +39,9 @@
 #include "mtl_texture.hh"
 #include "mtl_vertex_buffer.hh"
 
-extern char datatoc_mtl_shader_common_msl[];
+#include "GHOST_C-api.h"
+
+extern const char datatoc_mtl_shader_common_msl[];
 
 using namespace blender;
 using namespace blender::gpu;
@@ -168,6 +172,11 @@ MTLShader::~MTLShader()
   }
 }
 
+void MTLShader::init(const shader::ShaderCreateInfo & /*info*/, bool is_batch_compilation)
+{
+  async_compilation_ = is_batch_compilation;
+}
+
 /** \} */
 
 /* -------------------------------------------------------------------- */
@@ -462,7 +471,10 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
     /* If this is a compute shader, bake base PSO for compute straight-away.
      * NOTE: This will compile the base unspecialized variant. */
     if (is_compute) {
-      this->bake_compute_pipeline_state(context_);
+      /* Set descriptor to default shader constants. */
+      MTLComputePipelineStateDescriptor compute_pipeline_descriptor(this->constants.values);
+
+      this->bake_compute_pipeline_state(context_, compute_pipeline_descriptor);
     }
   }
 
@@ -708,6 +720,8 @@ void MTLShader::push_constant_bindstate_mark_dirty(bool is_dirty)
   push_constant_modified_ = is_dirty;
 }
 
+/* Attempts to pre-generate a PSO based on the parent shader's PSO.
+ * (Render shaders only.) */
 void MTLShader::warm_cache(int limit)
 {
   if (parent_shader_ != nullptr) {
@@ -1450,7 +1464,8 @@ MTLRenderPipelineStateInstance *MTLShader::bake_pipeline_state(
   }
 }
 
-MTLComputePipelineStateInstance *MTLShader::bake_compute_pipeline_state(MTLContext *ctx)
+MTLComputePipelineStateInstance *MTLShader::bake_compute_pipeline_state(
+    MTLContext *ctx, MTLComputePipelineStateDescriptor &compute_pipeline_descriptor)
 {
   /* NOTE(Metal): Bakes and caches a PSO for compute. */
   BLI_assert(this);
@@ -1459,13 +1474,6 @@ MTLComputePipelineStateInstance *MTLShader::bake_compute_pipeline_state(MTLConte
   BLI_assert(this->is_valid());
   BLI_assert(shader_library_compute_ != nil);
 
-  /* Evaluate descriptor for specialization constants. */
-  MTLComputePipelineStateDescriptor compute_pipeline_descriptor;
-
-  /* Specialization configuration.
-   * NOTE: If allow_specialized is disabled, we will build the base un-specialized variant. */
-  compute_pipeline_descriptor.specialization_state = {this->constants.values};
-
   /* Check if current PSO exists in the cache. */
   pso_cache_lock_.lock();
   MTLComputePipelineStateInstance **pso_lookup = compute_pso_cache_.lookup_ptr(
       compute_pipeline_descriptor);
@@ -1806,4 +1814,421 @@ bool MTLShader::has_transform_feedback_varying(std::string str)
          tf_output_name_list_.end());
 }
 
+/** \} */
+
+/* Since this is going to be compiling shaders in a multi-threaded fashion, we
+ * don't want to create an instance per context, as we want to restrict the
+ * number of simultaneous compilation threads to ensure system responsiveness.
+ * Hence the global shared instance. */
+MTLParallelShaderCompiler *g_shared_parallel_shader_compiler = nullptr;
+std::mutex g_shared_parallel_shader_compiler_mutex;
+
+MTLParallelShaderCompiler *get_shared_parallel_shader_compiler()
+{
+  std::scoped_lock lock(g_shared_parallel_shader_compiler_mutex);
+
+  if (!g_shared_parallel_shader_compiler) {
+    g_shared_parallel_shader_compiler = new MTLParallelShaderCompiler();
+  }
+  else {
+    g_shared_parallel_shader_compiler->increment_ref_count();
+  }
+  return g_shared_parallel_shader_compiler;
+}
+
+void release_shared_parallel_shader_compiler()
+{
+  std::scoped_lock lock(g_shared_parallel_shader_compiler_mutex);
+
+  if (!g_shared_parallel_shader_compiler) {
+    return;
+  }
+
+  g_shared_parallel_shader_compiler->decrement_ref_count();
+  if (g_shared_parallel_shader_compiler->get_ref_count() == 0) {
+    delete g_shared_parallel_shader_compiler;
+    g_shared_parallel_shader_compiler = nullptr;
+  }
+}
+
+/* -------------------------------------------------------------------- */
+/** \name MTLParallelShaderCompiler
+ * \{ */
+
+MTLParallelShaderCompiler::MTLParallelShaderCompiler()
+{
+  BLI_assert(GPU_use_parallel_compilation());
+
+  terminate_compile_threads = false;
+}
+
+MTLParallelShaderCompiler::~MTLParallelShaderCompiler()
+{
+  BLI_assert(batches.is_empty());
+  terminate_compile_threads = true;
+  cond_var.notify_all();
+
+  for (auto &thread : compile_threads) {
+    thread.join();
+  }
+}
+
+void MTLParallelShaderCompiler::create_compile_threads()
+{
+  std::unique_lock lock(queue_mutex);
+
+  /* Return if the compilation threads already exist. */
+  if (!compile_threads.empty()) {
+    return;
+  }
+
+  /* Limit the number of compiler threads to (performance cores - 1) to
+   * leave one thread free for main-thread/UI responsiveness. */
+  const MTLCapabilities &capabilities = MTLBackend::get_capabilities();
+  int max_mtlcompiler_threads = capabilities.num_performance_cores - 1;
+
+  /* Save the main thread context. */
+  GPUContext *main_thread_context = GPU_context_active_get();
+  MTLContext *metal_context = static_cast<MTLContext *>(unwrap(main_thread_context));
+  id<MTLDevice> metal_device = metal_context->device;
+
+#if defined(MAC_OS_VERSION_13_3)
+  /* Clamp the number of threads if necessary. */
+  if (@available(macOS 13.3, *)) {
+    /* Check we've set the flag to allow more than 2 compile threads. */
+    BLI_assert(metal_device.shouldMaximizeConcurrentCompilation);
+    max_mtlcompiler_threads = MIN(int([metal_device maximumConcurrentCompilationTaskCount]),
+                                  max_mtlcompiler_threads);
+  }
+#endif
+
+  /* GPU settings for context creation. */
+  GHOST_GPUSettings gpuSettings = {0};
+  gpuSettings.context_type = GHOST_kDrawingContextTypeMetal;
+  if (G.debug & G_DEBUG_GPU) {
+    gpuSettings.flags |= GHOST_gpuDebugContext;
+  }
+  gpuSettings.preferred_device.index = U.gpu_preferred_index;
+  gpuSettings.preferred_device.vendor_id = U.gpu_preferred_vendor_id;
+  gpuSettings.preferred_device.device_id = U.gpu_preferred_device_id;
+
+  /* Spawn the compiler threads. */
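+  /* Each compile thread gets a dedicated GPU context (created below): a GPU context
+   * can only be active on one thread at a time, so the workers cannot share the
+   * main thread's context. */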
+  for (int i = 0; i < max_mtlcompiler_threads; i++) {
+
+    /* Grab the system handle. */
+    GHOST_SystemHandle ghost_system = reinterpret_cast<GHOST_SystemHandle>(
+        GPU_backend_ghost_system_get());
+    BLI_assert(ghost_system);
+
+    /* Create a Ghost GPU Context using the system handle. */
+    GHOST_ContextHandle ghost_gpu_context = GHOST_CreateGPUContext(ghost_system, gpuSettings);
+
+    /* Create a GPU context for the compile thread to use. */
+    GPUContext *per_thread_context = GPU_context_create(nullptr, ghost_gpu_context);
+
+    /* Restore the main thread context.
+     * (Required as the above context creation also makes it active.) */
+    GPU_context_active_set(main_thread_context);
+
+    /* Create a new thread. */
+    compile_threads.push_back(std::thread([this, per_thread_context] {
+      this->parallel_compilation_thread_func(per_thread_context);
+    }));
+  }
+}
+
+void MTLParallelShaderCompiler::parallel_compilation_thread_func(GPUContext *blender_gpu_context)
+{
+  /* Contexts can only be created on the main thread, so we have to
+   * pass one in and make it active here. */
+  GPU_context_active_set(blender_gpu_context);
+
+  MTLContext *metal_context = static_cast<MTLContext *>(unwrap(blender_gpu_context));
+  MTLShaderCompiler *shader_compiler = static_cast<MTLShaderCompiler *>(metal_context->compiler);
+
+  /* This context is only for compilation; it does not need its own instance of the compiler. */
+  shader_compiler->release_parallel_shader_compiler();
+
+  /* Loop until we get the terminate signal. */
+  while (!terminate_compile_threads) {
+    /* Grab the next shader off of the queue or wait... */
+    ParallelWork *work_item = nullptr;
+    {
+      std::unique_lock lock(queue_mutex);
+      cond_var.wait(lock,
+                    [&] { return terminate_compile_threads || !parallel_work_queue.empty(); });
+      if (terminate_compile_threads || parallel_work_queue.empty()) {
+        continue;
+      }
+      work_item = parallel_work_queue.front();
+      parallel_work_queue.pop_front();
+    }
+
+    /* Compile a shader. */
+    if (work_item->work_type == PARALLELWORKTYPE_COMPILE_SHADER) {
+      BLI_assert(work_item->info);
+
+      const shader::ShaderCreateInfo *shader_info = work_item->info;
+      work_item->shader = static_cast<MTLShader *>(
+          work_item->shader_compiler->compile(*shader_info, true));
+
+      if (work_item->shader) {
+        /* Generate and cache any render PSOs if possible (typically materials only).
+         * (finalize() will already have baked a compute PSO if possible.) */
+        work_item->shader->warm_cache(-1);
+      }
+    }
+    /* Bake PSO. */
+    else if (work_item->work_type == PARALLELWORKTYPE_BAKE_PSO) {
+      MTLShader *shader = work_item->shader;
+      /* Currently only compute shaders are supported. */
+      BLI_assert(shader && shader->has_compute_shader_lib());
+
+      /* Create descriptor using these specialization constants. */
+      MTLComputePipelineStateDescriptor compute_pipeline_descriptor(
+          work_item->specialization_values);
+
+      shader->bake_compute_pipeline_state(metal_context, compute_pipeline_descriptor);
+    }
+    else {
+      BLI_assert(false);
+    }
+    work_item->is_ready = true;
+  }
+
+  GPU_context_discard(blender_gpu_context);
+}
+
+BatchHandle MTLParallelShaderCompiler::create_batch(size_t batch_size)
+{
+  std::scoped_lock lock(batch_mutex);
+  BatchHandle batch_handle = next_batch_handle++;
+  batches.add(batch_handle, {});
+  Batch &batch = batches.lookup(batch_handle);
+  if (batch_size) {
+    batch.items.reserve(batch_size);
+  }
+  batch.is_ready = false;
+  shader_debug_printf("Created batch %llu\n", batch_handle);
+  return batch_handle;
+}
+
+void MTLParallelShaderCompiler::add_item_to_batch(ParallelWork *work_item,
+                                                  BatchHandle batch_handle)
+{
+  std::scoped_lock lock(batch_mutex);
+  Batch &batch = batches.lookup(batch_handle);
+  batch.items.append(work_item);
+}
+
+void MTLParallelShaderCompiler::add_parallel_item_to_queue(ParallelWork *work_item,
+                                                           BatchHandle batch_handle)
+{
+  shader_debug_printf("Request add shader work\n");
+  if (!terminate_compile_threads) {
+
+    /* Defer creation of compilation threads until required. */
+    if (compile_threads.empty()) {
+      create_compile_threads();
+    }
+
+    add_item_to_batch(work_item, batch_handle);
+    std::lock_guard lock(queue_mutex);
+    parallel_work_queue.push_back(work_item);
+    cond_var.notify_one();
+  }
+}
+
+BatchHandle MTLParallelShaderCompiler::batch_compile(MTLShaderCompiler *shader_compiler,
+                                                     Span<const shader::ShaderCreateInfo *> &infos)
+{
+  BLI_assert(GPU_use_parallel_compilation());
+
+  BatchHandle batch_handle = create_batch(infos.size());
+
+  shader_debug_printf("Batch compile %llu shaders (Batch = %llu)\n", infos.size(), batch_handle);
+
+  /* Have to finalize all shader infos *before* any parallel compilation, as
+   * ShaderCreateInfo::finalize() is not thread-safe. */
+  for (const shader::ShaderCreateInfo *info : infos) {
+    const_cast<shader::ShaderCreateInfo *>(info)->finalize();
+  }
+
+  for (const shader::ShaderCreateInfo *info : infos) {
+    ParallelWork *work_item = new ParallelWork;
+    work_item->info = info;
+    work_item->shader_compiler = shader_compiler;
+    work_item->is_ready = false;
+    work_item->shader = nullptr;
+    work_item->work_type = PARALLELWORKTYPE_COMPILE_SHADER;
+    add_parallel_item_to_queue(work_item, batch_handle);
+  }
+
+  return batch_handle;
+}
+
+bool MTLParallelShaderCompiler::batch_is_ready(BatchHandle handle)
+{
+  std::scoped_lock lock(batch_mutex);
+  Batch &batch = batches.lookup(handle);
+  if (batch.is_ready) {
+    return true;
+  }
+
+  for (ParallelWork *item : batch.items) {
+    if (!item->is_ready) {
+      return false;
+    }
+  }
+
+  batch.is_ready = true;
+  shader_debug_printf("Batch %llu is now ready\n", handle);
+  return batch.is_ready;
+}
+
+Vector<Shader *> MTLParallelShaderCompiler::batch_finalize(BatchHandle &handle)
+{
+  while (!batch_is_ready(handle)) {
+    BLI_time_sleep_ms(1);
+  }
+  std::scoped_lock lock(batch_mutex);
+
+  Batch batch = batches.pop(handle);
+  Vector<Shader *> result;
+  for (ParallelWork *item : batch.items) {
+    result.append(item->shader);
+    delete item;
+  }
+  handle = 0;
+  return result;
+}
+
+SpecializationBatchHandle MTLParallelShaderCompiler::precompile_specializations(
+    Span<ShaderSpecialization> specializations)
+{
+  BLI_assert(GPU_use_parallel_compilation());
+  /* Zero indicates no batch was created. */
+  SpecializationBatchHandle batch_handle = 0;
+
+  for (auto &specialization : specializations) {
+    MTLShader *sh = static_cast<MTLShader *>(unwrap(specialization.shader));
+
+    /* Specialization constants only take effect when we create the PSO.
+     * We don't have the relevant info to create a render PSO descriptor unless
+     * the shader has a parent shader, but in that case it would (currently) be
+     * invalid to apply specialization constants. For those reasons we currently only
+     * support precompilation of compute shaders.
+     * (Technically we could call makeFunction, but the benefit would likely be minimal.) */
+    if (!sh->has_compute_shader_lib()) {
+      continue;
+    }
+
+    BLI_assert_msg(sh->is_valid(), "Shader must be finalized before precompiling specializations");
+
+    /* Defer batch creation until we have some work to do. */
+    if (!batch_handle) {
+      batch_handle = create_batch(1);
+    }
+
+    ParallelWork *work_item = new ParallelWork;
+    work_item->info = nullptr;
+    work_item->is_ready = false;
+    work_item->shader = sh;
+    work_item->work_type = PARALLELWORKTYPE_BAKE_PSO;
+
+    /* Start from the shader's default constant values, then apply the requested
+     * specialization constants on top. */
+    work_item->specialization_values = sh->constants.values;
+    for (const SpecializationConstant &constant : specialization.constants) {
+      const ShaderInput *input = sh->interface->constant_get(constant.name.c_str());
+      BLI_assert_msg(input != nullptr, "The specialization constant doesn't exist");
+      work_item->specialization_values[input->location].u = constant.value.u;
+    }
+    sh->constants.is_dirty = true;
+
+    add_parallel_item_to_queue(work_item, batch_handle);
+  }
+  return batch_handle;
+}
+
+bool MTLParallelShaderCompiler::specialization_batch_is_ready(SpecializationBatchHandle &handle)
+{
+  /* Check the empty-batch case, where we have no handle. */
+  if (!handle) {
+    return true;
+  }
+
+  std::scoped_lock lock(batch_mutex);
+  Batch &batch = batches.lookup(handle);
+
+  for (ParallelWork *item : batch.items) {
+    if (!item->is_ready) {
+      return false;
+    }
+  }
+
+  shader_debug_printf("Specialization Batch %llu is now ready\n", handle);
+  /* The handle is zeroed once the batch is ready. Specialization batches are never
+   * finalized, so drop the batch and its work items here to avoid leaking them. */
+  for (ParallelWork *item : batch.items) {
+    delete item;
+  }
+  batches.remove(handle);
+  handle = 0;
+  return true;
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name MTLShaderCompiler
+ * \{ */
+
+MTLShaderCompiler::MTLShaderCompiler()
+{
+  parallel_shader_compiler = get_shared_parallel_shader_compiler();
+}
+
+MTLShaderCompiler::~MTLShaderCompiler()
+{
+  release_parallel_shader_compiler();
+}
+
+void MTLShaderCompiler::release_parallel_shader_compiler()
+{
+  if (parallel_shader_compiler) {
+    release_shared_parallel_shader_compiler();
+    parallel_shader_compiler = nullptr;
+  }
+}
+
+BatchHandle MTLShaderCompiler::batch_compile(Span<const shader::ShaderCreateInfo *> &infos)
+{
+  BLI_assert(parallel_shader_compiler);
+  return parallel_shader_compiler->batch_compile(this, infos);
+}
+
+bool MTLShaderCompiler::batch_is_ready(BatchHandle handle)
+{
+  return parallel_shader_compiler->batch_is_ready(handle);
+}
+
+Vector<Shader *> MTLShaderCompiler::batch_finalize(BatchHandle &handle)
+{
+  return parallel_shader_compiler->batch_finalize(handle);
+}
+
+SpecializationBatchHandle MTLShaderCompiler::precompile_specializations(
+    Span<ShaderSpecialization> specializations)
+{
+  return parallel_shader_compiler->precompile_specializations(specializations);
+}
+
+bool MTLShaderCompiler::specialization_batch_is_ready(SpecializationBatchHandle &handle)
+{
+  return parallel_shader_compiler->specialization_batch_is_ready(handle);
+}
+
+/** \} */
+
 } // namespace blender::gpu
diff --git a/source/blender/gpu/tests/gpu_testing.cc b/source/blender/gpu/tests/gpu_testing.cc
index fbe2e74c86d..47a86b50b82 100644
--- a/source/blender/gpu/tests/gpu_testing.cc
+++ b/source/blender/gpu/tests/gpu_testing.cc
@@ -28,6 +28,7 @@ void GPUTest::SetUp()
   gpuSettings.context_type = draw_context_type;
   gpuSettings.flags = GHOST_gpuDebugContext;
   ghost_system = GHOST_CreateSystem();
+  GPU_backend_ghost_system_set(ghost_system);
   ghost_context = GHOST_CreateGPUContext(ghost_system, gpuSettings);
   GHOST_ActivateGPUContext(ghost_context);
   context = GPU_context_create(nullptr, ghost_context);
diff --git a/source/blender/windowmanager/intern/wm_playanim.cc b/source/blender/windowmanager/intern/wm_playanim.cc
index 302f9667328..c49456b20cd 100644
--- a/source/blender/windowmanager/intern/wm_playanim.cc
+++ b/source/blender/windowmanager/intern/wm_playanim.cc
@@ -1844,6 +1844,7 @@ static bool wm_main_playanim_intern(int argc, const char **argv, PlayArgs *args_
   GHOST_SetBacktraceHandler((GHOST_TBacktraceFn)BLI_system_backtrace);
 
   ps.ghost_data.system = GHOST_CreateSystem();
+  GPU_backend_ghost_system_set(ps.ghost_data.system);
 
   if (UNLIKELY(ps.ghost_data.system == nullptr)) {
     /* GHOST will have reported the back-ends that failed to load. */
diff --git a/source/blender/windowmanager/intern/wm_window.cc b/source/blender/windowmanager/intern/wm_window.cc
index 7bd6525ba91..6aacb7f7072 100644
--- a/source/blender/windowmanager/intern/wm_window.cc
+++ b/source/blender/windowmanager/intern/wm_window.cc
@@ -1889,6 +1889,7 @@ void wm_ghost_init(bContext *C)
   GHOST_SetBacktraceHandler((GHOST_TBacktraceFn)BLI_system_backtrace);
 
   g_system = GHOST_CreateSystem();
+  GPU_backend_ghost_system_set(g_system);
 
   if (UNLIKELY(g_system == nullptr)) {
     /* GHOST will have reported the back-ends that failed to load. */