diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index eff6384c85e..a27a75e48fa 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1543,6 +1543,17 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         default=False,
     )
 
+    kernel_optimization_level: EnumProperty(
+        name="Kernel Optimization",
+        description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. If optimized kernels are not available, rendering will proceed using generic kernels until the optimized set is available in the cache. This can result in additional CPU usage for a brief time (tens of seconds).",
+        default='FULL',
+        items=(
+            ('OFF', "Off", "Disable kernel optimization. Slowest rendering, no extra background CPU usage"),
+            ('INTERSECT', "Intersection only", "Optimize only intersection kernels. Faster rendering, negligible extra background CPU usage"),
+            ('FULL', "Full", "Optimize all kernels. Fastest rendering, may result in extra background CPU usage"),
+        ),
+    )
+
     def find_existing_device_entry(self, device):
         for device_entry in self.devices:
             if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1711,10 +1722,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         if compute_device_type == 'METAL':
             import platform
             # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
+            # Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
             if platform.machine() == 'arm64':
-                row = layout.row()
-                row.use_property_split = True
-                row.prop(self, "use_metalrt")
+                col = layout.column()
+                col.use_property_split = True
+                col.prop(self, "kernel_optimization_level")
+                col.prop(self, "use_metalrt")
 
     def draw(self, context):
         self.draw_impl(self.layout, context)
diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp
index 22beca898f1..96e7bdd03aa 100644
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -30,7 +30,7 @@ int blender_device_threads(BL::Scene &b_scene)
   return 0;
 }
 
-DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background)
+DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background, bool preview)
 {
   PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
 
@@ -113,6 +113,18 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
     device.use_metalrt = true;
   }
 
+  if (preview) {
+    /* Disable specialization for preview renders. */
+    device.kernel_optimization_level = KERNEL_OPTIMIZATION_LEVEL_OFF;
+  }
+  else {
+    device.kernel_optimization_level = (KernelOptimizationLevel)get_enum(
+        cpreferences,
+        "kernel_optimization_level",
+        KERNEL_OPTIMIZATION_NUM_LEVELS,
+        KERNEL_OPTIMIZATION_LEVEL_FULL);
+  }
+
   return device;
 }
 
diff --git a/intern/cycles/blender/device.h b/intern/cycles/blender/device.h
index 7a762261829..08655743eeb 100644
--- a/intern/cycles/blender/device.h
+++ b/intern/cycles/blender/device.h
@@ -19,7 +19,8 @@ int blender_device_threads(BL::Scene &b_scene);
 
 /* Convert Blender settings to device specification. */
 DeviceInfo blender_device_info(BL::Preferences &b_preferences,
                                BL::Scene &b_scene,
-                               bool background);
+                               bool background,
+                               bool preview);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index cfc7a78143c..96cb204be4b 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -754,7 +754,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
   RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr);
   BL::Scene b_scene(sceneptr);
 
-  DeviceInfo device = blender_device_info(b_preferences, b_scene, true);
+  DeviceInfo device = blender_device_info(b_preferences, b_scene, true, true);
 
   /* Get denoising parameters from view layer. */
   PointerRNA viewlayerptr;
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index d87d094dc56..45fe4334f06 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -866,7 +866,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
 
   /* Device */
   params.threads = blender_device_threads(b_scene);
-  params.device = blender_device_info(b_preferences, b_scene, params.background);
+  params.device = blender_device_info(b_preferences, b_scene, params.background, b_engine.is_preview());
 
   /* samples */
   int samples = get_int(cscene, "samples");
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ad625fc5a47..3923698b1cd 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -57,6 +57,14 @@ enum DeviceTypeMask {
 
 #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
 
+enum KernelOptimizationLevel {
+  KERNEL_OPTIMIZATION_LEVEL_OFF = 0,
+  KERNEL_OPTIMIZATION_LEVEL_INTERSECT = 1,
+  KERNEL_OPTIMIZATION_LEVEL_FULL = 2,
+
+  KERNEL_OPTIMIZATION_NUM_LEVELS
+};
+
 class DeviceInfo {
  public:
   DeviceType type;
@@ -66,13 +74,15 @@ class DeviceInfo {
   bool display_device;        /* GPU is used as a display device. */
   bool has_nanovdb;           /* Support NanoVDB volumes. */
   bool has_light_tree;        /* Support light tree. */
-  bool has_osl;               /* Support Open Shading Language. */
-  bool has_guiding;           /* Support path guiding. */
-  bool has_profiling;         /* Supports runtime collection of profiling info. */
-  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
-  bool has_gpu_queue;         /* Device supports GPU queue. */
-  bool use_metalrt;           /* Use MetalRT to accelerate ray queries (Metal only). */
-  DenoiserTypeMask denoisers; /* Supported denoiser types. */
+  bool has_osl;               /* Support Open Shading Language. */
+  bool has_guiding;           /* Support path guiding. */
+  bool has_profiling;         /* Supports runtime collection of profiling info. */
+  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
+  bool has_gpu_queue;         /* Device supports GPU queue. */
+  bool use_metalrt;           /* Use MetalRT to accelerate ray queries (Metal only). */
+  KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
+                                                        kernels (Metal only). */
+  DenoiserTypeMask denoisers; /* Supported denoiser types. */
   int cpu_threads;
   vector<DeviceInfo> multi_devices;
   string error_msg;
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index a6966bf167d..01578155931 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -110,10 +110,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     case METAL_GPU_APPLE: {
       max_threads_per_threadgroup = 512;
       use_metalrt = info.use_metalrt;
-
-      /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
-       * quickly. */
-      kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
       break;
     }
   }
@@ -126,6 +122,22 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     capture_enabled = true;
   }
 
+  if (device_vendor == METAL_GPU_APPLE) {
+    /* Set kernel_specialization_level based on user prefs. */
+    switch (info.kernel_optimization_level) {
+      case KERNEL_OPTIMIZATION_LEVEL_OFF:
+        kernel_specialization_level = PSO_GENERIC;
+        break;
+      default:
+      case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
+        kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
+        break;
+      case KERNEL_OPTIMIZATION_LEVEL_FULL:
+        kernel_specialization_level = PSO_SPECIALIZED_SHADE;
+        break;
+    }
+  }
+
   if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
     kernel_specialization_level = (MetalPipelineType)atoi(envstr);
   }
@@ -444,7 +456,7 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
                  source);
   }
 
-  const double starttime = time_dt();
+  double starttime = time_dt();
 
   NSError *error = NULL;
   id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
@@ -457,6 +469,12 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
 
   [options release];
 
+  bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") || MetalDeviceKernels::is_benchmark_warmup());
+  if (blocking_pso_build) {
+    MetalDeviceKernels::wait_for_all();
+    starttime = 0.0;
+  }
+
   /* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
    * exists). */
   {
@@ -464,6 +482,8 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
     if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
       if (mtlLibrary) {
         instance->mtlLibrary[pso_type] = mtlLibrary;
+
+        starttime = time_dt();
         MetalDeviceKernels::load(instance, pso_type);
       }
       else {
@@ -472,6 +492,14 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
       }
     }
   }
+
+  if (starttime && blocking_pso_build) {
+    MetalDeviceKernels::wait_for_all();
+
+    metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+                 time_dt() - starttime,
+                 kernel_type_as_string(pso_type));
+  }
 }
 
 void MetalDevice::load_texture_info()
@@ -832,10 +860,8 @@ void MetalDevice::optimize_for_scene(Scene *scene)
   }
 
   /* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
-  for (int i = 0; i < *_NSGetArgc(); i++) {
-    if (!strcmp((*_NSGetArgv())[i], "--warm-up")) {
-      specialize_in_background = false;
-    }
+  if (MetalDeviceKernels::is_benchmark_warmup()) {
+    specialize_in_background = false;
   }
 
   if (specialize_in_background) {
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index c2b9f073b12..212671f52a0 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -101,6 +101,8 @@ int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_typ
 bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
 bool load(MetalDevice *device, MetalPipelineType pso_type);
 const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
+void wait_for_all();
+bool is_benchmark_warmup();
 
 } /* namespace MetalDeviceKernels */
 
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index ec2025e78fe..7b26085c75f 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -116,19 +116,29 @@ struct ShaderCache {
 };
 
 bool ShaderCache::running = true;
-std::mutex g_shaderCacheMutex;
-std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
+
+const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
+using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
+int g_shaderCacheCount = 0;
+DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];
 
 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
 {
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  auto it = g_shaderCache.find(mtlDevice);
-  if (it != g_shaderCache.end()) {
-    return it->second.get();
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    if (g_shaderCache[i].first == mtlDevice) {
+      return g_shaderCache[i].second.get();
+    }
   }
-  g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
-  return g_shaderCache[mtlDevice].get();
+  static thread_mutex g_shaderCacheCountMutex;
+  g_shaderCacheCountMutex.lock();
+  int index = g_shaderCacheCount++;
+  g_shaderCacheCountMutex.unlock();
+
+  assert(index < MAX_POSSIBLE_GPUS_ON_SYSTEM);
+  g_shaderCache[index].first = mtlDevice;
+  g_shaderCache[index].second = make_unique<ShaderCache>(mtlDevice);
+  return g_shaderCache[index].second.get();
 }
 
 ShaderCache::~ShaderCache()
 {
@@ -145,7 +155,7 @@ ShaderCache::~ShaderCache()
     num_incomplete = int(incomplete_requests);
   }
 
-  if (num_incomplete) {
+  if (num_incomplete && !MetalDeviceKernels::is_benchmark_warmup()) {
     metal_printf("ShaderCache still busy (incomplete_requests = %d). Terminating...\n",
                  num_incomplete);
     std::terminate();
   }
@@ -332,12 +342,6 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
 
 MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
 {
-  thread_scoped_lock lock(cache_mutex);
-  auto &collection = pipelines[kernel];
-  if (collection.empty()) {
-    return nullptr;
-  }
-
   /* metalrt options */
   bool use_metalrt = device->use_metalrt;
   bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR;
@@ -349,34 +353,43 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
                                      device->kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
 
   MetalKernelPipeline *best_pipeline = nullptr;
-  for (auto &pipeline : collection) {
-    if (!pipeline->loaded) {
-      /* still loading - ignore */
-      continue;
-    }
+  while(!best_pipeline) {
+    {
+      thread_scoped_lock lock(cache_mutex);
+      for (auto &pipeline : pipelines[kernel]) {
+        if (!pipeline->loaded) {
+          /* still loading - ignore */
+          continue;
+        }
 
-    bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
-    bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
-    bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
-    bool pipeline_metalrt_motion = use_metalrt &&
-                                   pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;
+        bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
+        bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
+        bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
+        bool pipeline_metalrt_motion = use_metalrt &&
+                                       pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;
 
-    if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
-        pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
-        pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
-        pipeline_metalrt_motion != device_metalrt_motion) {
-      /* wrong combination of metalrt options */
-      continue;
-    }
+        if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
+            pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
+            pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
+            pipeline_metalrt_motion != device_metalrt_motion) {
+          /* wrong combination of metalrt options */
+          continue;
+        }
 
-    if (pipeline->pso_type != PSO_GENERIC) {
-      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
-          pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
-        best_pipeline = pipeline.get();
+        if (pipeline->pso_type != PSO_GENERIC) {
+          if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+              pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
+            best_pipeline = pipeline.get();
+          }
+        }
+        else if (!best_pipeline) {
+          best_pipeline = pipeline.get();
+        }
       }
     }
-    else if (!best_pipeline) {
-      best_pipeline = pipeline.get();
+
+    if (!best_pipeline) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
     }
   }
@@ -802,28 +815,26 @@ void MetalKernelPipeline::compile()
 
 bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
 {
-  const double starttime = time_dt();
   auto shader_cache = get_shader_cache(device->mtlDevice);
   for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
     shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
   }
-
-  if (getenv("CYCLES_METAL_PROFILING")) {
-    shader_cache->wait_for_all();
-
-    metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
-                 time_dt() - starttime,
-                 kernel_type_as_string(pso_type));
-  }
   return true;
 }
 
+void MetalDeviceKernels::wait_for_all()
+{
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    g_shaderCache[i].second->wait_for_all();
+  }
+}
+
 bool MetalDeviceKernels::any_specialization_happening_now()
 {
   /* Return true if any ShaderCaches have ongoing specialization requests (typically there will be
    * only 1). */
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  for (auto &it : g_shaderCache) {
-    if (it.second->incomplete_specialization_requests > 0) {
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    if (g_shaderCache[i].second->incomplete_specialization_requests > 0) {
       return true;
     }
   }
@@ -854,6 +865,19 @@ const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevi
   return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
 }
 
+bool MetalDeviceKernels::is_benchmark_warmup()
+{
+  NSArray *args = [[NSProcessInfo processInfo] arguments];
+  for (int i = 0; i < args.count; i++) {
+    if (!strcmp(args[i].UTF8String, "--warm-up")) {
+      return true;
+    }
+  }
+  return false;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ ... @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
   if (@available(macos 10.14, *)) {
     if (timing_shared_event_) {
       /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
-      if (mtlComputeEncoder_) {
-        close_compute_encoder();
-      }
+      close_compute_encoder();
     }
 
     if (mtlComputeEncoder_) {
@@ -897,9 +896,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
     return mtlBlitEncoder_;
   }
 
-  if (mtlComputeEncoder_) {
-    close_compute_encoder();
-  }
+  close_compute_encoder();
 
   if (!mtlCommandBuffer_) {
     mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
@@ -913,12 +910,14 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
 
 void MetalDeviceQueue::close_compute_encoder()
 {
-  [mtlComputeEncoder_ endEncoding];
-  mtlComputeEncoder_ = nil;
+  if (mtlComputeEncoder_) {
+    [mtlComputeEncoder_ endEncoding];
+    mtlComputeEncoder_ = nil;
 
-  if (@available(macos 10.14, *)) {
-    if (timing_shared_event_) {
-      [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
+    if (@available(macos 10.14, *)) {
+      if (timing_shared_event_) {
+        [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
+      }
     }
   }
 }
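
For reference, a minimal usage sketch (not part of the patch) of the preference added in the properties.py hunk above, as it could be driven from Blender's Python console. It assumes the Cycles add-on is registered under the key "cycles" and that compute_device_type is the existing CyclesPreferences backend selector; only kernel_optimization_level is introduced by this patch.

    # Hypothetical illustration only: exercises the new kernel_optimization_level
    # EnumProperty ('OFF' / 'INTERSECT' / 'FULL') defined in properties.py above.
    import bpy

    prefs = bpy.context.preferences.addons["cycles"].preferences
    prefs.compute_device_type = 'METAL'            # existing backend selector (assumed available)
    prefs.kernel_optimization_level = 'INTERSECT'  # intersection-only specialization

Note that the preview/final split added to blender_device_info() means the preference only affects final renders; preview (viewport) sessions always run with KERNEL_OPTIMIZATION_LEVEL_OFF.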