Merge branch 'blender-v4.2-release'
@@ -57,6 +57,9 @@ class BVHMetal : public BVH {
                  Geometry *const geom,
                  bool refit);
   bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+
+  API_AVAILABLE(macos(11.0))
+  void set_accel_struct(id<MTLAccelerationStructure> new_accel_struct);
 };
 
 CCL_NAMESPACE_END
@@ -119,17 +119,27 @@ BVHMetal::BVHMetal(const BVHParams &params_,
 
 BVHMetal::~BVHMetal()
 {
-  /* Clear point used by enqueueing. */
-  device->release_bvh(this);
-
   if (@available(macos 12.0, *)) {
-    if (accel_struct) {
-      device->stats.mem_free(accel_struct.allocatedSize);
-      [accel_struct release];
-    }
+    set_accel_struct(nil);
     if (null_BLAS) {
       [null_BLAS release];
     }
   }
 }
 
+API_AVAILABLE(macos(11.0))
+void BVHMetal::set_accel_struct(id<MTLAccelerationStructure> new_accel_struct)
+{
+  if (@available(macos 12.0, *)) {
+    if (accel_struct) {
+      device->stats.mem_free(accel_struct.allocatedSize);
+      [accel_struct release];
+      accel_struct = nil;
+    }
+
+    if (new_accel_struct) {
+      accel_struct = new_accel_struct;
+      device->stats.mem_alloc(accel_struct.allocatedSize);
+    }
+  }
+}
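
The new set_accel_struct() becomes the single owner of the acceleration-structure reference and of the memory-stats accounting: it un-counts and releases any previous structure before adopting the caller's reference (note that it takes ownership rather than retaining), and the destructor reduces to set_accel_struct(nil). The same setter pattern in isolation, as a minimal sketch under manual reference counting (the AccelHolder type and tracked_bytes counter are illustrative, not Cycles API):

#include <Metal/Metal.h>

struct AccelHolder {
  id<MTLAccelerationStructure> accel = nil;
  size_t tracked_bytes = 0;

  void set(id<MTLAccelerationStructure> new_accel)
  {
    if (accel) {
      tracked_bytes -= accel.allocatedSize; /* un-count the outgoing structure */
      [accel release];
      accel = nil;
    }
    if (new_accel) {
      accel = new_accel; /* adopt the caller's reference, as in the diff */
      tracked_bytes += accel.allocatedSize;
    }
  }

  ~AccelHolder()
  {
    set(nil); /* teardown is just "set to nothing" */
  }
};
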
@@ -325,9 +335,7 @@ bool BVHMetal::build_BLAS_mesh(Progress &progress,
                      toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -338,10 +346,7 @@ bool BVHMetal::build_BLAS_mesh(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -663,9 +668,7 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
                      toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -676,10 +679,7 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -910,9 +910,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
                      toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -923,10 +921,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -1036,10 +1031,6 @@ bool BVHMetal::build_TLAS(Progress &progress,
   for (Object *ob : objects) {
     num_instances++;
 
-    /* Skip motion for non-traceable objects */
-    if (!ob->is_traceable())
-      continue;
-
     if (ob->use_motion()) {
       num_motion_transforms += max((size_t)1, ob->get_motion().size());
     }
@@ -1115,8 +1106,8 @@ bool BVHMetal::build_TLAS(Progress &progress,
     /* Skip non-traceable objects */
     Geometry const *geom = ob->get_geometry();
     BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
-    if (!blas || !blas->accel_struct) {
-      /* Place a degenerate instance, to ensure [[instance_id]] equals ob->get_mtl_device_index()
+    if (!blas || !blas->accel_struct || !ob->is_traceable()) {
+      /* Place a degenerate instance, to ensure [[instance_id]] equals ob->get_device_index()
        * in our intersection functions */
       blas = nullptr;
 
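
Writing a degenerate instance for missing or non-traceable BLASes, instead of skipping the slot, keeps the TLAS instance list dense so that [[instance_id]] in the intersection functions stays equal to the object's device index. A sketch of what such a placeholder can look like with Metal's instance descriptors (the field usage here is illustrative, not lifted from the commit):

#include <Metal/Metal.h>

/* Illustrative: slot i stays occupied but can never be hit by any ray. */
static void write_null_instance(MTLAccelerationStructureInstanceDescriptor *instances, uint32_t i)
{
  MTLAccelerationStructureInstanceDescriptor desc = {};
  desc.accelerationStructureIndex = 0; /* points at a shared "null" BLAS */
  desc.mask = 0;                       /* (ray mask & 0) == 0, so it never intersects */
  instances[i] = desc;
}
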
@@ -1299,11 +1290,8 @@ bool BVHMetal::build_TLAS(Progress &progress,
       [instanceBuf release];
       [scratchBuf release];
 
-      uint64_t allocated_size = [accel allocatedSize];
-      device->stats.mem_alloc(allocated_size);
-
       /* Cache top and bottom-level acceleration structs */
-      accel_struct = accel;
+      set_accel_struct(accel);
 
       unique_blas_array.clear();
       unique_blas_array.reserve(all_blas.count);
@@ -1322,16 +1310,18 @@ bool BVHMetal::build(Progress &progress,
                      bool refit)
 {
   if (@available(macos 12.0, *)) {
-    if (refit && params.bvh_type != BVH_TYPE_STATIC) {
-      assert(accel_struct);
-    }
-    else {
-      if (accel_struct) {
-        device->stats.mem_free(accel_struct.allocatedSize);
-        [accel_struct release];
-        accel_struct = nil;
+    if (refit) {
+      /* It isn't valid to refit a non-existent BVH, or one which wasn't constructed as dynamic.
+       * In such cases, assert in development but try to recover in the wild. */
+      if (params.bvh_type != BVH_TYPE_DYNAMIC || !accel_struct) {
+        assert(false);
+        refit = false;
       }
     }
+
+    if (!refit) {
+      set_accel_struct(nil);
+    }
   }
 
   @autoreleasepool {
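
The rewritten prologue folds the old STATIC/DYNAMIC branching into one validation step: an invalid refit request asserts in development builds and degrades to a full rebuild in release builds. The shape of that recovery, reduced to a standalone helper (names hypothetical):

#include <cassert>

static bool validated_refit(bool refit, bool bvh_is_dynamic, bool bvh_exists)
{
  if (refit && (!bvh_is_dynamic || !bvh_exists)) {
    assert(false); /* loud during development... */
    refit = false; /* ...but recover with a full rebuild in the wild */
  }
  return refit;
}
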
@@ -39,10 +39,19 @@ class MetalDevice : public Device {
   KernelParamsMetal launch_params = {0};
 
   /* MetalRT members ----------------------------------*/
-  BVHMetal *bvhMetalRT = nullptr;
+  bool use_metalrt = false;
   bool motion_blur = false;
   id<MTLArgumentEncoder> mtlASArgEncoder =
       nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
+
+  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+  id<MTLBuffer> blas_buffer = nil;
+
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> unique_blas_array;
+
+  API_AVAILABLE(macos(11.0))
+  id<MTLAccelerationStructure> accel_struct = nil;
   /*---------------------------------------------------*/
 
   uint kernel_features;
@@ -79,11 +88,6 @@ class MetalDevice : public Device {
   id<MTLBuffer> texture_bindings_3d = nil;
   std::vector<id<MTLTexture>> texture_slot_map;
 
-  /* BLAS encoding & lookup */
-  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
-  id<MTLBuffer> blas_buffer = nil;
-
-  bool use_metalrt = false;
   MetalPipelineType kernel_specialization_level = PSO_GENERIC;
 
   int device_id = 0;
@@ -138,8 +142,6 @@ class MetalDevice : public Device {
 
   virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
-  virtual void release_bvh(BVH *bvh) override;
-
   virtual void optimize_for_scene(Scene *scene) override;
 
   static void compile_and_load(int device_id, MetalPipelineType pso_type);
@@ -184,6 +186,10 @@ class MetalDevice : public Device {
   void tex_free(device_texture &mem);
 
   void flush_delayed_free_list();
+
+  void free_bvh();
+
+  void update_bvh(BVHMetal *bvh_metal);
 };
 
 CCL_NAMESPACE_END
@@ -267,6 +267,7 @@ MetalDevice::~MetalDevice()
     }
   }
 
+  free_bvh();
   flush_delayed_free_list();
 
   if (texture_bindings_2d) {
@@ -1372,24 +1373,7 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
   if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
 
     if (bvh->params.top_level) {
-      bvhMetalRT = bvh_metal;
-
-      // allocate required buffers for BLAS array
-      uint64_t count = bvhMetalRT->blas_array.size();
-      uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
-      blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
-      stats.mem_alloc(blas_buffer.allocatedSize);
-
-      for (uint64_t i = 0; i < count; ++i) {
-        if (bvhMetalRT->blas_array[i]) {
-          [mtlBlasArgEncoder setArgumentBuffer:blas_buffer
-                                        offset:i * mtlBlasArgEncoder.encodedLength];
-          [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
-        }
-      }
-      if (default_storage_mode == MTLResourceStorageModeManaged) {
-        [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
-      }
+      update_bvh(bvh_metal);
     }
   }
 
@@ -1399,10 +1383,54 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
   }
 }
 
-void MetalDevice::release_bvh(BVH *bvh)
+void MetalDevice::free_bvh()
 {
-  if (bvhMetalRT == bvh) {
-    bvhMetalRT = nullptr;
+  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
+    [blas release];
+  }
+  unique_blas_array.clear();
+
+  if (blas_buffer) {
+    [blas_buffer release];
+    blas_buffer = nil;
+  }
+
+  if (accel_struct) {
+    [accel_struct release];
+    accel_struct = nil;
+  }
+}
+
+void MetalDevice::update_bvh(BVHMetal *bvh_metal)
+{
+  free_bvh();
+
+  if (!bvh_metal) {
+    return;
+  }
+
+  accel_struct = bvh_metal->accel_struct;
+  unique_blas_array = bvh_metal->unique_blas_array;
+
+  [accel_struct retain];
+  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
+    [blas retain];
+  }
+
+  // Allocate required buffers for BLAS array.
+  uint64_t count = bvh_metal->blas_array.size();
+  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
+  blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:default_storage_mode];
+  stats.mem_alloc(blas_buffer.allocatedSize);
+
+  for (uint64_t i = 0; i < count; ++i) {
+    if (bvh_metal->blas_array[i]) {
+      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
+      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
+    }
   }
+  if (default_storage_mode == MTLResourceStorageModeManaged) {
+    [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
+  }
 }
 
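
update_bvh() copies raw Objective-C object pointers from the BVHMetal into std::vector members of the device, so under manual reference counting each copied pointer must be retained explicitly; free_bvh() is the matching release path. The idiom in isolation (helper name hypothetical, assuming a non-ARC build as in Cycles):

#include <Metal/Metal.h>

#include <vector>

static std::vector<id<MTLAccelerationStructure>> copy_retained(
    const std::vector<id<MTLAccelerationStructure>> &src)
{
  std::vector<id<MTLAccelerationStructure>> dst = src; /* copies pointers only */
  for (id<MTLAccelerationStructure> blas : dst) {
    [blas retain]; /* one retain per stored pointer; released when the copy is freed */
  }
  return dst;
}
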
@@ -54,10 +54,12 @@ enum MetalPipelineType {
 
 const char *kernel_type_as_string(MetalPipelineType pso_type);
 
-struct MetalKernelPipeline {
-
+/* A pipeline object that can be shared between multiple instances of MetalDeviceQueue. */
+class MetalKernelPipeline {
+ public:
   void compile();
 
+  int pipeline_id;
   int originating_device_id;
 
   id<MTLLibrary> mtlLibrary = nil;
@@ -83,6 +85,28 @@ struct MetalKernelPipeline {
 
   string error_str;
 
+  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+};
+
+/* An actively instanced pipeline that can only be used by a single instance of MetalDeviceQueue.
+ */
+class MetalDispatchPipeline {
+ public:
+  ~MetalDispatchPipeline();
+
+  bool update(MetalDevice *metal_device, DeviceKernel kernel);
+  void free_intersection_function_tables();
+
+ private:
+  friend class MetalDeviceQueue;
+  friend struct ShaderCache;
+
+  int pipeline_id = -1;
+
+  MetalPipelineType pso_type;
+  id<MTLComputePipelineState> pipeline = nil;
+  int num_threads_per_block = 0;
+
   API_AVAILABLE(macos(11.0))
   id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
 };
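
The split being introduced here is between a MetalKernelPipeline that many queues may share and a MetalDispatchPipeline that is private to one MetalDeviceQueue; the private side decides whether to rebind by comparing pipeline_id values stamped from a global atomic counter. That staleness check, reduced to a sketch (types hypothetical):

#include <atomic>

static std::atomic_int g_next_id{0};

struct SharedPipeline {
  const int id = g_next_id.fetch_add(1); /* stamped once, at creation */
};

struct PerQueuePipeline {
  int bound_id = -1;

  void bind(const SharedPipeline &best)
  {
    if (bound_id == best.id) {
      return; /* already bound to the best pipeline - nothing to do */
    }
    bound_id = best.id;
    /* ...rebuild per-queue state here, e.g. intersection function tables... */
  }
};
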
@@ -133,6 +133,9 @@ using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
 int g_shaderCacheCount = 0;
 DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];
 
+/* Next UID for associating a MetalDispatchPipeline with an originating MetalKernelPipeline. */
+static std::atomic_int g_next_pipeline_id = 0;
+
 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
 {
   for (int i = 0; i < g_shaderCacheCount; i++) {
@@ -325,6 +328,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
 
   /* Keep track of the originating device's ID so that we can cancel requests if the device ceases
    * to be active. */
+  pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
   pipeline->originating_device_id = device->device_id;
   memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
   pipeline->pso_type = pso_type;
@@ -450,6 +454,64 @@ static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nul
   return constant_values;
 }
 
+void MetalDispatchPipeline::free_intersection_function_tables()
+{
+  for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+    if (intersection_func_table[table]) {
+      [intersection_func_table[table] release];
+      intersection_func_table[table] = nil;
+    }
+  }
+}
+
+MetalDispatchPipeline::~MetalDispatchPipeline()
+{
+  free_intersection_function_tables();
+}
+
+bool MetalDispatchPipeline::update(MetalDevice *metal_device, DeviceKernel kernel)
+{
+  const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
+                                                                                   kernel);
+  if (!best_pipeline) {
+    return false;
+  }
+
+  if (pipeline_id == best_pipeline->pipeline_id) {
+    /* The best pipeline is already active - nothing to do. */
+    return true;
+  }
+  pipeline_id = best_pipeline->pipeline_id;
+  pipeline = best_pipeline->pipeline;
+  pso_type = best_pipeline->pso_type;
+  num_threads_per_block = best_pipeline->num_threads_per_block;
+
+  /* Create the MTLIntersectionFunctionTables if needed. */
+  if (best_pipeline->use_metalrt && device_kernel_has_intersection(best_pipeline->device_kernel)) {
+    free_intersection_function_tables();
+
+    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+      @autoreleasepool {
+        MTLIntersectionFunctionTableDescriptor *ift_desc =
+            [[MTLIntersectionFunctionTableDescriptor alloc] init];
+        ift_desc.functionCount = best_pipeline->table_functions[table].count;
+        intersection_func_table[table] = [this->pipeline
+            newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+        /* Finally write the function handles into this pipeline's table */
+        int size = int([best_pipeline->table_functions[table] count]);
+        for (int i = 0; i < size; i++) {
+          id<MTLFunctionHandle> handle = [pipeline
+              functionHandleWithFunction:best_pipeline->table_functions[table][i]];
+          [intersection_func_table[table] setFunction:handle atIndex:i];
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
 id<MTLFunction> MetalKernelPipeline::make_intersection_function(const char *function_name)
 {
   MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
@@ -507,7 +569,6 @@ void MetalKernelPipeline::compile()
 
   function.label = [@(function_name.c_str()) copy];
 
-  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
   NSArray *linked_functions = nil;
 
   if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
@@ -754,24 +815,6 @@ void MetalKernelPipeline::compile()
   [computePipelineStateDescriptor release];
   computePipelineStateDescriptor = nil;
 
-  if (use_metalrt && linked_functions) {
-    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-      MTLIntersectionFunctionTableDescriptor *ift_desc =
-          [[MTLIntersectionFunctionTableDescriptor alloc] init];
-      ift_desc.functionCount = table_functions[table].count;
-      intersection_func_table[table] = [this->pipeline
-          newIntersectionFunctionTableWithDescriptor:ift_desc];
-
-      /* Finally write the function handles into this pipeline's table */
-      int size = (int)[table_functions[table] count];
-      for (int i = 0; i < size; i++) {
-        id<MTLFunctionHandle> handle = [pipeline
-            functionHandleWithFunction:table_functions[table][i]];
-        [intersection_func_table[table] setFunction:handle atIndex:i];
-      }
-    }
-  }
-
   if (!use_binary_archive) {
     metal_printf("%16s | %2d | %-55s | %7.2fs\n",
                  kernel_type_as_string(pso_type),
@@ -66,6 +66,7 @@ class MetalDeviceQueue : public DeviceQueue {
   id<MTLSharedEvent> shared_event_ = nil;
   API_AVAILABLE(macos(10.14), ios(14.0))
   MTLSharedEventListener *shared_event_listener_ = nil;
+  MetalDispatchPipeline active_pipelines_[DEVICE_KERNEL_NUM];
 
   dispatch_queue_t event_queue_;
   dispatch_semaphore_t wait_semaphore_;
@@ -465,13 +465,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   }
   bytes_written = globals_offsets + sizeof(KernelParamsMetal);
 
-  const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(
-      metal_device_, kernel);
-  if (!metal_kernel_pso) {
+  if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
     metal_device_->set_error(
-        string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
+        string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
     return false;
   }
+  MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
 
   /* Encode ancillaries */
   [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
@@ -487,8 +486,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
   if (@available(macos 12.0, *)) {
     if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
-      if (metal_device_->bvhMetalRT) {
-        id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
+      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
         [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
         [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
                                                   offset:0
@@ -496,14 +494,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
       }
 
      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (metal_kernel_pso->intersection_func_table[table]) {
-          [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
-                                                               offset:globals_offsets
-                                                              atIndex:1];
+        if (active_pipeline.intersection_func_table[table]) {
+          [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
+                                                             offset:globals_offsets
+                                                            atIndex:1];
           [metal_device_->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
+              setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
                                    atIndex:4 + table];
-          [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
+          [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                           usage:MTLResourceUsageRead];
         }
         else {
@@ -526,24 +524,22 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
     if (@available(macos 12.0, *)) {
 
-      BVHMetal *bvhMetalRT = metal_device_->bvhMetalRT;
-      if (bvhMetalRT && bvhMetalRT->accel_struct) {
+      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
         /* Mark all Accelerations resources as used */
-        [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct
-                                        usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
         [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                         usage:MTLResourceUsageRead];
-        [mtlComputeCommandEncoder useResources:bvhMetalRT->unique_blas_array.data()
-                                         count:bvhMetalRT->unique_blas_array.size()
+        [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
+                                         count:metal_device_->unique_blas_array.size()
                                          usage:MTLResourceUsageRead];
       }
     }
   }
 
-  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];
+  [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];
 
   /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;
+  const int num_threads_per_block = active_pipeline.num_threads_per_block;
 
   int shared_mem_bytes = 0;
 
@@ -594,7 +590,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
         const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error]
             UTF8String];
         str += string_printf("(%s.%s):\n%s\n",
-                             kernel_type_as_string(metal_kernel_pso->pso_type),
+                             kernel_type_as_string(active_pipeline.pso_type),
                              device_kernel_as_string(kernel),
                              errCStr);
       }
@@ -41,23 +41,14 @@ struct MetalInfo {
 /* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
 class MetalBufferPool {
   struct MetalBufferListEntry {
-    MetalBufferListEntry(id<MTLBuffer> buffer, id<MTLCommandBuffer> command_buffer)
-        : buffer(buffer), command_buffer(command_buffer)
-    {
-    }
-
-    MetalBufferListEntry() = delete;
-
     id<MTLBuffer> buffer;
     id<MTLCommandBuffer> command_buffer;
   };
-  std::vector<MetalBufferListEntry> buffer_free_list;
-  std::vector<MetalBufferListEntry> buffer_in_use_list;
+  std::vector<MetalBufferListEntry> temp_buffers;
   thread_mutex buffer_mutex;
   size_t total_temp_mem_size = 0;
 
  public:
-  MetalBufferPool() = default;
   ~MetalBufferPool();
 
   id<MTLBuffer> get_buffer(id<MTLDevice> device,
@@ -123,53 +123,42 @@ id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
                                           const void *pointer,
                                           Stats &stats)
 {
-  id<MTLBuffer> buffer;
+  id<MTLBuffer> buffer = nil;
 
   MTLStorageMode storageMode = MTLStorageMode((options & MTLResourceStorageModeMask) >>
                                               MTLResourceStorageModeShift);
   MTLCPUCacheMode cpuCacheMode = MTLCPUCacheMode((options & MTLResourceCPUCacheModeMask) >>
                                                  MTLResourceCPUCacheModeShift);
 
-  buffer_mutex.lock();
-  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end(); entry++) {
-    MetalBufferListEntry bufferEntry = *entry;
-
-    /* Check if buffer matches size and storage mode and is old enough to reuse */
-    if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
-        cpuCacheMode == bufferEntry.buffer.cpuCacheMode)
-    {
-      buffer = bufferEntry.buffer;
-      buffer_free_list.erase(entry);
-      bufferEntry.command_buffer = command_buffer;
-      buffer_in_use_list.push_back(bufferEntry);
-      buffer_mutex.unlock();
-
-      /* Copy over data */
-      if (pointer) {
-        memcpy(buffer.contents, pointer, length);
-        if (bufferEntry.buffer.storageMode == MTLStorageModeManaged) {
-          [buffer didModifyRange:NSMakeRange(0, length)];
-        }
-      }
-
-      return buffer;
-    }
-  }
-
-  // NSLog(@"Creating buffer of length %lu (%lu)", length, frameCount);
-  if (pointer) {
-    buffer = [device newBufferWithBytes:pointer length:length options:options];
-  }
-  else {
-    buffer = [device newBufferWithLength:length options:options];
-  }
-
-  MetalBufferListEntry buffer_entry(buffer, command_buffer);
-
-  stats.mem_alloc(buffer.allocatedSize);
-  total_temp_mem_size += buffer.allocatedSize;
-  buffer_in_use_list.push_back(buffer_entry);
-  buffer_mutex.unlock();
+  {
+    thread_scoped_lock lock(buffer_mutex);
+    /* Find an unused buffer with matching size and storage mode. */
+    for (MetalBufferListEntry &bufferEntry : temp_buffers) {
+      if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
+          cpuCacheMode == bufferEntry.buffer.cpuCacheMode && bufferEntry.command_buffer == nil)
+      {
+        buffer = bufferEntry.buffer;
+        bufferEntry.command_buffer = command_buffer;
+        break;
+      }
+    }
+    if (!buffer) {
+      /* Create a new buffer and add it to the pool. Typically this pool will only grow to a
+       * handful of entries. */
+      buffer = [device newBufferWithLength:length options:options];
+      stats.mem_alloc(buffer.allocatedSize);
+      total_temp_mem_size += buffer.allocatedSize;
+      temp_buffers.push_back(MetalBufferListEntry{buffer, command_buffer});
+    }
+  }
+
+  /* Copy over data */
+  if (pointer) {
+    memcpy(buffer.contents, pointer, length);
+    if (buffer.storageMode == MTLStorageModeManaged) {
+      [buffer didModifyRange:NSMakeRange(0, length)];
+    }
+  }
 
   return buffer;
 }
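
With the free/in-use lists collapsed into a single temp_buffers vector, a pooled buffer's availability is encoded entirely in its command_buffer field: non-nil while the associated command buffer is in flight, nil once the completion callback has run. A usage sketch of that round trip (the get_buffer argument order is assumed from the surrounding hunks; this is not verbatim Cycles code):

#include <Metal/Metal.h>

static void run_with_temp_buffer(MetalBufferPool &pool,
                                 id<MTLDevice> device,
                                 id<MTLCommandQueue> queue,
                                 const void *host_data,
                                 size_t length,
                                 Stats &stats)
{
  id<MTLCommandBuffer> cmd = [queue commandBuffer];
  id<MTLBuffer> tmp = pool.get_buffer(
      device, cmd, length, MTLResourceStorageModeShared, host_data, stats);
  (void)tmp; /* ...encode GPU work that reads tmp here... */
  [cmd addCompletedHandler:^(id<MTLCommandBuffer> cb) {
    pool.process_command_buffer_completion(cb); /* marks the buffer reusable */
  }];
  [cmd commit];
}
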
@@ -178,16 +167,10 @@ void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> com
 {
   assert(command_buffer);
   thread_scoped_lock lock(buffer_mutex);
-  /* Release all buffers that have not been recently reused back into the free pool */
-  for (auto entry = buffer_in_use_list.begin(); entry != buffer_in_use_list.end();) {
-    MetalBufferListEntry buffer_entry = *entry;
+  /* Mark any temp buffers associated with command_buffer as unused. */
+  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
     if (buffer_entry.command_buffer == command_buffer) {
-      entry = buffer_in_use_list.erase(entry);
-      buffer_free_list.push_back(buffer_entry);
-    }
-    else {
-      entry++;
+      buffer_entry.command_buffer = nil;
     }
   }
-
 }
@@ -196,16 +179,12 @@ MetalBufferPool::~MetalBufferPool()
 {
   thread_scoped_lock lock(buffer_mutex);
   /* Release all buffers that have not been recently reused */
-  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end();) {
-    MetalBufferListEntry buffer_entry = *entry;
-
-    id<MTLBuffer> buffer = buffer_entry.buffer;
-    // NSLog(@"Releasing buffer of length %lu (%lu) (%lu outstanding)", buffer.length, frameCount,
-    // bufferFreeList.size());
-    total_temp_mem_size -= buffer.allocatedSize;
-    [buffer release];
-    entry = buffer_free_list.erase(entry);
+  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
+    total_temp_mem_size -= buffer_entry.buffer.allocatedSize;
+    [buffer_entry.buffer release];
+    buffer_entry.buffer = nil;
   }
+  temp_buffers.clear();
 }
 
 CCL_NAMESPACE_END
@@ -989,11 +989,12 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
 ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                                  IntegratorState state,
                                                  ccl_private Ray *ccl_restrict ray,
-                                                 const int object,
                                                  ccl_global float *ccl_restrict render_buffer)
 {
   ShaderData sd;
-  shader_setup_from_volume(kg, &sd, ray, object);
+  /* FIXME: `object` is used for light linking. We read the bottom of the stack for simplicity, but
+   * this does not work for overlapping volumes. */
+  shader_setup_from_volume(kg, &sd, ray, INTEGRATOR_STATE_ARRAY(state, volume_stack, 0, object));
 
   /* Load random number state. */
   RNGState rng_state;
@@ -1186,8 +1187,7 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
     volume_stack_clean(kg, state);
   }
 
-  const VolumeIntegrateEvent event = volume_integrate(
-      kg, state, &ray, isect.object, render_buffer);
+  const VolumeIntegrateEvent event = volume_integrate(kg, state, &ray, render_buffer);
   if (event == VOLUME_PATH_MISSED) {
     /* End path. */
     integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);