diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h
index 44fe4c4cde2..854ae588aff 100644
--- a/intern/cycles/device/metal/bvh.h
+++ b/intern/cycles/device/metal/bvh.h
@@ -57,6 +57,9 @@ class BVHMetal : public BVH {
                   Geometry *const geom,
                   bool refit);
   bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+
+  API_AVAILABLE(macos(11.0))
+  void set_accel_struct(id<MTLAccelerationStructure> new_accel_struct);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
index ca7b11f8819..c4ca2707646 100644
--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -119,17 +119,27 @@ BVHMetal::BVHMetal(const BVHParams &params_,
 
 BVHMetal::~BVHMetal()
 {
-  /* Clear point used by enqueueing. */
-  device->release_bvh(this);
+  if (@available(macos 12.0, *)) {
+    set_accel_struct(nil);
+    if (null_BLAS) {
+      [null_BLAS release];
+    }
+  }
+}
 
+API_AVAILABLE(macos(11.0))
+void BVHMetal::set_accel_struct(id<MTLAccelerationStructure> new_accel_struct)
+{
   if (@available(macos 12.0, *)) {
     if (accel_struct) {
       device->stats.mem_free(accel_struct.allocatedSize);
       [accel_struct release];
+      accel_struct = nil;
     }
-    if (null_BLAS) {
-      [null_BLAS release];
+    if (new_accel_struct) {
+      accel_struct = new_accel_struct;
+      device->stats.mem_alloc(accel_struct.allocatedSize);
     }
   }
 }
@@ -325,9 +335,7 @@ bool BVHMetal::build_BLAS_mesh(Progress &progress,
                 toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -338,10 +346,7 @@ bool BVHMetal::build_BLAS_mesh(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -663,9 +668,7 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
                 toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -676,10 +679,7 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -910,9 +910,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
                 toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -923,10 +921,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -1036,10 +1031,6 @@ bool BVHMetal::build_TLAS(Progress &progress,
   for (Object *ob : objects) {
     num_instances++;
 
-    /* Skip motion for non-traceable objects */
-    if (!ob->is_traceable())
-      continue;
-
     if (ob->use_motion()) {
       num_motion_transforms += max((size_t)1, ob->get_motion().size());
     }
@@ -1115,8 +1106,8 @@ bool BVHMetal::build_TLAS(Progress &progress,
     /* Skip non-traceable objects */
     Geometry const *geom = ob->get_geometry();
     BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
-    if (!blas || !blas->accel_struct) {
-      /* Place a degenerate instance, to ensure [[instance_id]] equals ob->get_mtl_device_index()
+    if (!blas || !blas->accel_struct || !ob->is_traceable()) {
+      /* Place a degenerate instance, to ensure [[instance_id]] equals ob->get_device_index()
        * in our intersection functions */
       blas = nullptr;
@@ -1299,11 +1290,8 @@ bool BVHMetal::build_TLAS(Progress &progress,
       [instanceBuf release];
       [scratchBuf release];
 
-      uint64_t allocated_size = [accel allocatedSize];
-      device->stats.mem_alloc(allocated_size);
-
       /* Cache top and bottom-level acceleration structs */
-      accel_struct = accel;
+      set_accel_struct(accel);
 
       unique_blas_array.clear();
       unique_blas_array.reserve(all_blas.count);
@@ -1322,16 +1310,18 @@ bool BVHMetal::build(Progress &progress,
                      bool refit)
 {
   if (@available(macos 12.0, *)) {
-    if (refit && params.bvh_type != BVH_TYPE_STATIC) {
-      assert(accel_struct);
-    }
-    else {
-      if (accel_struct) {
-        device->stats.mem_free(accel_struct.allocatedSize);
-        [accel_struct release];
-        accel_struct = nil;
+    if (refit) {
+      /* It isn't valid to refit a non-existent BVH, or one which wasn't constructed as dynamic.
+       * In such cases, assert in development but try to recover in the wild. */
+      if (params.bvh_type != BVH_TYPE_DYNAMIC || !accel_struct) {
+        assert(false);
+        refit = false;
       }
     }
+
+    if (!refit) {
+      set_accel_struct(nil);
+    }
   }
 
   @autoreleasepool {
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 52a5c7e5896..8a5f7ff5ef4 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -39,10 +39,19 @@ class MetalDevice : public Device {
   KernelParamsMetal launch_params = {0};
 
   /* MetalRT members ----------------------------------*/
-  BVHMetal *bvhMetalRT = nullptr;
+  bool use_metalrt = false;
   bool motion_blur = false;
   id<MTLArgumentEncoder> mtlASArgEncoder =
       nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
+
+  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+  id<MTLBuffer> blas_buffer = nil;
+
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> unique_blas_array;
+
+  API_AVAILABLE(macos(11.0))
+  id<MTLAccelerationStructure> accel_struct = nil;
   /*---------------------------------------------------*/
 
   uint kernel_features;
@@ -79,11 +88,6 @@ class MetalDevice : public Device {
   id<MTLBuffer> texture_bindings_3d = nil;
   std::vector<id<MTLTexture>> texture_slot_map;
 
-  /* BLAS encoding & lookup */
-  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
-  id<MTLBuffer> blas_buffer = nil;
-
-  bool use_metalrt = false;
   MetalPipelineType kernel_specialization_level = PSO_GENERIC;
   int device_id = 0;
@@ -138,8 +142,6 @@ class MetalDevice : public Device {
 
   virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
-  virtual void release_bvh(BVH *bvh) override;
-
   virtual void optimize_for_scene(Scene *scene) override;
 
   static void compile_and_load(int device_id, MetalPipelineType pso_type);
@@ -184,6 +186,10 @@ class MetalDevice : public Device {
   void tex_free(device_texture &mem);
 
   void flush_delayed_free_list();
+
+  void free_bvh();
+
+  void update_bvh(BVHMetal *bvh_metal);
 };
 
 CCL_NAMESPACE_END
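Note (illustration, not part of the patch): the bvh.mm change above routes every assignment of `accel_struct` through `set_accel_struct()`, so the `mem_free`/`mem_alloc` stats calls and the release of the old structure happen in exactly one place. A standalone C++ sketch of that pattern, with hypothetical `Stats`/`Resource` types standing in for `device->stats` and the MTLAccelerationStructure handle:

    #include <cstddef>
    #include <cstdio>

    struct Stats {
      size_t used = 0;
      void mem_alloc(size_t n) { used += n; }
      void mem_free(size_t n) { used -= n; }
    };

    struct Resource {
      size_t allocated_size = 0;
    };

    struct Tracker {
      Stats stats;
      Resource *current = nullptr;

      /* Passing nullptr just frees the old resource; passing a new resource
       * adopts it. Every alloc is matched by exactly one free, regardless of
       * which path (build, rebuild, destructor) triggers the swap. */
      void set(Resource *next)
      {
        if (current) {
          stats.mem_free(current->allocated_size);
          delete current;
          current = nullptr;
        }
        if (next) {
          current = next;
          stats.mem_alloc(current->allocated_size);
        }
      }

      ~Tracker()
      {
        set(nullptr);
      }
    };

    int main()
    {
      Tracker t;
      t.set(new Resource{1024});
      t.set(new Resource{2048});                    /* Frees 1024, adopts 2048. */
      printf("tracked bytes: %zu\n", t.stats.used); /* 2048 */
      t.set(nullptr);
      printf("tracked bytes: %zu\n", t.stats.used); /* 0 */
      return 0;
    }

This is also why the destructor can simply call `set_accel_struct(nil)` instead of duplicating the free-and-account logic.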
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 7b44c69304e..201baf52600 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -267,6 +267,7 @@ MetalDevice::~MetalDevice()
     }
   }
 
+  free_bvh();
   flush_delayed_free_list();
 
   if (texture_bindings_2d) {
@@ -1372,24 +1373,7 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
   if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
     if (bvh->params.top_level) {
-      bvhMetalRT = bvh_metal;
-
-      // allocate required buffers for BLAS array
-      uint64_t count = bvhMetalRT->blas_array.size();
-      uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
-      blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
-      stats.mem_alloc(blas_buffer.allocatedSize);
-
-      for (uint64_t i = 0; i < count; ++i) {
-        if (bvhMetalRT->blas_array[i]) {
-          [mtlBlasArgEncoder setArgumentBuffer:blas_buffer
-                                        offset:i * mtlBlasArgEncoder.encodedLength];
-          [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
-        }
-      }
-      if (default_storage_mode == MTLResourceStorageModeManaged) {
-        [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
-      }
+      update_bvh(bvh_metal);
     }
   }
@@ -1399,10 +1383,54 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
   }
 }
 
-void MetalDevice::release_bvh(BVH *bvh)
+void MetalDevice::free_bvh()
 {
-  if (bvhMetalRT == bvh) {
-    bvhMetalRT = nullptr;
+  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
+    [blas release];
+  }
+  unique_blas_array.clear();
+
+  if (blas_buffer) {
+    [blas_buffer release];
+    blas_buffer = nil;
+  }
+
+  if (accel_struct) {
+    [accel_struct release];
+    accel_struct = nil;
+  }
+}
+
+void MetalDevice::update_bvh(BVHMetal *bvh_metal)
+{
+  free_bvh();
+
+  if (!bvh_metal) {
+    return;
+  }
+
+  accel_struct = bvh_metal->accel_struct;
+  unique_blas_array = bvh_metal->unique_blas_array;
+
+  [accel_struct retain];
+  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
+    [blas retain];
+  }
+
+  // Allocate required buffers for BLAS array.
+  uint64_t count = bvh_metal->blas_array.size();
+  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
+  blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:default_storage_mode];
+  stats.mem_alloc(blas_buffer.allocatedSize);
+
+  for (uint64_t i = 0; i < count; ++i) {
+    if (bvh_metal->blas_array[i]) {
+      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
+      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
+    }
+  }
+  if (default_storage_mode == MTLResourceStorageModeManaged) {
+    [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
   }
 }
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index cc343c1b4e4..3fabc1fabd5 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -54,10 +54,12 @@ enum MetalPipelineType {
 
 const char *kernel_type_as_string(MetalPipelineType pso_type);
 
-struct MetalKernelPipeline {
-
+/* A pipeline object that can be shared between multiple instances of MetalDeviceQueue. */
+class MetalKernelPipeline {
+ public:
   void compile();
 
+  int pipeline_id;
   int originating_device_id;
 
   id<MTLLibrary> mtlLibrary = nil;
@@ -83,6 +85,28 @@ const char *kernel_type_as_string(MetalPipelineType pso_type);
 
   string error_str;
 
+  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+};
+
+/* An actively instanced pipeline that can only be used by a single instance of MetalDeviceQueue.
+ */
+class MetalDispatchPipeline {
+ public:
+  ~MetalDispatchPipeline();
+
+  bool update(MetalDevice *metal_device, DeviceKernel kernel);
+  void free_intersection_function_tables();
+
+ private:
+  friend class MetalDeviceQueue;
+  friend struct ShaderCache;
+
+  int pipeline_id = -1;
+
+  MetalPipelineType pso_type;
+  id<MTLComputePipelineState> pipeline = nil;
+  int num_threads_per_block = 0;
+
   API_AVAILABLE(macos(11.0))
   id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
 };
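Note (illustration, not part of the patch): kernel.h above splits the shared, immutable `MetalKernelPipeline` from the per-queue `MetalDispatchPipeline`, and kernel.mm below keys the per-queue state to a `pipeline_id`. A minimal C++ sketch of that id-stamped caching, with assumed names (`SharedPipeline` and `DispatchState` are not Cycles types):

    #include <atomic>
    #include <cstdio>

    static std::atomic<int> g_next_id{0};

    /* Shared, immutable once compiled; safe to hand to many queues. */
    struct SharedPipeline {
      int id = g_next_id.fetch_add(1);
    };

    /* Queue-local state derived from whichever shared pipeline is best. */
    struct DispatchState {
      int seen_id = -1;

      bool update(const SharedPipeline *best)
      {
        if (!best) {
          return false; /* Nothing compiled yet. */
        }
        if (seen_id == best->id) {
          return true; /* Already current: skip the expensive rebuild. */
        }
        seen_id = best->id;
        printf("rebuilding queue-local tables for pipeline %d\n", best->id);
        return true;
      }
    };

    int main()
    {
      SharedPipeline generic, specialized;
      DispatchState queue_state;
      queue_state.update(&generic);     /* Rebuilds. */
      queue_state.update(&generic);     /* No-op. */
      queue_state.update(&specialized); /* Rebuilds again. */
      return 0;
    }

Because each MetalDeviceQueue now owns its own intersection function tables, a better (e.g. specialized) pipeline can be swapped in on one queue without racing another queue that is still dispatching with the old one.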
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 615993f0362..1c07c4c1bc6 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -133,6 +133,9 @@ using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
 int g_shaderCacheCount = 0;
 DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];
 
+/* Next UID for associating a MetalDispatchPipeline with an originating MetalKernelPipeline. */
+static std::atomic_int g_next_pipeline_id = 0;
+
 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
 {
   for (int i = 0; i < g_shaderCacheCount; i++) {
@@ -325,6 +328,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
 
   /* Keep track of the originating device's ID so that we can cancel requests if the device ceases
    * to be active. */
+  pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
  pipeline->originating_device_id = device->device_id;
   memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
   pipeline->pso_type = pso_type;
@@ -450,6 +454,64 @@ static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nul
   return constant_values;
 }
 
+void MetalDispatchPipeline::free_intersection_function_tables()
+{
+  for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+    if (intersection_func_table[table]) {
+      [intersection_func_table[table] release];
+      intersection_func_table[table] = nil;
+    }
+  }
+}
+
+MetalDispatchPipeline::~MetalDispatchPipeline()
+{
+  free_intersection_function_tables();
+}
+
+bool MetalDispatchPipeline::update(MetalDevice *metal_device, DeviceKernel kernel)
+{
+  const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
+                                                                                   kernel);
+  if (!best_pipeline) {
+    return false;
+  }
+
+  if (pipeline_id == best_pipeline->pipeline_id) {
+    /* The best pipeline is already active - nothing to do. */
+    return true;
+  }
+  pipeline_id = best_pipeline->pipeline_id;
+  pipeline = best_pipeline->pipeline;
+  pso_type = best_pipeline->pso_type;
+  num_threads_per_block = best_pipeline->num_threads_per_block;
+
+  /* Create the MTLIntersectionFunctionTables if needed. */
+  if (best_pipeline->use_metalrt && device_kernel_has_intersection(best_pipeline->device_kernel)) {
+    free_intersection_function_tables();
+
+    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+      @autoreleasepool {
+        MTLIntersectionFunctionTableDescriptor *ift_desc =
+            [[MTLIntersectionFunctionTableDescriptor alloc] init];
+        ift_desc.functionCount = best_pipeline->table_functions[table].count;
+        intersection_func_table[table] = [this->pipeline
+            newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+        /* Finally write the function handles into this pipeline's table */
+        int size = int([best_pipeline->table_functions[table] count]);
+        for (int i = 0; i < size; i++) {
+          id<MTLFunctionHandle> handle = [pipeline
+              functionHandleWithFunction:best_pipeline->table_functions[table][i]];
+          [intersection_func_table[table] setFunction:handle atIndex:i];
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
 id<MTLFunction> MetalKernelPipeline::make_intersection_function(const char *function_name)
 {
   MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
@@ -507,7 +569,6 @@ void MetalKernelPipeline::compile()
 
     function.label = [@(function_name.c_str()) copy];
 
-    NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
     NSArray *linked_functions = nil;
 
     if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
@@ -754,24 +815,6 @@ void MetalKernelPipeline::compile()
   [computePipelineStateDescriptor release];
   computePipelineStateDescriptor = nil;
 
-  if (use_metalrt && linked_functions) {
-    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-      MTLIntersectionFunctionTableDescriptor *ift_desc =
-          [[MTLIntersectionFunctionTableDescriptor alloc] init];
-      ift_desc.functionCount = table_functions[table].count;
-      intersection_func_table[table] = [this->pipeline
-          newIntersectionFunctionTableWithDescriptor:ift_desc];
-
-      /* Finally write the function handles into this pipeline's table */
-      int size = (int)[table_functions[table] count];
-      for (int i = 0; i < size; i++) {
-        id<MTLFunctionHandle> handle = [pipeline
-            functionHandleWithFunction:table_functions[table][i]];
-        [intersection_func_table[table] setFunction:handle atIndex:i];
-      }
-    }
-  }
-
   if (!use_binary_archive) {
     metal_printf("%16s | %2d | %-55s | %7.2fs\n",
                  kernel_type_as_string(pso_type),
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index 5afd2f748b2..3c8c138075c 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -66,6 +66,7 @@ class MetalDeviceQueue : public DeviceQueue {
   id<MTLSharedEvent> shared_event_ = nil;
   API_AVAILABLE(macos(10.14), ios(14.0))
   MTLSharedEventListener *shared_event_listener_ = nil;
+  MetalDispatchPipeline active_pipelines_[DEVICE_KERNEL_NUM];
 
   dispatch_queue_t event_queue_;
   dispatch_semaphore_t wait_semaphore_;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index d952bf21075..bb529c23a49 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -465,13 +465,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   }
   bytes_written = globals_offsets + sizeof(KernelParamsMetal);
 
-  const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(
-      metal_device_, kernel);
-  if (!metal_kernel_pso) {
+  if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
     metal_device_->set_error(
-        string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
+        string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
     return false;
   }
+  MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
 
   /* Encode ancillaries */
   [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
@@ -487,8 +486,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
   if (@available(macos 12.0, *)) {
     if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
-      if (metal_device_->bvhMetalRT) {
-        id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
+      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
         [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
         [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
                                                   offset:0
@@ -496,14 +494,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
       }
 
       for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (metal_kernel_pso->intersection_func_table[table]) {
-          [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
-                                                               offset:globals_offsets
-                                                              atIndex:1];
+        if (active_pipeline.intersection_func_table[table]) {
+          [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
+                                                             offset:globals_offsets
+                                                            atIndex:1];
           [metal_device_->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
+              setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
                                    atIndex:4 + table];
-          [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
+          [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                           usage:MTLResourceUsageRead];
         }
         else {
@@ -526,24 +524,22 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
   if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
     if (@available(macos 12.0, *)) {
-      BVHMetal *bvhMetalRT = metal_device_->bvhMetalRT;
-      if (bvhMetalRT && bvhMetalRT->accel_struct) {
+      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
         /* Mark all Accelerations resources as used */
-        [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct
-                                        usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
         [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                         usage:MTLResourceUsageRead];
-        [mtlComputeCommandEncoder useResources:bvhMetalRT->unique_blas_array.data()
-                                         count:bvhMetalRT->unique_blas_array.size()
+        [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
+                                         count:metal_device_->unique_blas_array.size()
                                          usage:MTLResourceUsageRead];
       }
     }
   }
 
-  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];
+  [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];
 
   /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;
+  const int num_threads_per_block = active_pipeline.num_threads_per_block;
 
   int shared_mem_bytes = 0;
@@ -594,7 +590,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
       const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error] UTF8String];
       str += string_printf("(%s.%s):\n%s\n",
-                           kernel_type_as_string(metal_kernel_pso->pso_type),
+                           kernel_type_as_string(active_pipeline.pso_type),
                            device_kernel_as_string(kernel),
                            errCStr);
     }
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index 7cfc9d1f9b6..fbae7189be7 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -41,23 +41,14 @@ struct MetalInfo {
 /* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
 class MetalBufferPool {
   struct MetalBufferListEntry {
-    MetalBufferListEntry(id<MTLBuffer> buffer, id<MTLCommandBuffer> command_buffer)
-        : buffer(buffer), command_buffer(command_buffer)
-    {
-    }
-
-    MetalBufferListEntry() = delete;
-
     id<MTLBuffer> buffer;
     id<MTLCommandBuffer> command_buffer;
   };
-  std::vector<MetalBufferListEntry> buffer_free_list;
-  std::vector<MetalBufferListEntry> buffer_in_use_list;
+  std::vector<MetalBufferListEntry> temp_buffers;
   thread_mutex buffer_mutex;
   size_t total_temp_mem_size = 0;
 
 public:
-  MetalBufferPool() = default;
   ~MetalBufferPool();
 
   id<MTLBuffer> get_buffer(id<MTLDevice> device,
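Note (illustration, not part of the patch): util.h above collapses the two `buffer_free_list`/`buffer_in_use_list` vectors into one `temp_buffers` list, where a non-nil `command_buffer` marks an entry as in flight and completion merely clears the mark. A self-contained C++ sketch of that single-list pool, using hypothetical `Buffer`/`Pool` types rather than the Metal API:

    #include <cstddef>
    #include <vector>

    struct Buffer {
      size_t length = 0;
    };

    struct PoolEntry {
      Buffer *buffer = nullptr;
      const void *owner = nullptr; /* Command buffer currently using the entry. */
    };

    struct Pool {
      std::vector<PoolEntry> entries;

      Buffer *acquire(size_t length, const void *owner)
      {
        for (PoolEntry &e : entries) {
          if (e.owner == nullptr && e.buffer->length == length) {
            e.owner = owner; /* Reuse an idle buffer of matching size. */
            return e.buffer;
          }
        }
        /* No idle match: grow the pool. */
        entries.push_back({new Buffer{length}, owner});
        return entries.back().buffer;
      }

      void on_complete(const void *owner)
      {
        for (PoolEntry &e : entries) {
          if (e.owner == owner) {
            e.owner = nullptr; /* Reusable again; no erase/push churn. */
          }
        }
      }

      ~Pool()
      {
        for (PoolEntry &e : entries) {
          delete e.buffer;
        }
      }
    };

    int main()
    {
      Pool pool;
      int cmd = 0; /* Stand-in for a command buffer identity. */
      Buffer *a = pool.acquire(256, &cmd);
      pool.on_complete(&cmd);              /* a becomes reusable... */
      Buffer *b = pool.acquire(256, &cmd); /* ...and is handed out again. */
      return a == b ? 0 : 1;
    }

Entries are never moved between lists, so iteration stays simple, at the cost of a linear scan over what the patch comment below expects to remain a handful of entries.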
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index 2b497ac6822..2cf73a2ae61 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -123,53 +123,42 @@ id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
                                           const void *pointer,
                                           Stats &stats)
 {
-  id<MTLBuffer> buffer;
+  id<MTLBuffer> buffer = nil;
 
   MTLStorageMode storageMode = MTLStorageMode((options & MTLResourceStorageModeMask) >>
                                               MTLResourceStorageModeShift);
   MTLCPUCacheMode cpuCacheMode = MTLCPUCacheMode((options & MTLResourceCPUCacheModeMask) >>
                                                  MTLResourceCPUCacheModeShift);
 
-  buffer_mutex.lock();
-  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end(); entry++) {
-    MetalBufferListEntry bufferEntry = *entry;
-
-    /* Check if buffer matches size and storage mode and is old enough to reuse */
-    if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
-        cpuCacheMode == bufferEntry.buffer.cpuCacheMode)
-    {
-      buffer = bufferEntry.buffer;
-      buffer_free_list.erase(entry);
-      bufferEntry.command_buffer = command_buffer;
-      buffer_in_use_list.push_back(bufferEntry);
-      buffer_mutex.unlock();
-
-      /* Copy over data */
-      if (pointer) {
-        memcpy(buffer.contents, pointer, length);
-        if (bufferEntry.buffer.storageMode == MTLStorageModeManaged) {
-          [buffer didModifyRange:NSMakeRange(0, length)];
-        }
+  {
+    thread_scoped_lock lock(buffer_mutex);
+    /* Find an unused buffer with matching size and storage mode. */
+    for (MetalBufferListEntry &bufferEntry : temp_buffers) {
+      if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
+          cpuCacheMode == bufferEntry.buffer.cpuCacheMode && bufferEntry.command_buffer == nil)
+      {
+        buffer = bufferEntry.buffer;
+        bufferEntry.command_buffer = command_buffer;
+        break;
       }
-
-      return buffer;
+    }
+    if (!buffer) {
+      /* Create a new buffer and add it to the pool. Typically this pool will only grow to a
+       * handful of entries. */
+      buffer = [device newBufferWithLength:length options:options];
+      stats.mem_alloc(buffer.allocatedSize);
+      total_temp_mem_size += buffer.allocatedSize;
+      temp_buffers.push_back(MetalBufferListEntry{buffer, command_buffer});
    }
   }
-  // NSLog(@"Creating buffer of length %lu (%lu)", length, frameCount);
+
+  /* Copy over data */
   if (pointer) {
-    buffer = [device newBufferWithBytes:pointer length:length options:options];
+    memcpy(buffer.contents, pointer, length);
+    if (buffer.storageMode == MTLStorageModeManaged) {
+      [buffer didModifyRange:NSMakeRange(0, length)];
+    }
   }
-  else {
-    buffer = [device newBufferWithLength:length options:options];
-  }
-
-  MetalBufferListEntry buffer_entry(buffer, command_buffer);
-
-  stats.mem_alloc(buffer.allocatedSize);
-
-  total_temp_mem_size += buffer.allocatedSize;
-  buffer_in_use_list.push_back(buffer_entry);
-  buffer_mutex.unlock();
 
   return buffer;
 }
@@ -178,16 +167,10 @@ void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> com
 {
   assert(command_buffer);
   thread_scoped_lock lock(buffer_mutex);
-  /* Release all buffers that have not been recently reused back into the free pool */
-  for (auto entry = buffer_in_use_list.begin(); entry != buffer_in_use_list.end();) {
-    MetalBufferListEntry buffer_entry = *entry;
+  /* Mark any temp buffers associated with command_buffer as unused. */
+  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
     if (buffer_entry.command_buffer == command_buffer) {
-      entry = buffer_in_use_list.erase(entry);
       buffer_entry.command_buffer = nil;
-      buffer_free_list.push_back(buffer_entry);
-    }
-    else {
-      entry++;
     }
   }
 }
@@ -196,16 +179,12 @@ MetalBufferPool::~MetalBufferPool()
 {
   thread_scoped_lock lock(buffer_mutex);
   /* Release all buffers that have not been recently reused */
-  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end();) {
-    MetalBufferListEntry buffer_entry = *entry;
-
-    id<MTLBuffer> buffer = buffer_entry.buffer;
-    // NSLog(@"Releasing buffer of length %lu (%lu) (%lu outstanding)", buffer.length, frameCount,
-    // bufferFreeList.size());
-    total_temp_mem_size -= buffer.allocatedSize;
-    [buffer release];
-    entry = buffer_free_list.erase(entry);
+  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
+    total_temp_mem_size -= buffer_entry.buffer.allocatedSize;
+    [buffer_entry.buffer release];
+    buffer_entry.buffer = nil;
   }
+  temp_buffers.clear();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index 14d7d317914..2ba1cebf535 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -989,11 +989,12 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
 ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                                  IntegratorState state,
                                                  ccl_private Ray *ccl_restrict ray,
-                                                 const int object,
                                                  ccl_global float *ccl_restrict render_buffer)
 {
   ShaderData sd;
-  shader_setup_from_volume(kg, &sd, ray, object);
+  /* FIXME: `object` is used for light linking. We read the bottom of the stack for simplicity, but
+   * this does not work for overlapping volumes. */
+  shader_setup_from_volume(kg, &sd, ray, INTEGRATOR_STATE_ARRAY(state, volume_stack, 0, object));
 
   /* Load random number state. */
   RNGState rng_state;
@@ -1186,8 +1187,7 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
     volume_stack_clean(kg, state);
   }
 
-  const VolumeIntegrateEvent event = volume_integrate(
-      kg, state, &ray, isect.object, render_buffer);
+  const VolumeIntegrateEvent event = volume_integrate(kg, state, &ray, render_buffer);
 
   if (event == VOLUME_PATH_MISSED) {
     /* End path. */
     integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
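Note (illustration, not part of the patch): with the `object` parameter removed, `volume_integrate` now derives the light-linking object from entry 0 of the per-path volume stack, which the FIXME acknowledges is only an approximation when volumes overlap. A rough C++ model of what that expression reads (the real kernel uses the `INTEGRATOR_STATE_ARRAY` macro over SoA integrator state, not a plain array):

    #include <cstdio>

    struct VolumeStackEntry {
      int object;
      int shader;
    };

    int main()
    {
      /* Entry 0 is the first volume the path entered (the enclosing one). */
      VolumeStackEntry volume_stack[4] = {{7, 1}, {9, 2}};
      int object = volume_stack[0].object;          /* What the new code reads. */
      printf("light-linking object: %d\n", object); /* 7, even while inside object 9. */
      return 0;
    }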