diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index c4251d42b47..193f3db9e87 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -27,7 +27,7 @@ class MetalDeviceQueue : public DeviceQueue {
 
   int num_concurrent_states(const size_t /*state_size*/) const override;
   int num_concurrent_busy_states(const size_t /*state_size*/) const override;
-  int num_sort_partition_elements() const override;
+  int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;
   bool supports_local_atomic_sort() const override;
 
   void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 8a7bbb54336..0df6dc30aea 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -297,9 +297,19 @@ int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
   return num_concurrent_states(state_size) / 4;
 }
 
-int MetalDeviceQueue::num_sort_partition_elements() const
+int MetalDeviceQueue::num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
 {
-  return MetalInfo::optimal_sort_partition_elements();
+  int sort_partition_elements = MetalInfo::optimal_sort_partition_elements();
+  /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
+   * a more sophisticated heuristic we simply disable sort partitioning if the shader count is
+   * high.
+   */
+  if (max_scene_shaders < 300 && sort_partition_elements > 0) {
+    return max(max_num_paths / sort_partition_elements, 1);
+  }
+  else {
+    return 1;
+  }
 }
 
 bool MetalDeviceQueue::supports_local_atomic_sort() const
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9b19fd20467..130ba969cff 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -42,9 +42,14 @@ int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) c
   return 4 * max(8 * max_num_threads, 65536);
 }
 
-int OneapiDeviceQueue::num_sort_partition_elements() const
+int OneapiDeviceQueue::num_sort_partitions(int max_num_paths, uint /*max_scene_shaders*/) const
 {
-  return (oneapi_device_->get_max_num_threads_per_multiprocessor() >= 128) ? 65536 : 8192;
+  int sort_partition_elements = (oneapi_device_->get_max_num_threads_per_multiprocessor() >= 128) ?
+                                    65536 :
+                                    8192;
+  /* Sort partitioning with local sorting on Intel GPUs is currently the most effective solution no
+   * matter the number of shaders. */
+  return max(max_num_paths / sort_partition_elements, 1);
 }
 
 void OneapiDeviceQueue::init_execution()
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index d35812ee46e..949a3c8a86d 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -27,7 +27,7 @@ class OneapiDeviceQueue : public DeviceQueue {
 
   int num_concurrent_busy_states(const size_t state_size) const override;
 
-  int num_sort_partition_elements() const override;
+  int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;
 
   void init_execution() override;
 
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 51045472f6e..860676eadeb 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -99,11 +99,20 @@ class DeviceQueue {
    * value. */
   virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
 
-  /* Number of elements in a partition of sorted shaders, that improves memory locality of
+  /* Number of partitions of sorted shaders, that improves memory locality of
    * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
-  virtual int num_sort_partition_elements() const
+  virtual int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
   {
-    return 65536;
+    /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
+     * a more sophisticated heuristic we simply disable sort partitioning if the shader count is
+     * high.
+     */
+    if (max_scene_shaders < 300) {
+      return max(max_num_paths / 65536, 1);
+    }
+    else {
+      return 1;
+    }
   }
 
   /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index f03e73b03e6..2f8cf312a45 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -241,17 +241,8 @@ void PathTraceWorkGPU::alloc_integrator_queue()
 
 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
-  /* Compute sort partitions, to balance between memory locality and coherence.
-   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
-   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
-   */
-  num_sort_partitions_ = 1;
-  if (device_scene_->data.max_shaders < 300) {
-    const int num_elements = queue_->num_sort_partition_elements();
-    if (num_elements) {
-      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
-    }
-  }
+  num_sort_partitions_ = queue_->num_sort_partitions(max_num_paths_,
+                                                     device_scene_->data.max_shaders);
 
   integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
                                                                 num_sort_partitions_);
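
For reference, the heuristic this patch moves into DeviceQueue can be exercised in isolation. Below is a minimal standalone C++ sketch of the new default num_sort_partitions() behavior and of the sort_partition_divisor computed by the caller; it is not the Cycles implementation itself. The wavefront size of 1048576 paths and the shader counts are hypothetical values for illustration, and divide_up() here stands in for Cycles' ceil-division utility of the same name.

  /* Sketch only: mirrors the default heuristic from device/queue.h above. */
  #include <algorithm>
  #include <cstdio>

  /* Ceil-division, analogous to Cycles' divide_up(). */
  static int divide_up(int x, int y)
  {
    return (x + y - 1) / y;
  }

  /* Partition only when the scene's shader count is low enough for sort
   * partitioning to pay off; otherwise fall back to a single partition. */
  static int num_sort_partitions(int max_num_paths, unsigned max_scene_shaders)
  {
    const int sort_partition_elements = 65536; /* default elements per partition */
    if (max_scene_shaders < 300) {
      return std::max(max_num_paths / sort_partition_elements, 1);
    }
    return 1;
  }

  int main()
  {
    const int max_num_paths = 1048576; /* hypothetical wavefront size */

    /* Few shaders: partitioning kicks in, 1048576 / 65536 = 16 partitions. */
    int partitions = num_sort_partitions(max_num_paths, 120);
    printf("partitions=%d divisor=%d\n", partitions, divide_up(max_num_paths, partitions));

    /* Many shaders: partitioning disabled, a single partition spans all paths. */
    partitions = num_sort_partitions(max_num_paths, 1024);
    printf("partitions=%d divisor=%d\n", partitions, divide_up(max_num_paths, partitions));
    return 0;
  }

The per-backend overrides then only differ in how sort_partition_elements is chosen (MetalInfo::optimal_sort_partition_elements() on Metal, a thread-count-based value on oneAPI) and in whether the shader-count cutoff applies at all.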