Merge branch 'blender-v4.5-release'

2025-06-18 08:22:04 +02:00
parent bc1ca4442d 2df163a648
commit 588b9ff3cd
6 changed files with 35 additions and 20 deletions
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -27,7 +27,7 @@ class MetalDeviceQueue : public DeviceQueue {

  int num_concurrent_states(const size_t /*state_size*/) const override;
  int num_concurrent_busy_states(const size_t /*state_size*/) const override;
-  int num_sort_partition_elements() const override;
+  int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;
  bool supports_local_atomic_sort() const override;

  void init_execution() override;
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -297,9 +297,19 @@ int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
  return num_concurrent_states(state_size) / 4;
 }

-int MetalDeviceQueue::num_sort_partition_elements() const
+int MetalDeviceQueue::num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
 {
-  return MetalInfo::optimal_sort_partition_elements();
+  int sort_partition_elements = MetalInfo::optimal_sort_partition_elements();
+  /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
+   * a more sophisticated heuristic we simply disable sort partitioning if the shader count is
+   * high.
+   */
+  if (max_scene_shaders < 300 && sort_partition_elements > 0) {
+    return max(max_num_paths / sort_partition_elements, 1);
+  }
+  else {
+    return 1;
+  }
 }

 bool MetalDeviceQueue::supports_local_atomic_sort() const
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -42,9 +42,14 @@ int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) c
  return 4 * max(8 * max_num_threads, 65536);
 }

-int OneapiDeviceQueue::num_sort_partition_elements() const
+int OneapiDeviceQueue::num_sort_partitions(int max_num_paths, uint /*max_scene_shaders*/) const
 {
-  return (oneapi_device_->get_max_num_threads_per_multiprocessor() >= 128) ? 65536 : 8192;
+  int sort_partition_elements = (oneapi_device_->get_max_num_threads_per_multiprocessor() >= 128) ?
+                                    65536 :
+                                    8192;
+  /* On Intel GPUs, sort partitioning with local sorting is currently the most effective solution
+   * regardless of the number of shaders, so `max_scene_shaders` is not consulted here. */
+  return max(max_num_paths / sort_partition_elements, 1);
 }

 void OneapiDeviceQueue::init_execution()
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -27,7 +27,7 @@ class OneapiDeviceQueue : public DeviceQueue {

  int num_concurrent_busy_states(const size_t state_size) const override;

-  int num_sort_partition_elements() const override;
+  int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;

  void init_execution() override;

--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -99,11 +99,20 @@ class DeviceQueue {
   * value. */
  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;

-  /* Number of elements in a partition of sorted shaders, that improves memory locality of
+  /* Number of sort partitions. Partitioning sorted shaders improves memory locality of
   * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
-  virtual int num_sort_partition_elements() const
+  virtual int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
  {
-    return 65536;
+    /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
+     * a more sophisticated heuristic we simply disable sort partitioning if the shader count is
+     * high.
+     */
+    if (max_scene_shaders < 300) {
+      return max(max_num_paths / 65536, 1);
+    }
+    else {
+      return 1;
+    }
  }

  /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -241,17 +241,8 @@ void PathTraceWorkGPU::alloc_integrator_queue()

 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
-  /* Compute sort partitions, to balance between memory locality and coherence.
-   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
-   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
-   */
-  num_sort_partitions_ = 1;
-  if (device_scene_->data.max_shaders < 300) {
-    const int num_elements = queue_->num_sort_partition_elements();
-    if (num_elements) {
-      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
-    }
-  }
+  num_sort_partitions_ = queue_->num_sort_partitions(max_num_paths_,
+                                                     device_scene_->data.max_shaders);

  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
                                                                num_sort_partitions_);