Merge branch 'blender-v4.5-release'

This commit is contained in:
Xavier Hallade
2025-06-18 08:22:04 +02:00
6 changed files with 35 additions and 20 deletions

View File

@@ -27,7 +27,7 @@ class MetalDeviceQueue : public DeviceQueue {
int num_concurrent_states(const size_t /*state_size*/) const override;
int num_concurrent_busy_states(const size_t /*state_size*/) const override;
int num_sort_partition_elements() const override;
int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;
bool supports_local_atomic_sort() const override;
void init_execution() override;

View File

@@ -297,9 +297,19 @@ int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
return num_concurrent_states(state_size) / 4;
}
/* NOTE(review): this hunk shows the replaced definition interleaved with its replacement (the
 * diff markers are not present in this view): the first signature and the first `return` belong
 * to the old `num_sort_partition_elements()`, everything else to the new
 * `num_sort_partitions()`.
 *
 * The new function returns how many sort partitions to use for `max_num_paths` paths, given the
 * number of shaders in the scene. */
int MetalDeviceQueue::num_sort_partition_elements() const
int MetalDeviceQueue::num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
{
return MetalInfo::optimal_sort_partition_elements();
int sort_partition_elements = MetalInfo::optimal_sort_partition_elements();
/* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
* a more sophisticated heuristic we simply disable sort partitioning if the shader count is
* high.
*/
/* The `> 0` test also keeps the division below from dividing by zero. */
if (max_scene_shaders < 300 && sort_partition_elements > 0) {
/* Integer division can yield 0 when there are fewer paths than partition elements, so clamp to
 * at least one partition. */
return max(max_num_paths / sort_partition_elements, 1);
}
else {
/* Partitioning disabled: report a single partition. */
return 1;
}
}
bool MetalDeviceQueue::supports_local_atomic_sort() const

View File

@@ -42,9 +42,14 @@ int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) c
return 4 * max(8 * max_num_threads, 65536);
}
/* NOTE(review): this hunk shows the replaced definition interleaved with its replacement (the
 * diff markers are not present in this view): the first signature and the first `return` belong
 * to the old `num_sort_partition_elements()`, everything else to the new
 * `num_sort_partitions()`.
 *
 * The new function returns how many sort partitions to use for `max_num_paths` paths. The shader
 * count parameter is intentionally unused here (its name is commented out). */
int OneapiDeviceQueue::num_sort_partition_elements() const
int OneapiDeviceQueue::num_sort_partitions(int max_num_paths, uint /*max_scene_shaders*/) const
{
return (oneapi_device_->get_max_num_threads_per_multiprocessor() >= 128) ? 65536 : 8192;
/* Partition size: 65536 elements when the device reports at least 128 threads per
 * multiprocessor, 8192 otherwise. */
int sort_partition_elements = (oneapi_device_->get_max_num_threads_per_multiprocessor() >= 128) ?
65536 :
8192;
/* Sort partitioning with local sorting on Intel GPUs is currently the most effective solution no
* matter the number of shaders. */
/* Integer division can yield 0 when there are fewer paths than partition elements, so clamp to
 * at least one partition. */
return max(max_num_paths / sort_partition_elements, 1);
}
void OneapiDeviceQueue::init_execution()

View File

@@ -27,7 +27,7 @@ class OneapiDeviceQueue : public DeviceQueue {
int num_concurrent_busy_states(const size_t state_size) const override;
int num_sort_partition_elements() const override;
int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;
void init_execution() override;

View File

@@ -99,11 +99,20 @@ class DeviceQueue {
* value. */
virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
/* Number of elements in a partition of sorted shaders, that improves memory locality of
/* Number of partitions of sorted shaders, that improves memory locality of
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */
/* NOTE(review): this hunk shows the replaced default implementation interleaved with its
 * replacement (the diff markers are not present in this view): the first signature and
 * `return 65536;` belong to the old `num_sort_partition_elements()`, everything else to the new
 * `num_sort_partitions()`. */
virtual int num_sort_partition_elements() const
virtual int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
{
return 65536;
/* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
* a more sophisticated heuristic we simply disable sort partitioning if the shader count is
* high.
*/
if (max_scene_shaders < 300) {
/* Default partition size is 65536 elements; clamp to at least one partition because the integer
 * division yields 0 for fewer than 65536 paths. */
return max(max_num_paths / 65536, 1);
}
else {
/* Partitioning disabled: report a single partition. */
return 1;
}
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and

View File

@@ -241,17 +241,8 @@ void PathTraceWorkGPU::alloc_integrator_queue()
void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Compute sort partitions, to balance between memory locality and coherence.
* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
* more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
*/
num_sort_partitions_ = 1;
if (device_scene_->data.max_shaders < 300) {
const int num_elements = queue_->num_sort_partition_elements();
if (num_elements) {
num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
}
}
num_sort_partitions_ = queue_->num_sort_partitions(max_num_paths_,
device_scene_->data.max_shaders);
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);