diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index a93e305999a..a391484024b 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -15,6 +15,7 @@ #include "util/log.h" #include "util/string.h" +#include "kernel/device/gpu/block_sizes.h" #include "kernel/types.h" CCL_NAMESPACE_BEGIN @@ -592,8 +593,12 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel, const DeviceKernelArguments args( &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel); - queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args); - queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, + GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_, + args); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, + GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_, + args); return; } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 5ebcbaee76a..5b5a955ed41 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -62,6 +62,7 @@ set(SRC_KERNEL_DEVICE_CPU_HEADERS device/cpu/kernel_arch_impl.h ) set(SRC_KERNEL_DEVICE_GPU_HEADERS + device/gpu/block_sizes.h device/gpu/image.h device/gpu/kernel.h device/gpu/parallel_active_index.h diff --git a/intern/cycles/kernel/device/gpu/block_sizes.h b/intern/cycles/kernel/device/gpu/block_sizes.h new file mode 100644 index 00000000000..8fe74ef80a6 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/block_sizes.h @@ -0,0 +1,18 @@ +/* SPDX-FileCopyrightText: 2017-2025 Blender Foundation + * + * SPDX-License-Identifier: Apache-2.0 */ + +#pragma once + +#ifdef __HIP__ /* Default block sizes: 1024 when __HIP__ is defined, 512 otherwise. */ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +# define 
GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else /* __HIP__ not defined. */ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif + +#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) /* Bitwise NOT of 0, i.e. all bits set. NOTE(review): presumably the key marking inactive entries - confirm in parallel_sorted_index.h. */ +#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024 /* Same value for every build. PathTraceWorkGPU::compute_sorted_queued_paths multiplies it by num_sort_partitions_ when enqueueing the sort bucket and sort write passes. */ diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index e1e74d8988b..a65d0f26a1a 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -11,14 +11,9 @@ CCL_NAMESPACE_BEGIN * * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ +#include "kernel/device/gpu/block_sizes.h" #include "util/atomic.h" -#ifdef __HIP__ -# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 -#else -# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 -#endif - /* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`, * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices * and keep device specific code in `compat.h`. 
*/ diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index 5abf056ed01..8877e8c8d30 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -15,12 +15,6 @@ CCL_NAMESPACE_BEGIN #include "util/atomic.h" -#ifdef __HIP__ -# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 -#else -# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 -#endif - __device__ void gpu_parallel_prefix_sum(const int global_id, ccl_global int *counter, ccl_global int *prefix_sum, diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index ecfc9ec8984..f24b97594cf 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -12,16 +12,9 @@ CCL_NAMESPACE_BEGIN * * TODO: there may be ways to optimize this to avoid this many atomic ops? */ +#include "kernel/device/gpu/block_sizes.h" #include "util/atomic.h" -#ifdef __HIP__ -# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 -#else -# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 -#endif -#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024 - #if defined(__KERNEL_LOCAL_ATOMIC_SORT__) ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,