From 930a942dd070dd037ff78e2f35e8ecac53241a6c Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 1 Aug 2025 13:26:02 +0200 Subject: [PATCH] Refactor: Cycles: Move block sizes into common header This change puts all the block size macros in the same common header, so they can be included in host side code without needing to also include the kernels that are defined in the device headers that contained these values. This change also removes a magic number used to enqueue a kernel, which happened to agree with the GPU_PARALLEL_SORT_BLOCK_SIZE macro. Pull Request: https://projects.blender.org/blender/blender/pulls/143646 --- .../cycles/integrator/path_trace_work_gpu.cpp | 9 +++++++-- intern/cycles/kernel/CMakeLists.txt | 1 + intern/cycles/kernel/device/gpu/block_sizes.h | 18 ++++++++++++++++++ .../kernel/device/gpu/parallel_active_index.h | 7 +------ .../kernel/device/gpu/parallel_prefix_sum.h | 6 ------ .../kernel/device/gpu/parallel_sorted_index.h | 9 +-------- 6 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 intern/cycles/kernel/device/gpu/block_sizes.h diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index a93e305999a..a391484024b 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -15,6 +15,7 @@ #include "util/log.h" #include "util/string.h" +#include "kernel/device/gpu/block_sizes.h" #include "kernel/types.h" CCL_NAMESPACE_BEGIN @@ -592,8 +593,12 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel, const DeviceKernelArguments args( &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel); - queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args); - queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, + GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_, + args); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, + GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_, + args); return; } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 5ebcbaee76a..5b5a955ed41 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -62,6 +62,7 @@ set(SRC_KERNEL_DEVICE_CPU_HEADERS device/cpu/kernel_arch_impl.h ) set(SRC_KERNEL_DEVICE_GPU_HEADERS + device/gpu/block_sizes.h device/gpu/image.h device/gpu/kernel.h device/gpu/parallel_active_index.h diff --git a/intern/cycles/kernel/device/gpu/block_sizes.h b/intern/cycles/kernel/device/gpu/block_sizes.h new file mode 100644 index 00000000000..8fe74ef80a6 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/block_sizes.h @@ -0,0 +1,18 @@ +/* SPDX-FileCopyrightText: 2017-2025 Blender Foundation + * + * SPDX-License-Identifier: Apache-2.0 */ + +#pragma once + +#ifdef __HIP__ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif + +#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) +#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024 diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index e1e74d8988b..a65d0f26a1a 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -11,14 +11,9 @@ CCL_NAMESPACE_BEGIN * * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ +#include "kernel/device/gpu/block_sizes.h" #include "util/atomic.h" -#ifdef __HIP__ -# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 -#else -# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 -#endif - /* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`, * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices * and keep device specific code in `compat.h`. */ diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index 5abf056ed01..8877e8c8d30 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -15,12 +15,6 @@ CCL_NAMESPACE_BEGIN #include "util/atomic.h" -#ifdef __HIP__ -# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 -#else -# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 -#endif - __device__ void gpu_parallel_prefix_sum(const int global_id, ccl_global int *counter, ccl_global int *prefix_sum, diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index ecfc9ec8984..f24b97594cf 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -12,16 +12,9 @@ CCL_NAMESPACE_BEGIN * * TODO: there may be ways to optimize this to avoid this many atomic ops? */ +#include "kernel/device/gpu/block_sizes.h" #include "util/atomic.h" -#ifdef __HIP__ -# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 -#else -# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 -#endif -#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024 - #if defined(__KERNEL_LOCAL_ATOMIC_SORT__) ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,