Refactor: Cycles: Move block sizes into common header

This change moves all of the block-size macros into a single common
header, so that they can be included from host-side code without also
pulling in the kernels defined in the device headers that previously
contained these values.

This change also removes a magic number used to enqueue a kernel; the
value happened to match the GPU_PARALLEL_SORT_BLOCK_SIZE macro, which is
now used in its place.
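
For illustration, a minimal host-side sketch of the pattern this enables.
The kernel enums, argument struct, and the block_sizes.h path appear in
the diff below; the DeviceQueue type, the other include paths, and the
wrapper function are assumptions based on the surrounding Cycles code,
not part of this commit:

  /* Host-side code now only needs the lightweight macro header plus the
   * usual queue/kernel declarations, not the device kernel headers. */
  #include "device/queue.h"
  #include "kernel/device/gpu/block_sizes.h"
  #include "kernel/types.h"

  /* Hypothetical wrapper: size the sort passes from the named block-size
   * constant instead of a literal 1024. */
  static void enqueue_sort_passes(DeviceQueue *queue,
                                  const DeviceKernelArguments &args,
                                  const int num_sort_partitions)
  {
    const int work_size = GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions;
    queue->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, work_size, args);
    queue->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, work_size, args);
  }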

Pull Request: https://projects.blender.org/blender/blender/pulls/143646
Author: Hugh Delaney, 2025-08-01 13:26:02 +02:00
Committer: Nikita Sirgienko
Parent: 3905cdd89a
Commit: 930a942dd0
6 changed files with 28 additions and 22 deletions

View File

@@ -15,6 +15,7 @@
#include "util/log.h"
#include "util/string.h"
#include "kernel/device/gpu/block_sizes.h"
#include "kernel/types.h"
CCL_NAMESPACE_BEGIN
@@ -592,8 +593,12 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
     const DeviceKernelArguments args(
         &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
 
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
+                    GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
+                    args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
+                    GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
+                    args);
 
     return;
   }

View File

@@ -62,6 +62,7 @@ set(SRC_KERNEL_DEVICE_CPU_HEADERS
   device/cpu/kernel_arch_impl.h
 )
 set(SRC_KERNEL_DEVICE_GPU_HEADERS
+  device/gpu/block_sizes.h
   device/gpu/image.h
   device/gpu/kernel.h
   device/gpu/parallel_active_index.h

View File

@@ -0,0 +1,18 @@
+/* SPDX-FileCopyrightText: 2017-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+#ifdef __HIP__
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
+
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
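
As a rough usage sketch of the new header (only the macro itself comes
from this commit; the helper below is hypothetical), a host-side caller
would round a state count up to whole sort blocks like this:

  #include "kernel/device/gpu/block_sizes.h"

  /* Hypothetical helper: number of whole sort blocks needed to cover
   * `num_states` integrator states, rounding up. */
  static inline int sort_blocks_for_states(const int num_states)
  {
    return (num_states + GPU_PARALLEL_SORT_BLOCK_SIZE - 1) /
           GPU_PARALLEL_SORT_BLOCK_SIZE;
  }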

View File

@@ -11,14 +11,9 @@ CCL_NAMESPACE_BEGIN
  *
  * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "util/atomic.h"
 
-#ifdef __HIP__
-# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
-#else
-# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
-#endif
-
 /* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`,
  * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices
  * and keep device specific code in `compat.h`. */

View File

@@ -15,12 +15,6 @@ CCL_NAMESPACE_BEGIN
#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
__device__ void gpu_parallel_prefix_sum(const int global_id,
ccl_global int *counter,
ccl_global int *prefix_sum,

View File

@@ -12,16 +12,9 @@ CCL_NAMESPACE_BEGIN
  *
  * TODO: there may be ways to optimize this to avoid this many atomic ops? */
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "util/atomic.h"
 
-#ifdef __HIP__
-# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
-#else
-# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
-#endif
-#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
-
 #if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
 
 ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,