From 930a942dd070dd037ff78e2f35e8ecac53241a6c Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hdelaney@qti.qualcomm.com>
Date: Fri, 1 Aug 2025 13:26:02 +0200
Subject: [PATCH] Refactor: Cycles: Move block sizes into common header

This change puts all the block size macros in the same common header, so
they can be included in host side code without needing to also include
the kernels that are defined in the device headers that contained these
values.

This change also removes a magic number used to enqueue a kernel, which
happened to agree with the GPU_PARALLEL_SORT_BLOCK_SIZE macro.

Pull Request: https://projects.blender.org/blender/blender/pulls/143646
---
 .../cycles/integrator/path_trace_work_gpu.cpp  |  9 +++++++--
 intern/cycles/kernel/CMakeLists.txt            |  1 +
 intern/cycles/kernel/device/gpu/block_sizes.h  | 18 ++++++++++++++++++
 .../kernel/device/gpu/parallel_active_index.h  |  7 +------
 .../kernel/device/gpu/parallel_prefix_sum.h    |  6 ------
 .../kernel/device/gpu/parallel_sorted_index.h  |  9 +--------
 6 files changed, 28 insertions(+), 22 deletions(-)
 create mode 100644 intern/cycles/kernel/device/gpu/block_sizes.h

diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index a93e305999a..a391484024b 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -15,6 +15,7 @@
 #include "util/log.h"
 #include "util/string.h"
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "kernel/types.h"
 
 CCL_NAMESPACE_BEGIN
@@ -592,8 +593,12 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
     const DeviceKernelArguments args(
         &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
 
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
+                    GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
+                    args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
+                    GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
+                    args);
     return;
   }
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 5ebcbaee76a..5b5a955ed41 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -62,6 +62,7 @@ set(SRC_KERNEL_DEVICE_CPU_HEADERS
   device/cpu/kernel_arch_impl.h
 )
 set(SRC_KERNEL_DEVICE_GPU_HEADERS
+  device/gpu/block_sizes.h
   device/gpu/image.h
   device/gpu/kernel.h
   device/gpu/parallel_active_index.h
diff --git a/intern/cycles/kernel/device/gpu/block_sizes.h b/intern/cycles/kernel/device/gpu/block_sizes.h
new file mode 100644
index 00000000000..8fe74ef80a6
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/block_sizes.h
@@ -0,0 +1,18 @@
+/* SPDX-FileCopyrightText: 2017-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+#ifdef __HIP__
+#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
+
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index e1e74d8988b..a65d0f26a1a 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -11,14 +11,9 @@ CCL_NAMESPACE_BEGIN
  *
  * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "util/atomic.h"
 
-#ifdef __HIP__
-#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
-#else
-#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
-#endif
-
 /* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`,
  * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices
  * and keep device specific code in `compat.h`. */
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index 5abf056ed01..8877e8c8d30 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -15,12 +15,6 @@ CCL_NAMESPACE_BEGIN
 
 #include "util/atomic.h"
 
-#ifdef __HIP__
-#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
-#else
-#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
-#endif
-
 __device__ void gpu_parallel_prefix_sum(const int global_id,
                                         ccl_global int *counter,
                                         ccl_global int *prefix_sum,
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index ecfc9ec8984..f24b97594cf 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -12,16 +12,9 @@ CCL_NAMESPACE_BEGIN
  *
  * TODO: there may be ways to optimize this to avoid this many atomic ops? */
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "util/atomic.h"
 
-#ifdef __HIP__
-#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
-#else
-#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
-#endif
-#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
-
 #if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
 
 ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,