Refactor: Cycles: Move block sizes into common header

This change moves all of the block-size macros into a single common
header, so that they can be included from host-side code without also
pulling in the kernels defined in the device headers that previously
contained these values.

This change also removes a magic number used to enqueue a kernel; the
value happened to match the GPU_PARALLEL_SORT_BLOCK_SIZE macro, which is
now used in its place.
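
For illustration, a minimal host-side sketch of the pattern this enables.
The kernel enums, argument struct, and the block_sizes.h path appear in
the diff below; the DeviceQueue type, the other include paths, and the
wrapper function are assumptions based on the surrounding Cycles code,
not part of this commit:

  /* Host-side code now only needs the lightweight macro header plus the
   * usual queue/kernel declarations, not the device kernel headers. */
  #include "device/queue.h"
  #include "kernel/device/gpu/block_sizes.h"
  #include "kernel/types.h"

  /* Hypothetical wrapper: size the sort passes from the named block-size
   * constant instead of a literal 1024. */
  static void enqueue_sort_passes(DeviceQueue *queue,
                                  const DeviceKernelArguments &args,
                                  const int num_sort_partitions)
  {
    const int work_size = GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions;
    queue->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, work_size, args);
    queue->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, work_size, args);
  }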

Pull Request: https://projects.blender.org/blender/blender/pulls/143646
Author: Hugh Delaney, 2025-08-01 13:26:02 +02:00
Committer: Nikita Sirgienko
Parent: 3905cdd89a
Commit: 930a942dd0
6 changed files with 28 additions and 22 deletions

View File

@@ -15,6 +15,7 @@
#include "util/log.h"
#include "util/string.h"
#include "kernel/device/gpu/block_sizes.h"
#include "kernel/types.h"
CCL_NAMESPACE_BEGIN
@@ -592,8 +593,12 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
     const DeviceKernelArguments args(
         &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
 
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
+                    GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
+                    args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
+                    GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
+                    args);
 
     return;
   }

View File

@@ -62,6 +62,7 @@ set(SRC_KERNEL_DEVICE_CPU_HEADERS
   device/cpu/kernel_arch_impl.h
 )
 set(SRC_KERNEL_DEVICE_GPU_HEADERS
+  device/gpu/block_sizes.h
   device/gpu/image.h
   device/gpu/kernel.h
   device/gpu/parallel_active_index.h

View File

@@ -0,0 +1,18 @@
+/* SPDX-FileCopyrightText: 2017-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+#ifdef __HIP__
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
+
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
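
As a rough usage sketch of the new header (only the macro itself comes
from this commit; the helper below is hypothetical), a host-side caller
would round a state count up to whole sort blocks like this:

  #include "kernel/device/gpu/block_sizes.h"

  /* Hypothetical helper: number of whole sort blocks needed to cover
   * `num_states` integrator states, rounding up. */
  static inline int sort_blocks_for_states(const int num_states)
  {
    return (num_states + GPU_PARALLEL_SORT_BLOCK_SIZE - 1) /
           GPU_PARALLEL_SORT_BLOCK_SIZE;
  }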

View File

@@ -11,14 +11,9 @@ CCL_NAMESPACE_BEGIN
  *
  * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "util/atomic.h"
 
-#ifdef __HIP__
-# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
-#else
-# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
-#endif
-
 /* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`,
  * `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices
  * and keep device specific code in `compat.h`. */

View File

@@ -15,12 +15,6 @@ CCL_NAMESPACE_BEGIN
#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
__device__ void gpu_parallel_prefix_sum(const int global_id,
ccl_global int *counter,
ccl_global int *prefix_sum,

View File

@@ -12,16 +12,9 @@ CCL_NAMESPACE_BEGIN
  *
  * TODO: there may be ways to optimize this to avoid this many atomic ops? */
 
+#include "kernel/device/gpu/block_sizes.h"
 #include "util/atomic.h"
 
-#ifdef __HIP__
-# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
-#else
-# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
-#endif
-#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
-
 #if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
 
 ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,