Refactor: Cycles: Move block sizes into common header
This change puts all the block size macros in the same common header, so they can be included in host side code without needing to also include the kernels that are defined in the device headers that contained these values. This change also removes a magic number used to enqueue a kernel, which happened to agree with the GPU_PARALLEL_SORT_BLOCK_SIZE macro. Pull Request: https://projects.blender.org/blender/blender/pulls/143646
This commit is contained in:
committed by
Nikita Sirgienko
parent
3905cdd89a
commit
930a942dd0
@@ -15,6 +15,7 @@
|
||||
#include "util/log.h"
|
||||
#include "util/string.h"
|
||||
|
||||
#include "kernel/device/gpu/block_sizes.h"
|
||||
#include "kernel/types.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
@@ -592,8 +593,12 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
|
||||
const DeviceKernelArguments args(
|
||||
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
|
||||
GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
|
||||
args);
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
|
||||
GPU_PARALLEL_SORT_BLOCK_SIZE * num_sort_partitions_,
|
||||
args);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -62,6 +62,7 @@ set(SRC_KERNEL_DEVICE_CPU_HEADERS
|
||||
device/cpu/kernel_arch_impl.h
|
||||
)
|
||||
set(SRC_KERNEL_DEVICE_GPU_HEADERS
|
||||
device/gpu/block_sizes.h
|
||||
device/gpu/image.h
|
||||
device/gpu/kernel.h
|
||||
device/gpu/parallel_active_index.h
|
||||
|
||||
18
intern/cycles/kernel/device/gpu/block_sizes.h
Normal file
18
intern/cycles/kernel/device/gpu/block_sizes.h
Normal file
@@ -0,0 +1,18 @@
|
||||
/* SPDX-FileCopyrightText: 2017-2025 Blender Foundation
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 */
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
|
||||
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
|
||||
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
|
||||
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
|
||||
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
|
||||
@@ -11,14 +11,9 @@ CCL_NAMESPACE_BEGIN
|
||||
*
|
||||
* Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
|
||||
|
||||
#include "kernel/device/gpu/block_sizes.h"
|
||||
#include "util/atomic.h"
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
/* TODO: abstract more device differences, define `ccl_gpu_local_syncthreads`,
|
||||
* `ccl_gpu_thread_warp`, `ccl_gpu_warp_index`, `ccl_gpu_num_warps` for all devices
|
||||
* and keep device specific code in `compat.h`. */
|
||||
|
||||
@@ -15,12 +15,6 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#include "util/atomic.h"
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
__device__ void gpu_parallel_prefix_sum(const int global_id,
|
||||
ccl_global int *counter,
|
||||
ccl_global int *prefix_sum,
|
||||
|
||||
@@ -12,16 +12,9 @@ CCL_NAMESPACE_BEGIN
|
||||
*
|
||||
* TODO: there may be ways to optimize this to avoid this many atomic ops? */
|
||||
|
||||
#include "kernel/device/gpu/block_sizes.h"
|
||||
#include "util/atomic.h"
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
|
||||
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
|
||||
|
||||
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
|
||||
|
||||
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
|
||||
|
||||
Reference in New Issue
Block a user