Cycles: oneAPI: set correct work group sizes for kernels that have a predefined one

Nikita Sirgienko
2023-05-17 00:02:12 +02:00
parent a17d07ee87
commit b8173278b0


@@ -109,7 +109,10 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
assert(queue);
(void)kernel_global_size;
const static size_t preferred_work_group_size_intersect_shading = 32;
const static size_t preferred_work_group_size_technical = 1024;
/* Shader evaluation kernels seem to use some amount of shared memory, so it is better
 * to avoid using the maximum work group size for them. */
const static size_t preferred_work_group_size_shader_evaluation = 256;
const static size_t preferred_work_group_size_default = 1024;
size_t preferred_work_group_size = 0;
switch (kernel) {
@@ -133,19 +136,36 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
preferred_work_group_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
break;
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
case DEVICE_KERNEL_INTEGRATOR_RESET:
case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
preferred_work_group_size = preferred_work_group_size_technical;
preferred_work_group_size = GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE;
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
preferred_work_group_size = GPU_PARALLEL_SORT_BLOCK_SIZE;
break;
case DEVICE_KERNEL_PREFIX_SUM:
preferred_work_group_size = GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE;
break;
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
preferred_work_group_size = preferred_work_group_size_shader_evaluation;
break;
default:
preferred_work_group_size = 512;
preferred_work_group_size = preferred_work_group_size_default;
break;
}
const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
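For context, a minimal sketch (not the Cycles implementation) of the two constraints the code above relies on: the preferred work group size is clamped to the SYCL device limit, and the global size must be a multiple of the local size so that the assert in oneapi_enqueue_kernel holds. The helper names are hypothetical.

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

/* Hypothetical helper: clamp a preferred work group size to the device limit. */
static size_t clamp_local_size(sycl::queue &queue, size_t preferred_work_group_size)
{
  const size_t limit_work_group_size =
      queue.get_device().get_info<sycl::info::device::max_work_group_size>();
  return std::min(limit_work_group_size, preferred_work_group_size);
}

/* Hypothetical helper: round the global size up to a multiple of the local size,
 * so that global_size % local_size == 0 before kernel submission. */
static size_t round_up_global_size(size_t global_size, size_t local_size)
{
  return ((global_size + local_size - 1) / local_size) * local_size;
}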
@@ -316,12 +336,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
kernel_context->queue, device_kernel, global_size);
assert(global_size % local_size == 0);
/* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we
* overwrite it outside of oneapi_kernel_preferred_local_size. */
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) {
local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
}
/* Kernels listed below need a specific number of work groups. */
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||