Cycles: oneAPI: Refactoring of local size choice logic

Nikita Sirgienko
2023-08-22 19:04:16 +02:00
parent 42bf06a57d
commit abab47a805
5 changed files with 148 additions and 125 deletions


@@ -748,13 +748,107 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
 #  undef KERNEL_DATA_ARRAY
 }

-bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
-                                  int kernel,
-                                  size_t global_size,
-                                  void **args)
+bool OneapiDevice::enqueue_kernel(
+    KernelContext *kernel_context, int kernel, size_t global_size, size_t local_size, void **args)
 {
-  return oneapi_enqueue_kernel(
-      kernel_context, kernel, global_size, kernel_features, use_hardware_raytracing, args);
+  return oneapi_enqueue_kernel(kernel_context,
+                               kernel,
+                               global_size,
+                               local_size,
+                               kernel_features,
+                               use_hardware_raytracing,
+                               args);
 }

+void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
+                                                       const DeviceKernel kernel,
+                                                       size_t &kernel_global_size,
+                                                       size_t &kernel_local_size)
+{
+  assert(queue);
+
+  const static size_t preferred_work_group_size_intersect_shading = 32;
+  /* Shader evaluation kernels seem to use some amount of shared memory, so it is better
+   * to avoid the maximum work-group size for them. */
+  const static size_t preferred_work_group_size_shader_evaluation = 256;
+  /* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
+   * for now their work-group size is restricted to 512. */
+  const static size_t preferred_work_group_size_cryptomatte = 512;
+  const static size_t preferred_work_group_size_default = 1024;
+
+  size_t preferred_work_group_size = 0;
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT:
+      preferred_work_group_size = preferred_work_group_size_intersect_shading;
+      break;
+
+    case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+      preferred_work_group_size = preferred_work_group_size_cryptomatte;
+      break;
+
+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      preferred_work_group_size = preferred_work_group_size_shader_evaluation;
+      break;
+
+    default:
+      /* Do nothing and keep the initial zero value. */
+      break;
+  }
+
+  /* This order of checks lets us override Blender's default values when needed,
+   * yet respect them otherwise. */
+  if (preferred_work_group_size == 0) {
+    preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel);
+  }
+
+  /* If there is no recommendation, then use the manual default value. */
+  if (preferred_work_group_size == 0) {
+    preferred_work_group_size = preferred_work_group_size_default;
+  }
+
+  const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
+                                           ->get_device()
+                                           .get_info<sycl::info::device::max_work_group_size>();
+
+  kernel_local_size = std::min(limit_work_group_size, preferred_work_group_size);
+
+  /* NOTE(@nsirgien): As for now, non-uniform work-groups don't work on most oneAPI devices,
+   * so we extend the work size to fit the uniformity requirements. */
+  kernel_global_size = round_up(kernel_global_size, kernel_local_size);
+
+#  ifdef WITH_ONEAPI_SYCL_HOST_TASK
+  /* Kernels listed below need a specific number of work groups. */
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
+  {
+    /* The path-array implementation is serial in the case of SYCL Host Task execution. */
+    kernel_global_size = 1;
+    kernel_local_size = 1;
+  }
+#  endif
+
+  assert(kernel_global_size % kernel_local_size == 0);
+}
+
 /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
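
To make the new get_adjusted_global_and_local_sizes logic concrete: the local size is the preferred work-group size clamped to the device limit, and the global size is then rounded up so that every work-group is uniform. A minimal standalone sketch of that arithmetic (the round_up helper here mirrors Blender's utility of the same name; the numbers are illustrative):

#include <algorithm>
#include <cassert>
#include <cstddef>

/* Round `size` up to the next multiple of `multiple` (assumes multiple > 0). */
static size_t round_up(const size_t size, const size_t multiple)
{
  return ((size + multiple - 1) / multiple) * multiple;
}

int main()
{
  const size_t preferred_work_group_size = 32; /* E.g. an intersection/shading kernel. */
  const size_t limit_work_group_size = 1024;   /* sycl::info::device::max_work_group_size. */

  const size_t local_size = std::min(limit_work_group_size, preferred_work_group_size);
  const size_t global_size = round_up(1000, local_size); /* 1000 -> 1024: 32 groups of 32. */

  assert(global_size % local_size == 0); /* Uniform work-groups. */
  return 0;
}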


@@ -118,7 +118,15 @@ class OneapiDevice : public Device {
                          void *kernel_globals,
                          const char *memory_name,
                          void *memory_device_pointer);
-  bool enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, void **args);
+  bool enqueue_kernel(KernelContext *kernel_context,
+                      int kernel,
+                      size_t global_size,
+                      size_t local_size,
+                      void **args);
+  void get_adjusted_global_and_local_sizes(SyclQueue *queue,
+                                           const DeviceKernel kernel,
+                                           size_t &kernel_global_size,
+                                           size_t &kernel_local_size);
   SyclQueue *sycl_queue();

 protected:


@@ -77,18 +77,18 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
   debug_enqueue_begin(kernel, signed_kernel_work_size);
   assert(signed_kernel_work_size >= 0);

-  size_t kernel_work_size = (size_t)signed_kernel_work_size;
+  size_t kernel_global_size = (size_t)signed_kernel_work_size;
+  size_t kernel_local_size;

   assert(kernel_context_);
   kernel_context_->scene_max_shaders = oneapi_device_->scene_max_shaders();

-  size_t kernel_local_size = oneapi_kernel_preferred_local_size(
-      kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
-  size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
+  oneapi_device_->get_adjusted_global_and_local_sizes(
+      kernel_context_->queue, kernel, kernel_global_size, kernel_local_size);

   /* Call the oneAPI kernel DLL to launch the requested kernel. */
   bool is_finished_ok = oneapi_device_->enqueue_kernel(
-      kernel_context_, kernel, uniformed_kernel_work_size, args);
+      kernel_context_, kernel, kernel_global_size, kernel_local_size, args);

   if (is_finished_ok == false) {
     oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) +

@@ -41,6 +41,37 @@ void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
   s_error_user_ptr = user_ptr;
 }

+size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel)
+{
+  /* These defines are available only to the device code, so this function seems to be
+   * the most reasonable way to provide access to them for the host code. */
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+      return GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
+
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
+      return GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE;
+
+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
+      return GPU_PARALLEL_SORT_BLOCK_SIZE;
+
+    case DEVICE_KERNEL_PREFIX_SUM:
+      return GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE;
+
+    default:
+      return (size_t)0;
+  }
+}
+
 /* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like
  * memory allocations, memory transfers and execution of kernel with USM memory. */
 bool oneapi_run_test_kernel(SyclQueue *queue_)
@@ -103,90 +134,6 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
   return is_computation_correct;
 }

-/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
-/* TODO: Move below function to oneapi/queue.cpp. */
-size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
-                                          const DeviceKernel kernel,
-                                          const size_t kernel_global_size)
-{
-  assert(queue);
-  (void)kernel_global_size;
-
-  const static size_t preferred_work_group_size_intersect_shading = 32;
-  /* Shader evaluation kernels seems to use some amount of shared memory, so better
-   * to avoid usage of maximum work group sizes for them. */
-  const static size_t preferred_work_group_size_shader_evaluation = 256;
-  /* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
-   * for now their work-group size is restricted to 512. */
-  const static size_t preferred_work_group_size_cryptomatte = 512;
-  const static size_t preferred_work_group_size_default = 1024;
-
-  size_t preferred_work_group_size = 0;
-  switch (kernel) {
-    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
-    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
-    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
-    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
-    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
-    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
-    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
-    case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT:
-      preferred_work_group_size = preferred_work_group_size_intersect_shading;
-      break;
-
-    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
-      preferred_work_group_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
-      break;
-
-    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
-    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
-    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
-      preferred_work_group_size = GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE;
-      break;
-
-    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
-    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
-      preferred_work_group_size = GPU_PARALLEL_SORT_BLOCK_SIZE;
-      break;
-
-    case DEVICE_KERNEL_PREFIX_SUM:
-      preferred_work_group_size = GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE;
-      break;
-
-    case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
-      preferred_work_group_size = preferred_work_group_size_cryptomatte;
-      break;
-
-    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
-    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
-    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
-      preferred_work_group_size = preferred_work_group_size_shader_evaluation;
-      break;
-
-    default:
-      preferred_work_group_size = preferred_work_group_size_default;
-      break;
-  }
-
-  const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
-                                           ->get_device()
-                                           .get_info<sycl::info::device::max_work_group_size>();
-
-  return std::min(limit_work_group_size, preferred_work_group_size);
-}
-
 bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
                                             const uint kernel_features)
 {
@@ -331,6 +278,7 @@ bool oneapi_load_kernels(SyclQueue *queue_,
 bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                            int kernel,
                            size_t global_size,
+                           size_t local_size,
                            const uint kernel_features,
                            bool use_hardware_raytracing,
                            void **args)
@@ -344,33 +292,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
     return false;
   }

-  size_t local_size = oneapi_kernel_preferred_local_size(
-      kernel_context->queue, device_kernel, global_size);
   assert(global_size % local_size == 0);

-  /* Kernels listed below need a specific number of work groups. */
-  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
-      device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
-      device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
-      device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
-      device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
-      device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
-      device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
-  {
-    int num_states = *((int *)(args[0]));
-    /* Round up to the next work-group. */
-    size_t groups_count = (num_states + local_size - 1) / local_size;
-    /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
-     * we extend work size to fit uniformity requirements. */
-    global_size = groups_count * local_size;
-
-#  ifdef WITH_ONEAPI_SYCL_HOST_TASK
-    /* Path array implementation is serial in case of SYCL Host Task execution. */
-    global_size = 1;
-    local_size = 1;
-#  endif
-  }
-
   /* Let the compiler throw an error if there are any kernels missing in this implementation. */
 #  if defined(_WIN32)
 #    pragma warning(error : 4062)
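
With oneapi_kernel_preferred_local_size removed, the kernel library now exports only the raw block-size suggestion and the host owns the full fallback chain. A simplified sketch of that chain (choose_local_size is a hypothetical name; it assumes the exported declarations from the final hunk below are in scope):

#include <algorithm>
#include <cstddef>

/* Hypothetical condensation of the host-side selection logic added in this commit:
 * per-kernel preference, then the device-code suggestion, then a manual default,
 * all clamped to the device's max_work_group_size. */
size_t choose_local_size(const DeviceKernel kernel,
                         const size_t per_kernel_preference, /* 32 / 256 / 512, or 0. */
                         const size_t device_limit)
{
  size_t size = per_kernel_preference;
  if (size == 0) {
    size = oneapi_suggested_gpu_kernel_size(kernel); /* GPU_PARALLEL_* block sizes, or 0. */
  }
  if (size == 0) {
    size = 1024; /* Manual default. */
  }
  return std::min(device_limit, size);
}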


@@ -45,11 +45,11 @@ extern "C" {
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
 CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
-CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
-    SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
+CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel);
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
                                                        int kernel,
                                                        size_t global_size,
+                                                       size_t local_size,
                                                        const unsigned int kernel_features,
                                                        bool use_hardware_raytracing,
                                                        void **args);