Cycles: oneAPI: Refactoring of local size choice logic
This commit is contained in:
@@ -748,13 +748,107 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
|
||||
# undef KERNEL_DATA_ARRAY
|
||||
}
|
||||
|
||||
bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
void **args)
|
||||
bool OneapiDevice::enqueue_kernel(
|
||||
KernelContext *kernel_context, int kernel, size_t global_size, size_t local_size, void **args)
|
||||
{
|
||||
return oneapi_enqueue_kernel(
|
||||
kernel_context, kernel, global_size, kernel_features, use_hardware_raytracing, args);
|
||||
return oneapi_enqueue_kernel(kernel_context,
|
||||
kernel,
|
||||
global_size,
|
||||
local_size,
|
||||
kernel_features,
|
||||
use_hardware_raytracing,
|
||||
args);
|
||||
}
|
||||
|
||||
void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
|
||||
const DeviceKernel kernel,
|
||||
size_t &kernel_global_size,
|
||||
size_t &kernel_local_size)
|
||||
{
|
||||
assert(queue);
|
||||
|
||||
const static size_t preferred_work_group_size_intersect_shading = 32;
|
||||
/* Shader evaluation kernels seems to use some amount of shared memory, so better
|
||||
* to avoid usage of maximum work group sizes for them. */
|
||||
const static size_t preferred_work_group_size_shader_evaluation = 256;
|
||||
/* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
|
||||
* for now their work-group size is restricted to 512. */
|
||||
const static size_t preferred_work_group_size_cryptomatte = 512;
|
||||
const static size_t preferred_work_group_size_default = 1024;
|
||||
|
||||
size_t preferred_work_group_size = 0;
|
||||
switch (kernel) {
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT:
|
||||
preferred_work_group_size = preferred_work_group_size_intersect_shading;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
|
||||
preferred_work_group_size = preferred_work_group_size_cryptomatte;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
|
||||
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
|
||||
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
|
||||
preferred_work_group_size = preferred_work_group_size_shader_evaluation;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Do nothing and keep initial zero value. */
|
||||
break;
|
||||
}
|
||||
|
||||
/* Such order of logic allow us to override Blender default values, if needed,
|
||||
* yet respect them overwise. */
|
||||
if (preferred_work_group_size == 0) {
|
||||
preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel);
|
||||
}
|
||||
|
||||
/* If there is no recommendetion, then use manual default value. */
|
||||
if (preferred_work_group_size == 0) {
|
||||
preferred_work_group_size = preferred_work_group_size_default;
|
||||
}
|
||||
|
||||
const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
|
||||
->get_device()
|
||||
.get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
kernel_local_size = std::min(limit_work_group_size, preferred_work_group_size);
|
||||
|
||||
/* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
|
||||
* we extend work size to fit uniformity requirements. */
|
||||
kernel_global_size = round_up(kernel_global_size, kernel_local_size);
|
||||
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
/* Kernels listed below need a specific number of work groups. */
|
||||
if (kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
|
||||
{
|
||||
/* Path array implementation is serial in case of SYCL Host Task execution. */
|
||||
global_size = 1;
|
||||
local_size = 1;
|
||||
}
|
||||
# endif
|
||||
|
||||
assert(uniformed_kernel_work_size % local_size == 0);
|
||||
}
|
||||
|
||||
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
|
||||
|
||||
@@ -118,7 +118,15 @@ class OneapiDevice : public Device {
|
||||
void *kernel_globals,
|
||||
const char *memory_name,
|
||||
void *memory_device_pointer);
|
||||
bool enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, void **args);
|
||||
bool enqueue_kernel(KernelContext *kernel_context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
size_t local_size,
|
||||
void **args);
|
||||
void get_adjusted_global_and_local_sizes(SyclQueue *queue,
|
||||
const DeviceKernel kernel,
|
||||
size_t &kernel_global_size,
|
||||
size_t &kernel_local_size);
|
||||
SyclQueue *sycl_queue();
|
||||
|
||||
protected:
|
||||
|
||||
@@ -77,18 +77,18 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
|
||||
|
||||
debug_enqueue_begin(kernel, signed_kernel_work_size);
|
||||
assert(signed_kernel_work_size >= 0);
|
||||
size_t kernel_work_size = (size_t)signed_kernel_work_size;
|
||||
size_t kernel_global_size = (size_t)signed_kernel_work_size;
|
||||
size_t kernel_local_size;
|
||||
|
||||
assert(kernel_context_);
|
||||
kernel_context_->scene_max_shaders = oneapi_device_->scene_max_shaders();
|
||||
|
||||
size_t kernel_local_size = oneapi_kernel_preferred_local_size(
|
||||
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
|
||||
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
|
||||
oneapi_device_->get_adjusted_global_and_local_sizes(
|
||||
kernel_context_->queue, kernel, kernel_global_size, kernel_local_size);
|
||||
|
||||
/* Call the oneAPI kernel DLL to launch the requested kernel. */
|
||||
bool is_finished_ok = oneapi_device_->enqueue_kernel(
|
||||
kernel_context_, kernel, uniformed_kernel_work_size, args);
|
||||
kernel_context_, kernel, kernel_global_size, kernel_local_size, args);
|
||||
|
||||
if (is_finished_ok == false) {
|
||||
oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) +
|
||||
|
||||
@@ -41,6 +41,37 @@ void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
|
||||
s_error_user_ptr = user_ptr;
|
||||
}
|
||||
|
||||
size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel)
|
||||
{
|
||||
/* This defines are available only to the device code, so making this function
|
||||
* seems to be the most reasonable way to provide access to them for the host code. */
|
||||
switch (kernel) {
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
|
||||
return GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
|
||||
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
|
||||
return GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE;
|
||||
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
|
||||
return GPU_PARALLEL_SORT_BLOCK_SIZE;
|
||||
|
||||
case DEVICE_KERNEL_PREFIX_SUM:
|
||||
return GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE;
|
||||
|
||||
default:
|
||||
return (size_t)0;
|
||||
}
|
||||
}
|
||||
|
||||
/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like
|
||||
* memory allocations, memory transfers and execution of kernel with USM memory. */
|
||||
bool oneapi_run_test_kernel(SyclQueue *queue_)
|
||||
@@ -103,90 +134,6 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
|
||||
return is_computation_correct;
|
||||
}
|
||||
|
||||
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
|
||||
/* TODO: Move below function to oneapi/queue.cpp. */
|
||||
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
|
||||
const DeviceKernel kernel,
|
||||
const size_t kernel_global_size)
|
||||
{
|
||||
assert(queue);
|
||||
(void)kernel_global_size;
|
||||
const static size_t preferred_work_group_size_intersect_shading = 32;
|
||||
/* Shader evaluation kernels seems to use some amount of shared memory, so better
|
||||
* to avoid usage of maximum work group sizes for them. */
|
||||
const static size_t preferred_work_group_size_shader_evaluation = 256;
|
||||
/* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
|
||||
* for now their work-group size is restricted to 512. */
|
||||
const static size_t preferred_work_group_size_cryptomatte = 512;
|
||||
const static size_t preferred_work_group_size_default = 1024;
|
||||
|
||||
size_t preferred_work_group_size = 0;
|
||||
switch (kernel) {
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT:
|
||||
preferred_work_group_size = preferred_work_group_size_intersect_shading;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
|
||||
preferred_work_group_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
|
||||
preferred_work_group_size = GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
|
||||
preferred_work_group_size = GPU_PARALLEL_SORT_BLOCK_SIZE;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_PREFIX_SUM:
|
||||
preferred_work_group_size = GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
|
||||
preferred_work_group_size = preferred_work_group_size_cryptomatte;
|
||||
break;
|
||||
|
||||
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
|
||||
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
|
||||
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
|
||||
preferred_work_group_size = preferred_work_group_size_shader_evaluation;
|
||||
break;
|
||||
|
||||
default:
|
||||
preferred_work_group_size = preferred_work_group_size_default;
|
||||
break;
|
||||
}
|
||||
|
||||
const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
|
||||
->get_device()
|
||||
.get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
return std::min(limit_work_group_size, preferred_work_group_size);
|
||||
}
|
||||
|
||||
bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
|
||||
const uint kernel_features)
|
||||
{
|
||||
@@ -331,6 +278,7 @@ bool oneapi_load_kernels(SyclQueue *queue_,
|
||||
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
size_t local_size,
|
||||
const uint kernel_features,
|
||||
bool use_hardware_raytracing,
|
||||
void **args)
|
||||
@@ -344,33 +292,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t local_size = oneapi_kernel_preferred_local_size(
|
||||
kernel_context->queue, device_kernel, global_size);
|
||||
assert(global_size % local_size == 0);
|
||||
|
||||
/* Kernels listed below need a specific number of work groups. */
|
||||
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
|
||||
{
|
||||
int num_states = *((int *)(args[0]));
|
||||
/* Round up to the next work-group. */
|
||||
size_t groups_count = (num_states + local_size - 1) / local_size;
|
||||
/* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
|
||||
* we extend work size to fit uniformity requirements. */
|
||||
global_size = groups_count * local_size;
|
||||
|
||||
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
|
||||
/* Path array implementation is serial in case of SYCL Host Task execution. */
|
||||
global_size = 1;
|
||||
local_size = 1;
|
||||
# endif
|
||||
}
|
||||
|
||||
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
|
||||
# if defined(_WIN32)
|
||||
# pragma warning(error : 4062)
|
||||
|
||||
@@ -45,11 +45,11 @@ extern "C" {
|
||||
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
|
||||
SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel);
|
||||
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
|
||||
int kernel,
|
||||
size_t global_size,
|
||||
size_t local_size,
|
||||
const unsigned int kernel_features,
|
||||
bool use_hardware_raytracing,
|
||||
void **args);
|
||||
|
||||
Reference in New Issue
Block a user