/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0 */
#ifdef WITH_ONEAPI
/* <algorithm> is needed until it is included upstream in sycl/detail/property_list_base.hpp. */
# include <algorithm>
# include <sycl/sycl.hpp>
# include "device/oneapi/device_impl.h"
# include "util/debug.h"
# include "util/foreach.h"
# include "util/log.h"
# ifdef WITH_EMBREE_GPU
# include "bvh/embree.h"
# endif
# if defined(WITH_OPENIMAGEDENOISE)
# include <OpenImageDenoise/config.h>
# if OIDN_VERSION >= 20300
# include "util/openimagedenoise.h"
# endif
# endif
# include "kernel/device/oneapi/globals.h"
# include "kernel/device/oneapi/kernel.h"
# if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
/* These declarations are missing from Embree headers when compiling with a compiler that doesn't
* support SYCL. */
extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
# endif
CCL_NAMESPACE_BEGIN
static std::vector<sycl::device> available_sycl_devices();
static int parse_driver_build_version(const sycl::device &device);
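/* Error callback registered with the oneAPI kernel layer (see oneapi_set_error_cb in the
* constructor below); it stores the reported message in the string passed as user data. */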
static void queue_error_cb(const char *message, void *user_ptr)
{
if (user_ptr) {
*reinterpret_cast<std::string *>(user_ptr) = message;
}
}
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
: GPUDevice(info, stats, profiler, headless),
device_queue_(nullptr),
# ifdef WITH_EMBREE_GPU
embree_device(nullptr),
embree_scene(nullptr),
# endif
kg_memory_(nullptr),
kg_memory_device_(nullptr),
kg_memory_size_(0)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(void *));
static_assert(sizeof(arrayMemObject) == sizeof(void *));
use_hardware_raytracing = info.use_hardware_raytracing;
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
bool is_finished_ok = create_queue(device_queue_,
info.num,
# ifdef WITH_EMBREE_GPU
use_hardware_raytracing ? &embree_device : nullptr
# else
nullptr
# endif
);
if (is_finished_ok == false) {
set_error("oneAPI queue initialization error: got runtime exception \"" +
oneapi_error_string_ + "\"");
}
else {
VLOG_DEBUG << "oneAPI queue has been successfully created for the device \""
<< info.description << "\"";
assert(device_queue_);
}
# ifdef WITH_EMBREE_GPU
use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
# else
use_hardware_raytracing = false;
# endif
if (use_hardware_raytracing) {
VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
}
size_t globals_segment_size;
is_finished_ok = kernel_globals_size(globals_segment_size);
if (is_finished_ok == false) {
set_error("oneAPI constant memory initialization got runtime exception \"" +
oneapi_error_string_ + "\"");
}
else {
VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
}
kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);
kg_memory_size_ = globals_segment_size;
max_memory_on_device_ = get_memcapacity();
init_host_memory();
move_texture_to_host = false;
can_map_host = true;
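/* CYCLES_ONEAPI_MEMORY_HEADROOM, if set, overrides the default memory headroom; the value is
* interpreted as a size in bytes. */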
const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
if (headroom_str != nullptr) {
const long long override_headroom = atoll(headroom_str);
device_working_headroom = override_headroom;
device_texture_headroom = override_headroom;
}
VLOG_DEBUG << "oneAPI memory headroom size: "
<< string_human_readable_size(device_working_headroom);
}
OneapiDevice::~OneapiDevice()
{
# ifdef WITH_EMBREE_GPU
if (embree_device)
rtcReleaseDevice(embree_device);
# endif
texture_info.free();
usm_free(device_queue_, kg_memory_);
usm_free(device_queue_, kg_memory_device_);
for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
delete mt->second;
if (device_queue_)
free_queue(device_queue_);
}
bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
{
return false;
}
bool OneapiDevice::can_use_hardware_raytracing_for_features(uint requested_features) const
{
/* MNEE and Ray-trace kernels work correctly with Hardware Ray-tracing starting with Embree 4.1.
*/
# if defined(RTC_VERSION) && RTC_VERSION < 40100
return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE));
# else
(void)requested_features;
return true;
# endif
}
BVHLayoutMask OneapiDevice::get_bvh_layout_mask(uint requested_features) const
{
return (use_hardware_raytracing &&
can_use_hardware_raytracing_for_features(requested_features)) ?
BVH_LAYOUT_EMBREEGPU :
BVH_LAYOUT_BVH2;
}
# ifdef WITH_EMBREE_GPU
void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREEGPU) {
BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
if (refit) {
bvh_embree->refit(progress);
}
else {
bvh_embree->build(progress, &stats, embree_device, true);
}
# if RTC_VERSION >= 40302
thread_scoped_lock lock(scene_data_mutex);
all_embree_scenes.push_back(bvh_embree->scene);
# endif
if (bvh->params.top_level) {
embree_scene = bvh_embree->scene;
# if RTC_VERSION >= 40302
if (bvh_embree->offload_scenes_to_gpu(all_embree_scenes) == false) {
set_error(
string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
bvh_embree->get_last_error_message()));
}
all_embree_scenes.clear();
# endif
}
}
else {
Device::build_bvh(bvh, progress, refit);
}
}
# endif
size_t OneapiDevice::get_free_mem() const
{
/* Accurate: Use device info, which is practically useful only on dGPU.
* This is because for non-discrete GPUs, all GPU memory allocations end up
* in RAM, giving the same performance for device and host pointers,
* so there is no need to be very accurate about what ends up where. */
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
const bool is_integrated_gpu = device.get_info<sycl::info::device::host_unified_memory>();
if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu == false) {
return device.get_info<sycl::ext::intel::info::device::free_memory>();
}
/* Estimate: Capacity - in use. */
else if (device_mem_in_use < max_memory_on_device_) {
return max_memory_on_device_ - device_mem_in_use;
}
else {
return 0;
}
}
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
kernel_features = requested_features;
bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
if (is_finished_ok == false) {
set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
"\"");
return false;
}
else {
VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\"";
assert(device_queue_);
}
if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
VLOG_INFO
<< "Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
use_hardware_raytracing = false;
}
is_finished_ok = oneapi_load_kernels(
device_queue_, (const unsigned int)requested_features, use_hardware_raytracing);
if (is_finished_ok == false) {
set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
}
else {
VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
}
if (is_finished_ok) {
reserve_private_memory(requested_features);
is_finished_ok = !have_error();
}
return is_finished_ok;
}
void OneapiDevice::reserve_private_memory(const uint kernel_features)
{
size_t free_before = get_free_mem();
/* Use the biggest kernel for estimation. */
const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
(kernel_features & KERNEL_FEATURE_MNEE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
{
unique_ptr<DeviceQueue> queue = gpu_queue_create();
device_ptr d_path_index = 0;
device_ptr d_render_buffer = 0;
int d_work_size = 0;
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
queue->init_execution();
/* Launch of the kernel seems to be sufficient to reserve all
* needed memory regardless of the execution global size.
* So, the smallest possible size is used here. */
queue->enqueue(test_kernel, 1, args);
queue->synchronize();
}
size_t free_after = get_free_mem();
VLOG_INFO << "Reserved " << string_human_readable_number(free_before - free_after)
<< " bytes for kernel execution. ("
<< string_human_readable_size(free_before - free_after) << ")";
}
void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
{
free = get_free_mem();
total = max_memory_on_device_;
}
bool OneapiDevice::alloc_device(void *&device_pointer, size_t size)
{
bool allocation_success = false;
device_pointer = usm_alloc_device(device_queue_, size);
if (device_pointer != nullptr) {
allocation_success = true;
/* Due to lazy memory initialization in the GPU runtime, we force the memory to
* appear in device memory by executing a kernel that uses it. */
if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
"\"");
usm_free(device_queue_, device_pointer);
device_pointer = nullptr;
allocation_success = false;
}
}
return allocation_success;
}
void OneapiDevice::free_device(void *device_pointer)
{
usm_free(device_queue_, device_pointer);
}
bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size)
{
shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
return shared_pointer != nullptr;
}
void OneapiDevice::free_host(void *shared_pointer)
{
usm_free(device_queue_, shared_pointer);
}
void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
/* Device and host pointer are in the same address space
* as we're using Unified Shared Memory. */
device_pointer = shared_pointer;
}
void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
usm_memcpy(device_queue_, device_pointer, host_pointer, size);
}
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
SyclQueue *OneapiDevice::sycl_queue()
{
return device_queue_;
}
string OneapiDevice::oneapi_error_message()
{
return string(oneapi_error_string_);
}
int OneapiDevice::scene_max_shaders()
{
return scene_max_shaders_;
}
void *OneapiDevice::kernel_globals_device_pointer()
{
return kg_memory_device_;
}
void OneapiDevice::mem_alloc(device_memory &mem)
{
if (mem.type == MEM_TEXTURE) {
assert(!"mem_alloc not supported for textures.");
}
else if (mem.type == MEM_GLOBAL) {
assert(!"mem_alloc not supported for global memory.");
}
else {
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
}
generic_alloc(mem);
}
}
void OneapiDevice::mem_copy_to(device_memory &mem)
{
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
}
/* After getting runtime errors we need to avoid performing oneAPI runtime operations
* because the associated GPU context may be in an invalid state at this point. */
if (have_error()) {
return;
}
if (mem.type == MEM_GLOBAL) {
global_free(mem);
global_alloc(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_free((device_texture &)mem);
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer)
generic_alloc(mem);
generic_copy_to(mem);
}
}
void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
{
if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
assert(!"mem_copy_from not supported for textures.");
}
else if (mem.host_pointer) {
const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
const size_t offset = elem * y * w;
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ") from offset " << offset
<< " data " << size << " bytes";
}
/* After getting runtime errors we need to avoid performing oneAPI runtime operations
* because the associated GPU context may be in an invalid state at this point. */
if (have_error()) {
return;
}
assert(device_queue_);
assert(size != 0);
if (mem.device_pointer) {
char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
if (is_finished_ok == false) {
set_error("oneAPI memory operation error: got runtime exception \"" +
oneapi_error_string_ + "\"");
}
}
}
}
void OneapiDevice::mem_zero(device_memory &mem)
{
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")\n";
}
/* After getting runtime errors we need to avoid performing oneAPI runtime operations
* because the associated GPU context may be in an invalid state at this point. */
if (have_error()) {
return;
}
if (!mem.device_pointer) {
mem_alloc(mem);
}
if (!mem.device_pointer) {
return;
}
assert(device_queue_);
bool is_finished_ok = usm_memset(
device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
if (is_finished_ok == false) {
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
"\"");
}
}
void OneapiDevice::mem_free(device_memory &mem)
{
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", "
<< string_human_readable_number(mem.device_size) << " bytes. ("
<< string_human_readable_size(mem.device_size) << ")\n";
}
if (mem.type == MEM_GLOBAL) {
global_free(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_free((device_texture &)mem);
}
else {
generic_free(mem);
}
}
device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
{
return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
mem.memory_elements_size(offset));
}
void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
{
assert(name);
VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object "
<< string_human_readable_number(size) << " bytes. ("
<< string_human_readable_size(size) << ")";
# ifdef WITH_EMBREE_GPU
if (embree_scene != nullptr && strcmp(name, "data") == 0) {
assert(size <= sizeof(KernelData));
/* Update the scene handle (it is different for each device in a multi-device setup). */
KernelData *const data = (KernelData *)host;
data->device_bvh = embree_scene;
/* We need this number later for proper local memory allocation. */
scene_max_shaders_ = data->max_shaders;
}
# endif
ConstMemMap::iterator i = const_mem_map_.find(name);
device_vector<uchar> *data;
if (i == const_mem_map_.end()) {
data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
data->alloc(size);
const_mem_map_.insert(ConstMemMap::value_type(name, data));
}
else {
data = i->second;
}
assert(data->memory_size() <= size);
memcpy(data->data(), host, size);
data->copy_to_device();
set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}
void OneapiDevice::global_alloc(device_memory &mem)
{
assert(mem.name);
size_t size = mem.memory_size();
VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
<< string_human_readable_number(size) << " bytes. ("
<< string_human_readable_size(size) << ")";
generic_alloc(mem);
generic_copy_to(mem);
set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}
void OneapiDevice::global_free(device_memory &mem)
{
if (mem.device_pointer) {
generic_free(mem);
}
}
void OneapiDevice::tex_alloc(device_texture &mem)
{
generic_alloc(mem);
generic_copy_to(mem);
/* Resize if needed. Also, when resizing, allocate in advance for future allocations. */
const uint slot = mem.slot;
if (slot >= texture_info.size()) {
texture_info.resize(slot + 128);
}
texture_info[slot] = mem.info;
need_texture_info = true;
texture_info[slot].data = (uint64_t)mem.device_pointer;
}
void OneapiDevice::tex_free(device_texture &mem)
{
/* There is no texture memory in SYCL. */
if (mem.device_pointer) {
generic_free(mem);
}
}
unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
{
return make_unique<OneapiDeviceQueue>(this);
}
bool OneapiDevice::should_use_graphics_interop()
{
/* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
* return false. */
return false;
}
void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
{
assert(device_queue_);
return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
}
void OneapiDevice::usm_free(void *usm_ptr)
{
assert(device_queue_);
return usm_free(device_queue_, usm_ptr);
}
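/* Debug-build sanity check: assert that a pointer used in a memory operation has the USM
* allocation type expected for this backend (device memory by default, host memory for CPU
* devices, host-task builds, or when explicitly allowed). */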
void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
# ifndef NDEBUG
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
sycl::info::device_type device_type =
queue->get_device().get_info<sycl::info::device::device_type>();
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
# else
const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
# endif
assert(usm_type == main_memory_type ||
(usm_type == sycl::usm::alloc::host &&
(allow_host || device_type == sycl::info::device_type::cpu)) ||
usm_type == sycl::usm::alloc::unknown);
# else
/* Silence warning about unused arguments. */
(void)queue_;
(void)usm_ptr;
(void)allow_host;
# endif
}
bool OneapiDevice::create_queue(SyclQueue *&external_queue,
int device_index,
void *embree_device_pointer)
{
bool finished_correct = true;
try {
std::vector<sycl::device> devices = available_sycl_devices();
if (device_index < 0 || device_index >= devices.size()) {
return false;
}
sycl::queue *created_queue = new sycl::queue(devices[device_index],
sycl::property::queue::in_order());
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
# ifdef WITH_EMBREE_GPU
if (embree_device_pointer) {
*((RTCDevice *)embree_device_pointer) = rtcNewSYCLDevice(created_queue->get_context(), "");
}
# else
(void)embree_device_pointer;
# endif
}
catch (sycl::exception const &e) {
finished_correct = false;
oneapi_error_string_ = e.what();
}
return finished_correct;
}
void OneapiDevice::free_queue(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
delete queue;
}
void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}
void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
* and shared. For a new project it could be more beneficial to use USM shared memory, because
* its automatic migration mechanism allows the same pointer to be used on host and device
* without explicit memory transfer operations, although USM shared memory implies some
* documented limitations regarding parallel access from different threads. For Blender/Cycles
* this type of memory is not a good fit for the current application architecture: Cycles is a
* multi-threaded application that already uses two different pointers for host and device
* activity, and also has to perform all needed memory transfer operations itself. So the USM
* device memory type is used for the oneAPI device, as it fits the Cycles architecture better. */
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
return sycl::malloc_device(memory_size, *queue);
# else
return sycl::malloc_host(memory_size, *queue);
# endif
}
void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
sycl::free(usm_ptr, *queue);
}
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, dest, true);
OneapiDevice::check_usm(queue_, src, true);
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
/* Unknown here means that this is not a USM allocation, which implies that this is
* some generic C++ allocation, so we can use C++ memcpy directly with USM host. */
if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
(src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
{
memcpy(dest, src, num_bytes);
return true;
}
try {
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
/* NOTE(@nsirgien) Waiting on a memory operation may give more precise error
* messages. Due to its impact on occupancy, it makes sense to enable it only for Cycles debug
* builds. */
mem_event.wait_and_throw();
return true;
# else
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
src_type == sycl::usm::alloc::unknown;
/* NOTE(@sirgienko) A host-side blocking wait on this operation is mandatory; otherwise the host
* may use the memory before the transfer has finished.
*/
if (from_device_to_host || host_or_device_memop_with_offset)
mem_event.wait();
return true;
# endif
}
catch (sycl::exception const &e) {
oneapi_error_string_ = e.what();
return false;
}
}
bool OneapiDevice::usm_memset(SyclQueue *queue_,
void *usm_ptr,
unsigned char value,
size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memset may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
try {
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
# ifdef WITH_CYCLES_DEBUG
/* NOTE(@nsirgien) Waiting on a memory operation may give more precise error
* messages. Due to its impact on occupancy, it makes sense to enable it only for Cycles debug
* builds. */
mem_event.wait_and_throw();
# else
(void)mem_event;
# endif
return true;
}
catch (sycl::exception const &e) {
oneapi_error_string_ = e.what();
return false;
}
}
bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
try {
queue->wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
oneapi_error_string_ = e.what();
return false;
}
}
bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size)
{
kernel_global_size = sizeof(KernelGlobalsGPU);
return true;
}
void OneapiDevice::set_global_memory(SyclQueue *queue_,
void *kernel_globals,
const char *memory_name,
void *memory_device_pointer)
{
assert(queue_);
assert(kernel_globals);
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
OneapiDevice::check_usm(queue_, memory_device_pointer, true);
OneapiDevice::check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
/* This macro changes the corresponding global pointer of KernelGlobals via name matching. */
# define KERNEL_DATA_ARRAY(type, name) \
else if (#name == matched_name) { \
globals->__##name = (type *)memory_device_pointer; \
return; \
}
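/* For example, KERNEL_DATA_ARRAY(KernelData, data) expands to:
*   else if ("data" == matched_name) {
*     globals->__data = (KernelData *)memory_device_pointer;
*     return;
*   }
*/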
if (false) {
}
else if ("integrator_state" == matched_name) {
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
return;
}
KERNEL_DATA_ARRAY(KernelData, data)
# include "kernel/data_arrays.h"
else {
std::cerr << "Cannot find global/constant memory with name \"" << matched_name << "\"!"
<< std::endl;
assert(false);
}
# undef KERNEL_DATA_ARRAY
}
bool OneapiDevice::enqueue_kernel(
KernelContext *kernel_context, int kernel, size_t global_size, size_t local_size, void **args)
{
return oneapi_enqueue_kernel(kernel_context,
kernel,
global_size,
local_size,
kernel_features,
use_hardware_raytracing,
args);
}
void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
const DeviceKernel kernel,
size_t &kernel_global_size,
size_t &kernel_local_size)
{
assert(queue);
const static size_t preferred_work_group_size_intersect = 128;
const static size_t preferred_work_group_size_shading = 256;
const static size_t preferred_work_group_size_shading_simd8 = 64;
/* Shader evaluation kernels seem to use some amount of shared memory, so it is better
* to avoid maximum work group sizes for them. */
const static size_t preferred_work_group_size_shader_evaluation = 256;
/* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
* for now their work-group size is restricted to 512. */
const static size_t preferred_work_group_size_cryptomatte = 512;
const static size_t preferred_work_group_size_default = 1024;
const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
const size_t max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();
size_t preferred_work_group_size = 0;
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
preferred_work_group_size = preferred_work_group_size_intersect;
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT: {
const bool device_is_simd8 =
(device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() == 8);
preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 :
preferred_work_group_size_shading;
} break;
case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
preferred_work_group_size = preferred_work_group_size_cryptomatte;
break;
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
preferred_work_group_size = preferred_work_group_size_shader_evaluation;
break;
default:
/* Do nothing and keep initial zero value. */
break;
}
/* This order of checks allows us to override the Blender default values if needed,
* yet respect them otherwise. */
if (preferred_work_group_size == 0) {
preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel);
}
/* If there is no recommendation, then use manual default value. */
if (preferred_work_group_size == 0) {
preferred_work_group_size = preferred_work_group_size_default;
}
kernel_local_size = std::min(max_work_group_size, preferred_work_group_size);
/* NOTE(@nsirgien): As non-uniform work-groups currently don't work on most oneAPI devices,
* we extend the work size to fit the uniformity requirement. */
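/* For example, a requested global size of 1000 with a local size of 128 is rounded up to 1024. */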
kernel_global_size = round_up(kernel_global_size, kernel_local_size);
# ifdef WITH_ONEAPI_SYCL_HOST_TASK
/* Kernels listed below need a specific number of work groups. */
if (kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
{
/* Path array implementation is serial in case of SYCL Host Task execution. */
kernel_global_size = 1;
kernel_local_size = 1;
}
# endif
assert(kernel_global_size % kernel_local_size == 0);
}
/* Compute-runtime (i.e. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
static const int lowest_supported_driver_version_win = 1015518;
# ifdef _WIN32
/* For Windows driver 101.5518, compute-runtime version is 28044.
* This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
static const int lowest_supported_driver_version_neo = 29283;
# else
static const int lowest_supported_driver_version_neo = 27642;
# endif
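/* Extracts the driver build number from the version string: for an L0-style version such as
* "1.3.27642" this yields 27642, while a Windows-format string such as "31.0.101.5518" yields
* 101 * 10000 + 5518 = 1015518. The example strings are illustrative; the accepted formats are
* the ones listed in the error message below. */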
int parse_driver_build_version(const sycl::device &device)
{
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
int driver_build_version = 0;
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
if (second_dot_position == std::string::npos) {
std::cerr << "Unable to parse Intel GPU driver version \"" << driver_version
<< "\": it does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
else {
try {
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
if (third_dot_position != std::string::npos) {
const std::string &third_number_substr = driver_version.substr(
second_dot_position + 1, third_dot_position - second_dot_position - 1);
const std::string &fourth_number_substr = driver_version.substr(third_dot_position + 1);
if (third_number_substr.length() == 3 && fourth_number_substr.length() == 4)
driver_build_version = std::stoi(third_number_substr) * 10000 +
std::stoi(fourth_number_substr);
}
else {
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
driver_build_version = std::stoi(third_number_substr);
}
}
catch (std::invalid_argument &) {
std::cerr << "Unable to parse Intel GPU driver version \"" << driver_version
<< "\": it does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
}
return driver_build_version;
}
std::vector<sycl::device> available_sycl_devices()
{
bool allow_all_devices = false;
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
allow_all_devices = true;
}
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
std::vector<sycl::device> available_devices;
for (const sycl::platform &platform : oneapi_platforms) {
/* Ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. */
if (platform.get_backend() == sycl::backend::opencl) {
continue;
}
const std::vector<sycl::device> &oneapi_devices =
(allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
bool filter_out = false;
if (!allow_all_devices) {
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
* Official support can be broadened to older and smaller GPUs once ready. */
if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
filter_out = true;
}
else {
/* Filtered-out defaults in case these values aren't available. */
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
}
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
threads_per_eu =
device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
}
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
if (number_of_eus <= 96 && threads_per_eu == 7) {
filter_out = true;
}
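/* For example, a device reporting 96 EUs and 7 threads per EU is filtered out here, while one
* reporting more EUs or a different thread count is kept (subject to the driver version check
* below). */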
/* If not already filtered out, check the driver version. */
bool check_driver_version = !filter_out;
/* We don't know how to check driver version strings for non-Intel GPUs. */
if (check_driver_version &&
device.get_info<sycl::info::device::vendor>().find("Intel") == std::string::npos)
{
check_driver_version = false;
}
/* Because of https://github.com/oneapi-src/unified-runtime/issues/1777, future drivers
* may break the parsing done by a SYCL runtime from before the fix expected in major
* version 8: the parsed driver version would then start with something other than the
* current "1.3." prefix. To avoid blocking a device by mistake in the case of a new driver
* with an old SYCL runtime, we disable the driver version check when __LIBSYCL_MAJOR_VERSION
* is below 8 and the actual driver version doesn't start with "1.3.". */
# if __LIBSYCL_MAJOR_VERSION < 8
if (check_driver_version &&
!string_startswith(device.get_info<sycl::info::device::driver_version>(), "1.3."))
{
check_driver_version = false;
}
# endif
if (check_driver_version) {
int driver_build_version = parse_driver_build_version(device);
if ((driver_build_version > 100000 &&
driver_build_version < lowest_supported_driver_version_win) ||
driver_build_version < lowest_supported_driver_version_neo)
{
filter_out = true;
}
}
}
}
if (!filter_out) {
available_devices.push_back(device);
}
}
}
return available_devices;
}
char *OneapiDevice::device_capabilities()
{
std::stringstream capabilities;
const std::vector<sycl::device> &oneapi_devices = available_sycl_devices();
for (const sycl::device &device : oneapi_devices) {
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
const std::string &name = device.get_info<sycl::info::device::name>();
# else
const std::string &name = "SYCL Host Task (Debug)";
# endif
capabilities << std::string("\t") << name << "\n";
capabilities << "\t\tsycl::info::platform::name\t\t\t"
<< device.get_platform().get_info<sycl::info::platform::name>() << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
<< "\n";
# define GET_ATTR(attribute) \
{ \
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" \
<< device.get_info<sycl::info::device ::attribute>() << "\n"; \
}
# define GET_INTEL_ATTR(attribute) \
{ \
if (device.has(sycl::aspect::ext_intel_##attribute)) { \
capabilities << "\t\tsycl::ext::intel::info::device::" #attribute "\t\t\t" \
<< device.get_info<sycl::ext::intel::info::device ::attribute>() << "\n"; \
} \
}
# define GET_ASPECT(aspect_) \
{ \
capabilities << "\t\tdevice::has(" #aspect_ ")\t\t\t" << device.has(sycl::aspect ::aspect_) \
<< "\n"; \
}
GET_ATTR(vendor)
GET_ATTR(driver_version)
GET_ATTR(max_compute_units)
GET_ATTR(max_clock_frequency)
GET_ATTR(global_mem_size)
GET_INTEL_ATTR(pci_address)
GET_INTEL_ATTR(gpu_eu_simd_width)
GET_INTEL_ATTR(gpu_eu_count)
GET_INTEL_ATTR(gpu_slices)
GET_INTEL_ATTR(gpu_subslices_per_slice)
GET_INTEL_ATTR(gpu_eu_count_per_subslice)
GET_INTEL_ATTR(gpu_hw_threads_per_eu)
GET_INTEL_ATTR(max_mem_bandwidth)
GET_ATTR(max_work_group_size)
GET_ATTR(max_work_item_dimensions)
sycl::id<3> max_work_item_sizes =
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
WRITE_ATTR(max_work_item_sizes[0], max_work_item_sizes.get(0))
WRITE_ATTR(max_work_item_sizes[1], max_work_item_sizes.get(1))
WRITE_ATTR(max_work_item_sizes[2], max_work_item_sizes.get(2))
GET_ATTR(max_num_sub_groups)
for (size_t sub_group_size : device.get_info<sycl::info::device::sub_group_sizes>()) {
WRITE_ATTR(sub_group_size[], sub_group_size)
}
GET_ATTR(sub_group_independent_forward_progress)
GET_ATTR(preferred_vector_width_char)
GET_ATTR(preferred_vector_width_short)
GET_ATTR(preferred_vector_width_int)
GET_ATTR(preferred_vector_width_long)
GET_ATTR(preferred_vector_width_float)
GET_ATTR(preferred_vector_width_double)
GET_ATTR(preferred_vector_width_half)
GET_ATTR(address_bits)
GET_ATTR(max_mem_alloc_size)
GET_ATTR(mem_base_addr_align)
GET_ATTR(error_correction_support)
GET_ATTR(is_available)
GET_ASPECT(cpu)
GET_ASPECT(gpu)
GET_ASPECT(fp16)
GET_ASPECT(atomic64)
GET_ASPECT(usm_host_allocations)
GET_ASPECT(usm_device_allocations)
GET_ASPECT(usm_shared_allocations)
GET_ASPECT(usm_system_allocations)
# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_non_uniform_groups__
GET_ASPECT(ext_oneapi_non_uniform_groups)
# endif
# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images__
GET_ASPECT(ext_oneapi_bindless_images)
# endif
# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__
GET_ASPECT(ext_oneapi_interop_semaphore_import)
# endif
# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_export__
GET_ASPECT(ext_oneapi_interop_semaphore_export)
# endif
# undef GET_INTEL_ATTR
# undef GET_ASPECT
# undef GET_ATTR
# undef WRITE_ATTR
capabilities << "\n";
}
return ::strdup(capabilities.str().c_str());
}
void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
int num = 0;
std::vector<sycl::device> devices = available_sycl_devices();
for (sycl::device &device : devices) {
const std::string &platform_name =
device.get_platform().get_info<sycl::info::platform::name>();
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
std::string name = device.get_info<sycl::info::device::name>();
# else
std::string name = "SYCL Host Task (Debug)";
# endif
# ifdef WITH_EMBREE_GPU
bool hwrt_support = rtcIsSYCLDeviceSupported(device);
# else
bool hwrt_support = false;
# endif
# if defined(WITH_OPENIMAGEDENOISE) && OIDN_VERSION >= 20300
bool oidn_support = oidnIsSYCLDeviceSupported(&device);
# else
bool oidn_support = false;
# endif
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, hwrt_support, oidn_support, user_ptr);
num++;
}
}
size_t OneapiDevice::get_memcapacity()
{
return reinterpret_cast<sycl::queue *>(device_queue_)
->get_device()
.get_info<sycl::info::device::global_mem_size>();
}
int OneapiDevice::get_num_multiprocessors()
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
}
else
return 0;
}
int OneapiDevice::get_max_num_threads_per_multiprocessor()
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu))
{
return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
}
else
return 0;
}
CCL_NAMESPACE_END
#endif