Copies between a device and non-USM host memory carry a large overhead. Using the prepare/release API avoids it, as described in the optimization guide: https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/optimizing-data-transfers.html

This currently translates to a 4-5% overall rendering speedup on my Arc B580 in most scenes.

Pull Request: https://projects.blender.org/blender/blender/pulls/132859
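For context, here is a minimal standalone sketch of the prepare/release pattern this change relies on (not the Cycles code itself; buffer sizes and names are illustrative, and it assumes a DPC++ compiler that defines `SYCL_EXT_ONEAPI_COPY_OPTIMIZE`). Registering an ordinary host allocation with the SYCL runtime lets Level Zero treat it like pinned memory, so `queue.memcpy()` to and from it avoids the slow non-USM path:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main()
{
  sycl::queue q{sycl::gpu_selector_v, sycl::property::queue::in_order()};

  std::vector<float> host(1 << 20); /* Ordinary, non-USM host memory. */
  float *device = sycl::malloc_device<float>(host.size(), q);

#ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
  /* Register the host range once, right after allocation. */
  sycl::ext::oneapi::experimental::prepare_for_device_copy(
      host.data(), host.size() * sizeof(float), q);
#endif

  /* Transfers now take the fast path. */
  q.memcpy(device, host.data(), host.size() * sizeof(float)).wait();

#ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
  /* Unregister before the host memory is freed. */
  sycl::ext::oneapi::experimental::release_from_device_copy(host.data(), q);
#endif

  sycl::free(device, q);
  return 0;
}
```

The patch below applies the same pair of calls in `OneapiDevice::mem_alloc` and `OneapiDevice::mem_free`.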
/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_ONEAPI

/* <algorithm> is needed until included upstream in sycl/detail/property_list_base.hpp */
#  include <algorithm>
#  include <sycl/sycl.hpp>

#  include "device/oneapi/device_impl.h"

#  include "util/log.h"

#  ifdef WITH_EMBREE_GPU
#    include "bvh/embree.h"
#  endif

#  if defined(WITH_OPENIMAGEDENOISE)
#    include <OpenImageDenoise/config.h>
#    if OIDN_VERSION >= 20300
#      include "util/openimagedenoise.h" // IWYU pragma: keep
#    endif
#  endif

#  include "kernel/device/oneapi/globals.h"
#  include "kernel/device/oneapi/kernel.h"

#  if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
/* These declarations are missing from embree headers when compiling from a compiler that doesn't
 * support SYCL. */
extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
#  endif

CCL_NAMESPACE_BEGIN

static std::vector<sycl::device> available_sycl_devices();
static int parse_driver_build_version(const sycl::device &device);

static void queue_error_cb(const char *message, void *user_ptr)
{
  if (user_ptr) {
    *reinterpret_cast<std::string *>(user_ptr) = message;
  }
}

OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : GPUDevice(info, stats, profiler, headless),
      device_queue_(nullptr),
#  ifdef WITH_EMBREE_GPU
      embree_device(nullptr),
      embree_scene(nullptr),
#  endif
      kg_memory_(nullptr),
      kg_memory_device_(nullptr),
      kg_memory_size_(0)
{
  /* Verify that base class types can be used with specific backend types */
  static_assert(sizeof(texMemObject) == sizeof(void *));
  static_assert(sizeof(arrayMemObject) == sizeof(void *));

  use_hardware_raytracing = info.use_hardware_raytracing;

  oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);

  bool is_finished_ok = create_queue(device_queue_,
                                     info.num,
#  ifdef WITH_EMBREE_GPU
                                     use_hardware_raytracing ? (void *)&embree_device : nullptr
#  else
                                     nullptr
#  endif
  );

  if (is_finished_ok == false) {
    set_error("oneAPI queue initialization error: got runtime exception \"" +
              oneapi_error_string_ + "\"");
  }
  else {
    VLOG_DEBUG << "oneAPI queue has been successfully created for the device \""
               << info.description << "\"";
    assert(device_queue_);
  }

#  ifdef WITH_EMBREE_GPU
  use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
#  else
  use_hardware_raytracing = false;
#  endif

  if (use_hardware_raytracing) {
    VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
  }

  size_t globals_segment_size;
  is_finished_ok = kernel_globals_size(globals_segment_size);
  if (is_finished_ok == false) {
    set_error("oneAPI constant memory initialization got runtime exception \"" +
              oneapi_error_string_ + "\"");
  }
  else {
    VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
  }

  kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
  usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);

  kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);

  kg_memory_size_ = globals_segment_size;

  max_memory_on_device_ = get_memcapacity();
  init_host_memory();
  move_texture_to_host = false;
  can_map_host = true;

  const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
  if (headroom_str != nullptr) {
    const long long override_headroom = (float)atoll(headroom_str);
    device_working_headroom = override_headroom;
    device_texture_headroom = override_headroom;
  }
  VLOG_DEBUG << "oneAPI memory headroom size: "
             << string_human_readable_size(device_working_headroom);
}

OneapiDevice::~OneapiDevice()
{
#  ifdef WITH_EMBREE_GPU
  if (embree_device) {
    rtcReleaseDevice(embree_device);
  }
#  endif

  texture_info.free();
  usm_free(device_queue_, kg_memory_);
  usm_free(device_queue_, kg_memory_device_);

  const_mem_map_.clear();

  if (device_queue_) {
    free_queue(device_queue_);
  }
}

bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
{
  return false;
}

bool OneapiDevice::can_use_hardware_raytracing_for_features(const uint requested_features) const
{
  /* MNEE and Ray-trace kernels work correctly with Hardware Ray-tracing starting with Embree 4.1.
   */
#  if defined(RTC_VERSION) && RTC_VERSION < 40100
  return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE));
#  else
  (void)requested_features;
  return true;
#  endif
}

BVHLayoutMask OneapiDevice::get_bvh_layout_mask(const uint requested_features) const
{
  return (use_hardware_raytracing &&
          can_use_hardware_raytracing_for_features(requested_features)) ?
             BVH_LAYOUT_EMBREEGPU :
             BVH_LAYOUT_BVH2;
}

#  ifdef WITH_EMBREE_GPU
void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
  if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREEGPU) {
    BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
    if (refit) {
      bvh_embree->refit(progress);
    }
    else {
      bvh_embree->build(progress, &stats, embree_device, true);
    }

#    if RTC_VERSION >= 40302
    thread_scoped_lock lock(scene_data_mutex);
    all_embree_scenes.push_back(bvh_embree->scene);
#    endif

    if (bvh->params.top_level) {
      embree_scene = bvh_embree->scene;
#    if RTC_VERSION >= 40302
      RTCError error_code = bvh_embree->offload_scenes_to_gpu(all_embree_scenes);
      if (error_code != RTC_ERROR_NONE) {
        set_error(
            string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
                          bvh_embree->get_error_string(error_code)));
      }
      all_embree_scenes.clear();
#    endif
    }
  }
  else {
    Device::build_bvh(bvh, progress, refit);
  }
}
#  endif

size_t OneapiDevice::get_free_mem() const
{
  /* Accurate: Use device info, which is practically useful only on dGPU.
   * This is because for non-discrete GPUs, all GPU memory allocations would
   * be in the RAM, thus having the same performance for device and host pointers,
   * so there is no need to be very accurate about what would end where. */
  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
  const bool is_integrated_gpu = device.get_info<sycl::info::device::host_unified_memory>();
  if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu == false) {
    return device.get_info<sycl::ext::intel::info::device::free_memory>();
  }
  /* Estimate: Capacity - in use. */
  if (device_mem_in_use < max_memory_on_device_) {
    return max_memory_on_device_ - device_mem_in_use;
  }
  return 0;
}

bool OneapiDevice::load_kernels(const uint requested_features)
{
  assert(device_queue_);

  /* Kernel loading is expected to be a cumulative operation; for example, if
   * a device is asked to load kernel A and then kernel B, then after these
   * operations, both A and B should be available for use. So we need to store
   * and use a cumulative mask of the requested kernel features, and not just
   * the latest requested features.
   */
  kernel_features |= requested_features;

  bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
  if (is_finished_ok == false) {
    set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
              "\"");
    return false;
  }
  VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\"";
  assert(device_queue_);

  if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
    VLOG_INFO
        << "Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
    use_hardware_raytracing = false;
  }

  is_finished_ok = oneapi_load_kernels(
      device_queue_, (const unsigned int)requested_features, use_hardware_raytracing);
  if (is_finished_ok == false) {
    set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
  }
  else {
    VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
  }

  if (is_finished_ok) {
    reserve_private_memory(requested_features);
    is_finished_ok = !have_error();
  }

  return is_finished_ok;
}

void OneapiDevice::reserve_private_memory(const uint kernel_features)
{
  size_t free_before = get_free_mem();

  /* Use the biggest kernel for estimation. */
  const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                   (kernel_features & KERNEL_FEATURE_MNEE) ?
                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

  {
    unique_ptr<DeviceQueue> queue = gpu_queue_create();

    device_ptr d_path_index = 0;
    device_ptr d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);

    queue->init_execution();
    /* Launch of the kernel seems to be sufficient to reserve all
     * needed memory regardless of the execution global size.
     * So, the smallest possible size is used here. */
    queue->enqueue(test_kernel, 1, args);
    queue->synchronize();
  }

  size_t free_after = get_free_mem();

  VLOG_INFO << "For kernel execution were reserved "
            << string_human_readable_number(free_before - free_after) << " bytes. ("
            << string_human_readable_size(free_before - free_after) << ")";
}

void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
{
  free = get_free_mem();
  total = max_memory_on_device_;
}

bool OneapiDevice::alloc_device(void *&device_pointer, const size_t size)
{
  bool allocation_success = false;
  device_pointer = usm_alloc_device(device_queue_, size);
  if (device_pointer != nullptr) {
    allocation_success = true;
    /* Due to lazy memory initialization in GPU runtime we will force memory to
     * appear in device memory via execution of a kernel using this memory. */
    if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
      set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
                "\"");
      usm_free(device_queue_, device_pointer);

      device_pointer = nullptr;
      allocation_success = false;
    }
  }

  return allocation_success;
}

void OneapiDevice::free_device(void *device_pointer)
{
  usm_free(device_queue_, device_pointer);
}

bool OneapiDevice::alloc_host(void *&shared_pointer, const size_t size)
{
  shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
  return shared_pointer != nullptr;
}

void OneapiDevice::free_host(void *shared_pointer)
{
  usm_free(device_queue_, shared_pointer);
}

void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
  /* Device and host pointer are in the same address space
   * as we're using Unified Shared Memory. */
  device_pointer = shared_pointer;
}

void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
{
  usm_memcpy(device_queue_, device_pointer, host_pointer, size);
}

/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
SyclQueue *OneapiDevice::sycl_queue()
{
  return device_queue_;
}

string OneapiDevice::oneapi_error_message()
{
  return string(oneapi_error_string_);
}

int OneapiDevice::scene_max_shaders()
{
  return scene_max_shaders_;
}

void *OneapiDevice::kernel_globals_device_pointer()
{
  return kg_memory_device_;
}

void OneapiDevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    if (mem.name) {
      VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
                 << string_human_readable_size(mem.memory_size()) << ")";
    }
    generic_alloc(mem);
#  ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
    /* Import host_pointer into USM memory for faster host<->device data transfers. */
    if (mem.type == MEM_READ_WRITE || mem.type == MEM_READ_ONLY) {
      sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
      sycl::ext::oneapi::experimental::prepare_for_device_copy(
          mem.host_pointer, mem.memory_size(), *queue);
    }
#  endif
  }
}

void OneapiDevice::mem_copy_to(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
               << string_human_readable_number(mem.memory_size()) << " bytes. ("
               << string_human_readable_size(mem.memory_size()) << ")";
  }

  /* After getting runtime errors we need to avoid performing oneAPI runtime operations
   * because the associated GPU context may be in an invalid state at this point. */
  if (have_error()) {
    return;
  }

  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
    global_alloc(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
    tex_alloc((device_texture &)mem);
  }
  else {
    if (!mem.device_pointer) {
      generic_alloc(mem);
    }

    generic_copy_to(mem);
  }
}

void OneapiDevice::mem_copy_from(
    device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
    const size_t offset = elem * y * w;

    if (mem.name) {
      VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
                 << string_human_readable_size(mem.memory_size()) << ") from offset " << offset
                 << " data " << size << " bytes";
    }

    /* After getting runtime errors we need to avoid performing oneAPI runtime operations
     * because the associated GPU context may be in an invalid state at this point. */
    if (have_error()) {
      return;
    }

    assert(device_queue_);

    assert(size != 0);
    if (mem.device_pointer) {
      char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
      char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
      bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
      if (is_finished_ok == false) {
        set_error("oneAPI memory operation error: got runtime exception \"" +
                  oneapi_error_string_ + "\"");
      }
    }
  }
}

void OneapiDevice::mem_zero(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
               << string_human_readable_number(mem.memory_size()) << " bytes. ("
               << string_human_readable_size(mem.memory_size()) << ")\n";
  }

  /* After getting runtime errors we need to avoid performing oneAPI runtime operations
   * because the associated GPU context may be in an invalid state at this point. */
  if (have_error()) {
    return;
  }

  if (!mem.device_pointer) {
    mem_alloc(mem);
  }
  if (!mem.device_pointer) {
    return;
  }

  assert(device_queue_);
  bool is_finished_ok = usm_memset(
      device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
  if (is_finished_ok == false) {
    set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
              "\"");
  }
}

void OneapiDevice::mem_free(device_memory &mem)
{
  if (mem.name) {
    VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", "
               << string_human_readable_number(mem.device_size) << " bytes. ("
               << string_human_readable_size(mem.device_size) << ")\n";
  }

  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
  }
  else {
#  ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
    if (mem.type == MEM_READ_WRITE || mem.type == MEM_READ_ONLY) {
      sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
      sycl::ext::oneapi::experimental::release_from_device_copy(mem.host_pointer, *queue);
    }
#  endif
    generic_free(mem);
  }
}

device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem,
                                           const size_t offset,
                                           size_t /*size*/)
{
  return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
                                      mem.memory_elements_size(offset));
}

void OneapiDevice::const_copy_to(const char *name, void *host, const size_t size)
{
  assert(name);

  VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object "
             << string_human_readable_number(size) << " bytes. ("
             << string_human_readable_size(size) << ")";

#  ifdef WITH_EMBREE_GPU
  if (embree_scene != nullptr && strcmp(name, "data") == 0) {
    assert(size <= sizeof(KernelData));

    /* Update scene handle(since it is different for each device on multi devices) */
    KernelData *const data = (KernelData *)host;
    data->device_bvh = embree_scene;

    /* We need this number later for proper local memory allocation. */
    scene_max_shaders_ = data->max_shaders;
  }
#  endif

  ConstMemMap::iterator i = const_mem_map_.find(name);
  device_vector<uchar> *data;

  if (i == const_mem_map_.end()) {
    unique_ptr<device_vector<uchar>> data_ptr = make_unique<device_vector<uchar>>(
        this, name, MEM_READ_ONLY);
    data_ptr->alloc(size);
    data = data_ptr.get();
    const_mem_map_.insert(ConstMemMap::value_type(name, std::move(data_ptr)));
  }
  else {
    data = i->second.get();
  }

  assert(data->memory_size() <= size);
  memcpy(data->data(), host, size);
  data->copy_to_device();

  set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);

  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}

void OneapiDevice::global_alloc(device_memory &mem)
{
  assert(mem.name);

  size_t size = mem.memory_size();
  VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
             << string_human_readable_number(size) << " bytes. ("
             << string_human_readable_size(size) << ")";

  generic_alloc(mem);
  generic_copy_to(mem);

  set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);

  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}

void OneapiDevice::global_free(device_memory &mem)
{
  if (mem.device_pointer) {
    generic_free(mem);
  }
}

void OneapiDevice::tex_alloc(device_texture &mem)
{
  generic_alloc(mem);
  generic_copy_to(mem);

  /* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    texture_info.resize(slot + 128);
  }

  texture_info[slot] = mem.info;
  need_texture_info = true;

  texture_info[slot].data = (uint64_t)mem.device_pointer;
}

void OneapiDevice::tex_free(device_texture &mem)
{
  /* There is no texture memory in SYCL. */
  if (mem.device_pointer) {
    generic_free(mem);
  }
}

unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
{
  return make_unique<OneapiDeviceQueue>(this);
}

bool OneapiDevice::should_use_graphics_interop()
{
  /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
   * return false. */
  return false;
}

void *OneapiDevice::usm_aligned_alloc_host(const size_t memory_size, const size_t alignment)
{
  assert(device_queue_);
  return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
}

void OneapiDevice::usm_free(void *usm_ptr)
{
  assert(device_queue_);
  usm_free(device_queue_, usm_ptr);
}

void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
#  ifndef NDEBUG
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  sycl::info::device_type device_type =
      queue->get_device().get_info<sycl::info::device::device_type>();
  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
  (void)usm_type;
#    ifndef WITH_ONEAPI_SYCL_HOST_TASK
  const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
#    else
  const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
#    endif
  assert(usm_type == main_memory_type ||
         (usm_type == sycl::usm::alloc::host &&
          (allow_host || device_type == sycl::info::device_type::cpu)) ||
         usm_type == sycl::usm::alloc::unknown);
#  else
  /* Silence warning about unused arguments. */
  (void)queue_;
  (void)usm_ptr;
  (void)allow_host;
#  endif
}

bool OneapiDevice::create_queue(SyclQueue *&external_queue,
                                const int device_index,
                                void *embree_device_pointer)
{
  bool finished_correct = true;
  try {
    std::vector<sycl::device> devices = available_sycl_devices();
    if (device_index < 0 || device_index >= devices.size()) {
      return false;
    }
    sycl::queue *created_queue = new sycl::queue(devices[device_index],
                                                 sycl::property::queue::in_order());
    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
#  ifdef WITH_EMBREE_GPU
    if (embree_device_pointer) {
      RTCDevice *device_object_ptr = reinterpret_cast<RTCDevice *>(embree_device_pointer);
      *device_object_ptr = rtcNewSYCLDevice(created_queue->get_context(), "");
      if (*device_object_ptr == nullptr) {
        finished_correct = false;
        oneapi_error_string_ =
            "Hardware Raytracing is not available; please install "
            "\"intel-level-zero-gpu-raytracing\" to enable it or disable Embree on GPU.";
      }
    }
#  else
    (void)embree_device_pointer;
#  endif
  }
  catch (const sycl::exception &e) {
    finished_correct = false;
    oneapi_error_string_ = e.what();
  }
  return finished_correct;
}

void OneapiDevice::free_queue(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  delete queue;
}

void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_,
                                           size_t memory_size,
                                           const size_t alignment)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}

void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
   * and shared. For new project it could more beneficial to use USM shared memory, because it
   * provides automatic migration mechanism in order to allow to use the same pointer on host and
   * on device, without need to worry about explicit memory transfer operations, although usage of
   * USM shared imply some documented limitations on the memory usage in regards of parallel access
   * from different threads. But for Blender/Cycles this type of memory is not very suitable in
   * current application architecture, because Cycles is multi-thread application and already uses
   * two different pointer for host activity and device activity, and also has to perform all
   * needed memory transfer operations. So, USM device memory type has been used for oneAPI device
   * in order to better fit in Cycles architecture. */
#  ifndef WITH_ONEAPI_SYCL_HOST_TASK
  return sycl::malloc_device(memory_size, *queue);
#  else
  return sycl::malloc_host(memory_size, *queue);
#  endif
}

void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  OneapiDevice::check_usm(queue_, usm_ptr, true);
  sycl::free(usm_ptr, *queue);
}

bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, const size_t num_bytes)
{
  assert(queue_);
  /* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
   * runtime errors. It's better to avoid running memory operations in that case.
   * The render will be canceled and the queue will be destroyed anyway. */
  if (have_error()) {
    return false;
  }

  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  OneapiDevice::check_usm(queue_, dest, true);
  OneapiDevice::check_usm(queue_, src, true);
  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
  /* Unknown here means, that this is not an USM allocation, which implies that this is
   * some generic C++ allocation, so we could use C++ memcpy directly with USM host. */
  if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
      (src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
  {
    memcpy(dest, src, num_bytes);
    return true;
  }

  try {
    sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
#  ifdef WITH_CYCLES_DEBUG
    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
     */
    mem_event.wait_and_throw();
    return true;
#  else
    bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
                               src_type == sycl::usm::alloc::device;
    bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
                                            src_type == sycl::usm::alloc::unknown;
    /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
     * may not wait until the end of the transfer before using the memory.
     */
    if (from_device_to_host || host_or_device_memop_with_offset) {
      mem_event.wait();
    }
    return true;
#  endif
  }
  catch (const sycl::exception &e) {
    oneapi_error_string_ = e.what();
    return false;
  }
}

bool OneapiDevice::usm_memset(SyclQueue *queue_,
                              void *usm_ptr,
                              unsigned char value,
                              const size_t num_bytes)
{
  assert(queue_);
  /* sycl::queue::memset may crash if the queue is in an invalid state due to previous
   * runtime errors. It's better to avoid running memory operations in that case.
   * The render will be canceled and the queue will be destroyed anyway. */
  if (have_error()) {
    return false;
  }

  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  OneapiDevice::check_usm(queue_, usm_ptr, true);
  try {
    sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
#  ifdef WITH_CYCLES_DEBUG
    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
     */
    mem_event.wait_and_throw();
#  else
    (void)mem_event;
#  endif
    return true;
  }
  catch (const sycl::exception &e) {
    oneapi_error_string_ = e.what();
    return false;
  }
}

bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
{
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  try {
    queue->wait_and_throw();
    return true;
  }
  catch (const sycl::exception &e) {
    oneapi_error_string_ = e.what();
    return false;
  }
}

bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size)
{
  kernel_global_size = sizeof(KernelGlobalsGPU);

  return true;
}

void OneapiDevice::set_global_memory(SyclQueue *queue_,
                                     void *kernel_globals,
                                     const char *memory_name,
                                     void *memory_device_pointer)
{
  assert(queue_);
  assert(kernel_globals);
  assert(memory_name);
  assert(memory_device_pointer);
  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
  OneapiDevice::check_usm(queue_, memory_device_pointer, true);
  OneapiDevice::check_usm(queue_, kernel_globals, true);

  std::string matched_name(memory_name);

/* This macro will change global ptr of KernelGlobals via name matching. */
#  define KERNEL_DATA_ARRAY(type, name) \
    else if (#name == matched_name) { \
      globals->__##name = (type *)memory_device_pointer; \
      return; \
    }
  if (false) {
  }
  else if ("integrator_state" == matched_name) {
    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
    return;
  }
  KERNEL_DATA_ARRAY(KernelData, data)
#  include "kernel/data_arrays.h"
  else {
    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
              << std::endl;
    assert(false);
  }
#  undef KERNEL_DATA_ARRAY
}

bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
                                  const int kernel,
                                  const size_t global_size,
                                  const size_t local_size,
                                  void **args)
{
  return oneapi_enqueue_kernel(kernel_context,
                               kernel,
                               global_size,
                               local_size,
                               kernel_features,
                               use_hardware_raytracing,
                               args);
}

void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
                                                       const DeviceKernel kernel,
                                                       size_t &kernel_global_size,
                                                       size_t &kernel_local_size)
{
  assert(queue);
  static const size_t preferred_work_group_size_intersect = 128;
  static const size_t preferred_work_group_size_shading = 256;
  static const size_t preferred_work_group_size_shading_simd8 = 64;
  /* Shader evaluation kernels seems to use some amount of shared memory, so better
   * to avoid usage of maximum work group sizes for them. */
  static const size_t preferred_work_group_size_shader_evaluation = 256;
  /* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so
   * for now their work-group size is restricted to 512. */
  static const size_t preferred_work_group_size_cryptomatte = 512;
  static const size_t preferred_work_group_size_default = 1024;

  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
  const size_t max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();

  size_t preferred_work_group_size = 0;
  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
      preferred_work_group_size = preferred_work_group_size_intersect;
      break;

    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT: {
      const bool device_is_simd8 =
          (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
           device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() == 8);
      preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 :
                                                      preferred_work_group_size_shading;
    } break;

    case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
      preferred_work_group_size = preferred_work_group_size_cryptomatte;
      break;

    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
      preferred_work_group_size = preferred_work_group_size_shader_evaluation;
      break;

    default:
      /* Do nothing and keep initial zero value. */
      break;
  }

  /* Such order of logic allow us to override Blender default values, if needed,
   * yet respect them otherwise. */
  if (preferred_work_group_size == 0) {
    preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel);
  }

  /* If there is no recommendation, then use manual default value. */
  if (preferred_work_group_size == 0) {
    preferred_work_group_size = preferred_work_group_size_default;
  }

  kernel_local_size = std::min(max_work_group_size, preferred_work_group_size);

  /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
   * we extend work size to fit uniformity requirements. */
  kernel_global_size = round_up(kernel_global_size, kernel_local_size);

#  ifdef WITH_ONEAPI_SYCL_HOST_TASK
  /* Kernels listed below need a specific number of work groups. */
  if (kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
      kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY)
  {
    /* Path array implementation is serial in case of SYCL Host Task execution. */
    kernel_global_size = 1;
    kernel_local_size = 1;
  }
#  endif

  assert(kernel_global_size % kernel_local_size == 0);
}

/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
 * since Windows driver 101.3268. */
static const int lowest_supported_driver_version_win = 1015730;
#  ifdef _WIN32
/* For Windows driver 101.5730, compute-runtime version is 29550.
 * This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
static const int lowest_supported_driver_version_neo = 29550;
#  else
static const int lowest_supported_driver_version_neo = 29735;
#  endif

int parse_driver_build_version(const sycl::device &device)
{
  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
  int driver_build_version = 0;

  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
  if (second_dot_position == std::string::npos) {
    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
              << " xx.xx.xxx.xxxx (Windows) for device \""
              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
  }
  else {
    try {
      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
      if (third_dot_position != std::string::npos) {
        const std::string &third_number_substr = driver_version.substr(
            second_dot_position + 1, third_dot_position - second_dot_position - 1);
        const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
        if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) {
          driver_build_version = std::stoi(third_number_substr) * 10000 +
                                 std::stoi(forth_number_substr);
        }
      }
      else {
        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
        driver_build_version = std::stoi(third_number_substr);
      }
    }
    catch (std::invalid_argument &) {
      std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
                << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
                << " xx.xx.xxx.xxxx (Windows) for device \""
                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
    }
  }

  return driver_build_version;
}

std::vector<sycl::device> available_sycl_devices()
{
  bool allow_all_devices = false;
  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
    allow_all_devices = true;
  }

  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();

  std::vector<sycl::device> available_devices;
  for (const sycl::platform &platform : oneapi_platforms) {
    /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
     */
    if (platform.get_backend() == sycl::backend::opencl) {
      continue;
    }

    const std::vector<sycl::device> &oneapi_devices =
        (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
                              platform.get_devices(sycl::info::device_type::gpu);

    for (const sycl::device &device : oneapi_devices) {
      bool filter_out = false;
      if (!allow_all_devices) {
        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
         * Official support can be broaden to older and smaller GPUs once ready. */
        if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
          filter_out = true;
        }
        else {
          /* Filtered-out defaults in-case these values aren't available. */
          int number_of_eus = 96;
          int threads_per_eu = 7;
          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
            number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
          }
          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
            threads_per_eu =
                device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
          }
          /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
          if (number_of_eus <= 96 && threads_per_eu == 7) {
            filter_out = true;
          }
          /* if not already filtered out, check driver version. */
          bool check_driver_version = !filter_out;
          /* We don't know how to check driver version strings for non-Intel GPUs. */
          if (check_driver_version &&
              device.get_info<sycl::info::device::vendor>().find("Intel") == std::string::npos)
          {
            check_driver_version = false;
          }
          /* Because of https://github.com/oneapi-src/unified-runtime/issues/1777, future drivers
           * may break parsing done by a SYCL runtime from before the fix we expect in major
           * version 8. Parsed driver version would start with something different than current
           * "1.3.". To avoid blocking a device by mistake in the case of new driver / old SYCL
           * runtime, we disable driver version check in case LIBSYCL_MAJOR_VERSION is below 8 and
           * actual driver version doesn't start with 1.3. */
#  if __LIBSYCL_MAJOR_VERSION < 8
          if (check_driver_version &&
              !string_startswith(device.get_info<sycl::info::device::driver_version>(), "1.3."))
          {
            check_driver_version = false;
          }
#  endif
          if (check_driver_version) {
            int driver_build_version = parse_driver_build_version(device);
            if ((driver_build_version > 100000 &&
                 driver_build_version < lowest_supported_driver_version_win) ||
                driver_build_version < lowest_supported_driver_version_neo)
            {
              filter_out = true;
            }
          }
        }
      }
      if (!filter_out) {
        available_devices.push_back(device);
      }
    }
  }

  return available_devices;
}

char *OneapiDevice::device_capabilities()
{
  std::stringstream capabilities;

  const std::vector<sycl::device> &oneapi_devices = available_sycl_devices();
  for (const sycl::device &device : oneapi_devices) {
#  ifndef WITH_ONEAPI_SYCL_HOST_TASK
    const std::string &name = device.get_info<sycl::info::device::name>();
#  else
    const std::string &name = "SYCL Host Task (Debug)";
#  endif

    capabilities << std::string("\t") << name << "\n";
    capabilities << "\t\tsycl::info::platform::name\t\t\t"
                 << device.get_platform().get_info<sycl::info::platform::name>() << "\n";

#  define WRITE_ATTR(attribute_name, attribute_variable) \
    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
                 << "\n";
#  define GET_ATTR(attribute) \
    { \
      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" \
                   << device.get_info<sycl::info::device ::attribute>() << "\n"; \
    }
#  define GET_INTEL_ATTR(attribute) \
    { \
      if (device.has(sycl::aspect::ext_intel_##attribute)) { \
        capabilities << "\t\tsycl::ext::intel::info::device::" #attribute "\t\t\t" \
                     << device.get_info<sycl::ext::intel::info::device ::attribute>() << "\n"; \
      } \
    }
#  define GET_ASPECT(aspect_) \
    { \
      capabilities << "\t\tdevice::has(" #aspect_ ")\t\t\t" << device.has(sycl::aspect ::aspect_) \
                   << "\n"; \
    }

    GET_ATTR(vendor)
    GET_ATTR(driver_version)
    GET_ATTR(max_compute_units)
    GET_ATTR(max_clock_frequency)
    GET_ATTR(global_mem_size)
    GET_INTEL_ATTR(pci_address)
    GET_INTEL_ATTR(gpu_eu_simd_width)
    GET_INTEL_ATTR(gpu_eu_count)
    GET_INTEL_ATTR(gpu_slices)
    GET_INTEL_ATTR(gpu_subslices_per_slice)
    GET_INTEL_ATTR(gpu_eu_count_per_subslice)
    GET_INTEL_ATTR(gpu_hw_threads_per_eu)
    GET_INTEL_ATTR(max_mem_bandwidth)
    GET_ATTR(max_work_group_size)
    GET_ATTR(max_work_item_dimensions)
    sycl::id<3> max_work_item_sizes =
        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
    WRITE_ATTR(max_work_item_sizes[0], max_work_item_sizes.get(0))
    WRITE_ATTR(max_work_item_sizes[1], max_work_item_sizes.get(1))
    WRITE_ATTR(max_work_item_sizes[2], max_work_item_sizes.get(2))

    GET_ATTR(max_num_sub_groups)
    for (size_t sub_group_size : device.get_info<sycl::info::device::sub_group_sizes>()) {
      WRITE_ATTR(sub_group_size[], sub_group_size)
    }
    GET_ATTR(sub_group_independent_forward_progress)

    GET_ATTR(preferred_vector_width_char)
    GET_ATTR(preferred_vector_width_short)
    GET_ATTR(preferred_vector_width_int)
    GET_ATTR(preferred_vector_width_long)
    GET_ATTR(preferred_vector_width_float)
    GET_ATTR(preferred_vector_width_double)
    GET_ATTR(preferred_vector_width_half)

    GET_ATTR(address_bits)
    GET_ATTR(max_mem_alloc_size)
    GET_ATTR(mem_base_addr_align)
    GET_ATTR(error_correction_support)
    GET_ATTR(is_available)

    GET_ASPECT(cpu)
    GET_ASPECT(gpu)
    GET_ASPECT(fp16)
    GET_ASPECT(atomic64)
    GET_ASPECT(usm_host_allocations)
    GET_ASPECT(usm_device_allocations)
    GET_ASPECT(usm_shared_allocations)
    GET_ASPECT(usm_system_allocations)

#  ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_non_uniform_groups__
    GET_ASPECT(ext_oneapi_non_uniform_groups)
#  endif
#  ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images__
    GET_ASPECT(ext_oneapi_bindless_images)
#  endif
#  ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__
    GET_ASPECT(ext_oneapi_interop_semaphore_import)
#  endif
#  ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_export__
    GET_ASPECT(ext_oneapi_interop_semaphore_export)
#  endif

#  undef GET_INTEL_ATTR
#  undef GET_ASPECT
#  undef GET_ATTR
#  undef WRITE_ATTR
    capabilities << "\n";
  }

  return ::strdup(capabilities.str().c_str());
}

void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
  int num = 0;
  std::vector<sycl::device> devices = available_sycl_devices();
  for (sycl::device &device : devices) {
    const std::string &platform_name =
        device.get_platform().get_info<sycl::info::platform::name>();
#  ifndef WITH_ONEAPI_SYCL_HOST_TASK
    std::string name = device.get_info<sycl::info::device::name>();
#  else
    std::string name = "SYCL Host Task (Debug)";
#  endif
#  ifdef WITH_EMBREE_GPU
    bool hwrt_support = rtcIsSYCLDeviceSupported(device);
#  else
    bool hwrt_support = false;
#  endif
#  if defined(WITH_OPENIMAGEDENOISE) && OIDN_VERSION >= 20300
    bool oidn_support = oidnIsSYCLDeviceSupported(&device);
#  else
    bool oidn_support = false;
#  endif
    std::string id = "ONEAPI_" + platform_name + "_" + name;
    if (device.has(sycl::aspect::ext_intel_pci_address)) {
      id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
    }
    (cb)(id.c_str(), name.c_str(), num, hwrt_support, oidn_support, user_ptr);
    num++;
  }
}

size_t OneapiDevice::get_memcapacity()
{
  return reinterpret_cast<sycl::queue *>(device_queue_)
      ->get_device()
      .get_info<sycl::info::device::global_mem_size>();
}

int OneapiDevice::get_num_multiprocessors()
{
  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
  if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
    return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
  }
  return 0;
}

int OneapiDevice::get_max_num_threads_per_multiprocessor()
{
  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
  if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
      device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu))
  {
    return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
           device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
  }
  return 0;
}

CCL_NAMESPACE_END

#endif