/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation * * SPDX-License-Identifier: Apache-2.0 */ #ifdef WITH_ONEAPI /* is needed until included upstream in sycl/detail/property_list_base.hpp */ # include # include # include "device/oneapi/device_impl.h" # include "util/debug.h" # include "util/foreach.h" # include "util/log.h" # ifdef WITH_EMBREE_GPU # include "bvh/embree.h" # endif # if defined(WITH_OPENIMAGEDENOISE) # include # if OIDN_VERSION >= 20300 # include "util/openimagedenoise.h" # endif # endif # include "kernel/device/oneapi/globals.h" # include "kernel/device/oneapi/kernel.h" # if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION) /* These declarations are missing from embree headers when compiling from a compiler that doesn't * support SYCL. */ extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config); extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device); # endif CCL_NAMESPACE_BEGIN static std::vector available_sycl_devices(); static int parse_driver_build_version(const sycl::device &device); static void queue_error_cb(const char *message, void *user_ptr) { if (user_ptr) { *reinterpret_cast(user_ptr) = message; } } OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless) : GPUDevice(info, stats, profiler, headless), device_queue_(nullptr), # ifdef WITH_EMBREE_GPU embree_device(nullptr), embree_scene(nullptr), # endif kg_memory_(nullptr), kg_memory_device_(nullptr), kg_memory_size_(0) { /* Verify that base class types can be used with specific backend types */ static_assert(sizeof(texMemObject) == sizeof(void *)); static_assert(sizeof(arrayMemObject) == sizeof(void *)); use_hardware_raytracing = info.use_hardware_raytracing; oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); bool is_finished_ok = create_queue(device_queue_, info.num, # ifdef WITH_EMBREE_GPU use_hardware_raytracing ? &embree_device : nullptr # else nullptr # endif ); if (is_finished_ok == false) { set_error("oneAPI queue initialization error: got runtime exception \"" + oneapi_error_string_ + "\""); } else { VLOG_DEBUG << "oneAPI queue has been successfully created for the device \"" << info.description << "\""; assert(device_queue_); } # ifdef WITH_EMBREE_GPU use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr); # else use_hardware_raytracing = false; # endif if (use_hardware_raytracing) { VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration."; } size_t globals_segment_size; is_finished_ok = kernel_globals_size(globals_segment_size); if (is_finished_ok == false) { set_error("oneAPI constant memory initialization got runtime exception \"" + oneapi_error_string_ + "\""); } else { VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)"; } kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16); usm_memset(device_queue_, kg_memory_, 0, globals_segment_size); kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size); kg_memory_size_ = globals_segment_size; max_memory_on_device_ = get_memcapacity(); init_host_memory(); move_texture_to_host = false; can_map_host = true; const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM"); if (headroom_str != nullptr) { const long long override_headroom = (float)atoll(headroom_str); device_working_headroom = override_headroom; device_texture_headroom = override_headroom; } VLOG_DEBUG << "oneAPI memory headroom size: " << string_human_readable_size(device_working_headroom); } OneapiDevice::~OneapiDevice() { # ifdef WITH_EMBREE_GPU if (embree_device) rtcReleaseDevice(embree_device); # endif texture_info.free(); usm_free(device_queue_, kg_memory_); usm_free(device_queue_, kg_memory_device_); for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++) delete mt->second; if (device_queue_) free_queue(device_queue_); } bool OneapiDevice::check_peer_access(Device * /*peer_device*/) { return false; } bool OneapiDevice::can_use_hardware_raytracing_for_features(uint requested_features) const { /* MNEE and Ray-trace kernels work correctly with Hardware Ray-tracing starting with Embree 4.1. */ # if defined(RTC_VERSION) && RTC_VERSION < 40100 return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE)); # else (void)requested_features; return true; # endif } BVHLayoutMask OneapiDevice::get_bvh_layout_mask(uint requested_features) const { return (use_hardware_raytracing && can_use_hardware_raytracing_for_features(requested_features)) ? BVH_LAYOUT_EMBREEGPU : BVH_LAYOUT_BVH2; } # ifdef WITH_EMBREE_GPU void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) { if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREEGPU) { BVHEmbree *const bvh_embree = static_cast(bvh); if (refit) { bvh_embree->refit(progress); } else { bvh_embree->build(progress, &stats, embree_device, true); } # if RTC_VERSION >= 40302 thread_scoped_lock lock(scene_data_mutex); all_embree_scenes.push_back(bvh_embree->scene); # endif if (bvh->params.top_level) { embree_scene = bvh_embree->scene; # if RTC_VERSION >= 40302 if (bvh_embree->offload_scenes_to_gpu(all_embree_scenes) == false) { set_error( string_printf("BVH failed to to migrate to the GPU due to Embree library error (%s)", bvh_embree->get_last_error_message())); } all_embree_scenes.clear(); # endif } } else { Device::build_bvh(bvh, progress, refit); } } # endif size_t OneapiDevice::get_free_mem() const { /* Accurate: Use device info, which is practically useful only on dGPU. * This is because for non-discrete GPUs, all GPU memory allocations would * be in the RAM, thus having the same performance for device and host pointers, * so there is no need to be very accurate about what would end where. */ const sycl::device &device = reinterpret_cast(device_queue_)->get_device(); const bool is_integrated_gpu = device.get_info(); if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu == false) { return device.get_info(); } /* Estimate: Capacity - in use. */ else if (device_mem_in_use < max_memory_on_device_) { return max_memory_on_device_ - device_mem_in_use; } else { return 0; } } bool OneapiDevice::load_kernels(const uint requested_features) { assert(device_queue_); /* Kernel loading is expected to be a cumulative operation; for example, if * a device is asked to load kernel A and then kernel B, then after these * operations, both A and B should be available for use. So we need to store * and use a cumulative mask of the requested kernel features, and not just * the latest requested features. */ kernel_features |= requested_features; bool is_finished_ok = oneapi_run_test_kernel(device_queue_); if (is_finished_ok == false) { set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ + "\""); return false; } else { VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\""; assert(device_queue_); } if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) { VLOG_INFO << "Hardware ray tracing disabled, not supported yet by oneAPI for requested features."; use_hardware_raytracing = false; } is_finished_ok = oneapi_load_kernels( device_queue_, (const unsigned int)requested_features, use_hardware_raytracing); if (is_finished_ok == false) { set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\""); } else { VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\""; } if (is_finished_ok) { reserve_private_memory(requested_features); is_finished_ok = !have_error(); } return is_finished_ok; } void OneapiDevice::reserve_private_memory(const uint kernel_features) { size_t free_before = get_free_mem(); /* Use the biggest kernel for estimation. */ const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE : (kernel_features & KERNEL_FEATURE_MNEE) ? DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE : DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE; { unique_ptr queue = gpu_queue_create(); device_ptr d_path_index = 0; device_ptr d_render_buffer = 0; int d_work_size = 0; DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size); queue->init_execution(); /* Launch of the kernel seems to be sufficient to reserve all * needed memory regardless of the execution global size. * So, the smallest possible size is used here. */ queue->enqueue(test_kernel, 1, args); queue->synchronize(); } size_t free_after = get_free_mem(); VLOG_INFO << "For kernel execution were reserved " << string_human_readable_number(free_before - free_after) << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; } void OneapiDevice::get_device_memory_info(size_t &total, size_t &free) { free = get_free_mem(); total = max_memory_on_device_; } bool OneapiDevice::alloc_device(void *&device_pointer, size_t size) { bool allocation_success = false; device_pointer = usm_alloc_device(device_queue_, size); if (device_pointer != nullptr) { allocation_success = true; /* Due to lazy memory initialization in GPU runtime we will force memory to * appear in device memory via execution of a kernel using this memory. */ if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) { set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + "\""); usm_free(device_queue_, device_pointer); device_pointer = nullptr; allocation_success = false; } } return allocation_success; } void OneapiDevice::free_device(void *device_pointer) { usm_free(device_queue_, device_pointer); } bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size) { shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64); return shared_pointer != nullptr; } void OneapiDevice::free_host(void *shared_pointer) { usm_free(device_queue_, shared_pointer); } void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer) { /* Device and host pointer are in the same address space * as we're using Unified Shared Memory. */ device_pointer = shared_pointer; } void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) { usm_memcpy(device_queue_, device_pointer, host_pointer, size); } /* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */ SyclQueue *OneapiDevice::sycl_queue() { return device_queue_; } string OneapiDevice::oneapi_error_message() { return string(oneapi_error_string_); } int OneapiDevice::scene_max_shaders() { return scene_max_shaders_; } void *OneapiDevice::kernel_globals_device_pointer() { return kg_memory_device_; } void OneapiDevice::mem_alloc(device_memory &mem) { if (mem.type == MEM_TEXTURE) { assert(!"mem_alloc not supported for textures."); } else if (mem.type == MEM_GLOBAL) { assert(!"mem_alloc not supported for global memory."); } else { if (mem.name) { VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; } generic_alloc(mem); } } void OneapiDevice::mem_copy_to(device_memory &mem) { if (mem.name) { VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; } /* After getting runtime errors we need to avoid performing oneAPI runtime operations * because the associated GPU context may be in an invalid state at this point. */ if (have_error()) { return; } if (mem.type == MEM_GLOBAL) { global_free(mem); global_alloc(mem); } else if (mem.type == MEM_TEXTURE) { tex_free((device_texture &)mem); tex_alloc((device_texture &)mem); } else { if (!mem.device_pointer) generic_alloc(mem); generic_copy_to(mem); } } void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) { if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { assert(!"mem_copy_from not supported for textures."); } else if (mem.host_pointer) { const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size(); const size_t offset = elem * y * w; if (mem.name) { VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ") from offset " << offset << " data " << size << " bytes"; } /* After getting runtime errors we need to avoid performing oneAPI runtime operations * because the associated GPU context may be in an invalid state at this point. */ if (have_error()) { return; } assert(device_queue_); assert(size != 0); if (mem.device_pointer) { char *shifted_host = reinterpret_cast(mem.host_pointer) + offset; char *shifted_device = reinterpret_cast(mem.device_pointer) + offset; bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size); if (is_finished_ok == false) { set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + "\""); } } } } void OneapiDevice::mem_zero(device_memory &mem) { if (mem.name) { VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")\n"; } /* After getting runtime errors we need to avoid performing oneAPI runtime operations * because the associated GPU context may be in an invalid state at this point. */ if (have_error()) { return; } if (!mem.device_pointer) { mem_alloc(mem); } if (!mem.device_pointer) { return; } assert(device_queue_); bool is_finished_ok = usm_memset( device_queue_, (void *)mem.device_pointer, 0, mem.memory_size()); if (is_finished_ok == false) { set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + "\""); } } void OneapiDevice::mem_free(device_memory &mem) { if (mem.name) { VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", " << string_human_readable_number(mem.device_size) << " bytes. (" << string_human_readable_size(mem.device_size) << ")\n"; } if (mem.type == MEM_GLOBAL) { global_free(mem); } else if (mem.type == MEM_TEXTURE) { tex_free((device_texture &)mem); } else { generic_free(mem); } } device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) { return reinterpret_cast(reinterpret_cast(mem.device_pointer) + mem.memory_elements_size(offset)); } void OneapiDevice::const_copy_to(const char *name, void *host, size_t size) { assert(name); VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object " << string_human_readable_number(size) << " bytes. (" << string_human_readable_size(size) << ")"; # ifdef WITH_EMBREE_GPU if (embree_scene != nullptr && strcmp(name, "data") == 0) { assert(size <= sizeof(KernelData)); /* Update scene handle(since it is different for each device on multi devices) */ KernelData *const data = (KernelData *)host; data->device_bvh = embree_scene; /* We need this number later for proper local memory allocation. */ scene_max_shaders_ = data->max_shaders; } # endif ConstMemMap::iterator i = const_mem_map_.find(name); device_vector *data; if (i == const_mem_map_.end()) { data = new device_vector(this, name, MEM_READ_ONLY); data->alloc(size); const_mem_map_.insert(ConstMemMap::value_type(name, data)); } else { data = i->second; } assert(data->memory_size() <= size); memcpy(data->data(), host, size); data->copy_to_device(); set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer); usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); } void OneapiDevice::global_alloc(device_memory &mem) { assert(mem.name); size_t size = mem.memory_size(); VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object " << string_human_readable_number(size) << " bytes. (" << string_human_readable_size(size) << ")"; generic_alloc(mem); generic_copy_to(mem); set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer); usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); } void OneapiDevice::global_free(device_memory &mem) { if (mem.device_pointer) { generic_free(mem); } } void OneapiDevice::tex_alloc(device_texture &mem) { generic_alloc(mem); generic_copy_to(mem); /* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */ const uint slot = mem.slot; if (slot >= texture_info.size()) { texture_info.resize(slot + 128); } texture_info[slot] = mem.info; need_texture_info = true; texture_info[slot].data = (uint64_t)mem.device_pointer; } void OneapiDevice::tex_free(device_texture &mem) { /* There is no texture memory in SYCL. */ if (mem.device_pointer) { generic_free(mem); } } unique_ptr OneapiDevice::gpu_queue_create() { return make_unique(this); } bool OneapiDevice::should_use_graphics_interop() { /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so * return false. */ return false; } void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment) { assert(device_queue_); return usm_aligned_alloc_host(device_queue_, memory_size, alignment); } void OneapiDevice::usm_free(void *usm_ptr) { assert(device_queue_); return usm_free(device_queue_, usm_ptr); } void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) { # ifndef NDEBUG sycl::queue *queue = reinterpret_cast(queue_); sycl::info::device_type device_type = queue->get_device().get_info(); sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); (void)usm_type; # ifndef WITH_ONEAPI_SYCL_HOST_TASK const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device; # else const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host; # endif assert(usm_type == main_memory_type || (usm_type == sycl::usm::alloc::host && (allow_host || device_type == sycl::info::device_type::cpu)) || usm_type == sycl::usm::alloc::unknown); # else /* Silence warning about unused arguments. */ (void)queue_; (void)usm_ptr; (void)allow_host; # endif } bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index, void *embree_device_pointer) { bool finished_correct = true; try { std::vector devices = available_sycl_devices(); if (device_index < 0 || device_index >= devices.size()) { return false; } sycl::queue *created_queue = new sycl::queue(devices[device_index], sycl::property::queue::in_order()); external_queue = reinterpret_cast(created_queue); # ifdef WITH_EMBREE_GPU if (embree_device_pointer) { RTCDevice *device_object_ptr = reinterpret_cast(embree_device_pointer); *device_object_ptr = rtcNewSYCLDevice(created_queue->get_context(), ""); if (*device_object_ptr == nullptr) { finished_correct = false; oneapi_error_string_ = "Hardware Raytracing is not available; please install " "\"intel-level-zero-gpu-raytracing\" to enable it or disable Embree on GPU."; } } # else (void)embree_device_pointer; # endif } catch (sycl::exception const &e) { finished_correct = false; oneapi_error_string_ = e.what(); } return finished_correct; } void OneapiDevice::free_queue(SyclQueue *queue_) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); delete queue; } void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); return sycl::aligned_alloc_host(alignment, memory_size, *queue); } void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device * and shared. For new project it could more beneficial to use USM shared memory, because it * provides automatic migration mechanism in order to allow to use the same pointer on host and * on device, without need to worry about explicit memory transfer operations, although usage of * USM shared imply some documented limitations on the memory usage in regards of parallel access * from different threads. But for Blender/Cycles this type of memory is not very suitable in * current application architecture, because Cycles is multi-thread application and already uses * two different pointer for host activity and device activity, and also has to perform all * needed memory transfer operations. So, USM device memory type has been used for oneAPI device * in order to better fit in Cycles architecture. */ # ifndef WITH_ONEAPI_SYCL_HOST_TASK return sycl::malloc_device(memory_size, *queue); # else return sycl::malloc_host(memory_size, *queue); # endif } void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); OneapiDevice::check_usm(queue_, usm_ptr, true); sycl::free(usm_ptr, *queue); } bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) { assert(queue_); /* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous * runtime errors. It's better to avoid running memory operations in that case. * The render will be canceled and the queue will be destroyed anyway. */ if (have_error()) return false; sycl::queue *queue = reinterpret_cast(queue_); OneapiDevice::check_usm(queue_, dest, true); OneapiDevice::check_usm(queue_, src, true); sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context()); sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context()); /* Unknown here means, that this is not an USM allocation, which implies that this is * some generic C++ allocation, so we could use C++ memcpy directly with USM host. */ if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) && (src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown)) { memcpy(dest, src, num_bytes); return true; } try { sycl::event mem_event = queue->memcpy(dest, src, num_bytes); # ifdef WITH_CYCLES_DEBUG /* NOTE(@nsirgien) Waiting on memory operation may give more precise error * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. */ mem_event.wait_and_throw(); return true; # else bool from_device_to_host = dest_type == sycl::usm::alloc::host && src_type == sycl::usm::alloc::device; bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown || src_type == sycl::usm::alloc::unknown; /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host * may not wait until the end of the transfer before using the memory. */ if (from_device_to_host || host_or_device_memop_with_offset) mem_event.wait(); return true; # endif } catch (sycl::exception const &e) { oneapi_error_string_ = e.what(); return false; } } bool OneapiDevice::usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) { assert(queue_); /* sycl::queue::memset may crash if the queue is in an invalid state due to previous * runtime errors. It's better to avoid running memory operations in that case. * The render will be canceled and the queue will be destroyed anyway. */ if (have_error()) return false; sycl::queue *queue = reinterpret_cast(queue_); OneapiDevice::check_usm(queue_, usm_ptr, true); try { sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); # ifdef WITH_CYCLES_DEBUG /* NOTE(@nsirgien) Waiting on memory operation may give more precise error * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. */ mem_event.wait_and_throw(); # else (void)mem_event; # endif return true; } catch (sycl::exception const &e) { oneapi_error_string_ = e.what(); return false; } } bool OneapiDevice::queue_synchronize(SyclQueue *queue_) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); try { queue->wait_and_throw(); return true; } catch (sycl::exception const &e) { oneapi_error_string_ = e.what(); return false; } } bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size) { kernel_global_size = sizeof(KernelGlobalsGPU); return true; } void OneapiDevice::set_global_memory(SyclQueue *queue_, void *kernel_globals, const char *memory_name, void *memory_device_pointer) { assert(queue_); assert(kernel_globals); assert(memory_name); assert(memory_device_pointer); KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; OneapiDevice::check_usm(queue_, memory_device_pointer, true); OneapiDevice::check_usm(queue_, kernel_globals, true); std::string matched_name(memory_name); /* This macro will change global ptr of KernelGlobals via name matching. */ # define KERNEL_DATA_ARRAY(type, name) \ else if (#name == matched_name) { \ globals->__##name = (type *)memory_device_pointer; \ return; \ } if (false) { } else if ("integrator_state" == matched_name) { globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; return; } KERNEL_DATA_ARRAY(KernelData, data) # include "kernel/data_arrays.h" else { std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" << std::endl; assert(false); } # undef KERNEL_DATA_ARRAY } bool OneapiDevice::enqueue_kernel( KernelContext *kernel_context, int kernel, size_t global_size, size_t local_size, void **args) { return oneapi_enqueue_kernel(kernel_context, kernel, global_size, local_size, kernel_features, use_hardware_raytracing, args); } void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue, const DeviceKernel kernel, size_t &kernel_global_size, size_t &kernel_local_size) { assert(queue); const static size_t preferred_work_group_size_intersect = 128; const static size_t preferred_work_group_size_shading = 256; const static size_t preferred_work_group_size_shading_simd8 = 64; /* Shader evaluation kernels seems to use some amount of shared memory, so better * to avoid usage of maximum work group sizes for them. */ const static size_t preferred_work_group_size_shader_evaluation = 256; /* NOTE(@nsirgien): 1024 currently may lead to issues with cryptomatte kernels, so * for now their work-group size is restricted to 512. */ const static size_t preferred_work_group_size_cryptomatte = 512; const static size_t preferred_work_group_size_default = 1024; const sycl::device &device = reinterpret_cast(queue)->get_device(); const size_t max_work_group_size = device.get_info(); size_t preferred_work_group_size = 0; switch (kernel) { case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT: preferred_work_group_size = preferred_work_group_size_intersect; break; case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT: { const bool device_is_simd8 = (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && device.get_info() == 8); preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 : preferred_work_group_size_shading; } break; case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: preferred_work_group_size = preferred_work_group_size_cryptomatte; break; case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: preferred_work_group_size = preferred_work_group_size_shader_evaluation; break; default: /* Do nothing and keep initial zero value. */ break; } /* Such order of logic allow us to override Blender default values, if needed, * yet respect them otherwise. */ if (preferred_work_group_size == 0) { preferred_work_group_size = oneapi_suggested_gpu_kernel_size((::DeviceKernel)kernel); } /* If there is no recommendation, then use manual default value. */ if (preferred_work_group_size == 0) { preferred_work_group_size = preferred_work_group_size_default; } kernel_local_size = std::min(max_work_group_size, preferred_work_group_size); /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices, * we extend work size to fit uniformity requirements. */ kernel_global_size = round_up(kernel_global_size, kernel_local_size); # ifdef WITH_ONEAPI_SYCL_HOST_TASK /* Kernels listed below need a specific number of work groups. */ if (kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY || kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY || kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY || kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY || kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY || kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY || kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) { /* Path array implementation is serial in case of SYCL Host Task execution. */ kernel_global_size = 1; kernel_local_size = 1; } # endif assert(kernel_global_size % kernel_local_size == 0); } /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows * since Windows driver 101.3268. */ static const int lowest_supported_driver_version_win = 1015730; # ifdef _WIN32 /* For Windows driver 101.5730, compute-runtime version is 29550. * This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/ static const int lowest_supported_driver_version_neo = 29550; # else static const int lowest_supported_driver_version_neo = 27642; # endif int parse_driver_build_version(const sycl::device &device) { const std::string &driver_version = device.get_info(); int driver_build_version = 0; size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); if (second_dot_position == std::string::npos) { std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," << " xx.xx.xxx.xxxx (Windows) for device \"" << device.get_info() << "\"." << std::endl; } else { try { size_t third_dot_position = driver_version.find('.', second_dot_position + 1); if (third_dot_position != std::string::npos) { const std::string &third_number_substr = driver_version.substr( second_dot_position + 1, third_dot_position - second_dot_position - 1); const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) driver_build_version = std::stoi(third_number_substr) * 10000 + std::stoi(forth_number_substr); } else { const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); driver_build_version = std::stoi(third_number_substr); } } catch (std::invalid_argument &) { std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," << " xx.xx.xxx.xxxx (Windows) for device \"" << device.get_info() << "\"." << std::endl; } } return driver_build_version; } std::vector available_sycl_devices() { bool allow_all_devices = false; if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) { allow_all_devices = true; } const std::vector &oneapi_platforms = sycl::platform::get_platforms(); std::vector available_devices; for (const sycl::platform &platform : oneapi_platforms) { /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. */ if (platform.get_backend() == sycl::backend::opencl) { continue; } const std::vector &oneapi_devices = (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) : platform.get_devices(sycl::info::device_type::gpu); for (const sycl::device &device : oneapi_devices) { bool filter_out = false; if (!allow_all_devices) { /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, * assuming they have either more than 96 Execution Units or not 7 threads per EU. * Official support can be broaden to older and smaller GPUs once ready. */ if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) { filter_out = true; } else { /* Filtered-out defaults in-case these values aren't available. */ int number_of_eus = 96; int threads_per_eu = 7; if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { number_of_eus = device.get_info(); } if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { threads_per_eu = device.get_info(); } /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ if (number_of_eus <= 96 && threads_per_eu == 7) { filter_out = true; } /* if not already filtered out, check driver version. */ bool check_driver_version = !filter_out; /* We don't know how to check driver version strings for non-Intel GPUs. */ if (check_driver_version && device.get_info().find("Intel") == std::string::npos) { check_driver_version = false; } /* Because of https://github.com/oneapi-src/unified-runtime/issues/1777, future drivers * may break parsing done by a SYCL runtime from before the fix we expect in major * version 8. Parsed driver version would start with something different than current * "1.3.". To avoid blocking a device by mistake in the case of new driver / old SYCL * runtime, we disable driver version check in case LIBSYCL_MAJOR_VERSION is below 8 and * actual driver version doesn't start with 1.3. */ # if __LIBSYCL_MAJOR_VERSION < 8 if (check_driver_version && !string_startswith(device.get_info(), "1.3.")) { check_driver_version = false; } # endif if (check_driver_version) { int driver_build_version = parse_driver_build_version(device); if ((driver_build_version > 100000 && driver_build_version < lowest_supported_driver_version_win) || driver_build_version < lowest_supported_driver_version_neo) { filter_out = true; } } } } if (!filter_out) { available_devices.push_back(device); } } } return available_devices; } char *OneapiDevice::device_capabilities() { std::stringstream capabilities; const std::vector &oneapi_devices = available_sycl_devices(); for (const sycl::device &device : oneapi_devices) { # ifndef WITH_ONEAPI_SYCL_HOST_TASK const std::string &name = device.get_info(); # else const std::string &name = "SYCL Host Task (Debug)"; # endif capabilities << std::string("\t") << name << "\n"; capabilities << "\t\tsycl::info::platform::name\t\t\t" << device.get_platform().get_info() << "\n"; # define WRITE_ATTR(attribute_name, attribute_variable) \ capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ << "\n"; # define GET_ATTR(attribute) \ { \ capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" \ << device.get_info() << "\n"; \ } # define GET_INTEL_ATTR(attribute) \ { \ if (device.has(sycl::aspect::ext_intel_##attribute)) { \ capabilities << "\t\tsycl::ext::intel::info::device::" #attribute "\t\t\t" \ << device.get_info() << "\n"; \ } \ } # define GET_ASPECT(aspect_) \ { \ capabilities << "\t\tdevice::has(" #aspect_ ")\t\t\t" << device.has(sycl::aspect ::aspect_) \ << "\n"; \ } GET_ATTR(vendor) GET_ATTR(driver_version) GET_ATTR(max_compute_units) GET_ATTR(max_clock_frequency) GET_ATTR(global_mem_size) GET_INTEL_ATTR(pci_address) GET_INTEL_ATTR(gpu_eu_simd_width) GET_INTEL_ATTR(gpu_eu_count) GET_INTEL_ATTR(gpu_slices) GET_INTEL_ATTR(gpu_subslices_per_slice) GET_INTEL_ATTR(gpu_eu_count_per_subslice) GET_INTEL_ATTR(gpu_hw_threads_per_eu) GET_INTEL_ATTR(max_mem_bandwidth) GET_ATTR(max_work_group_size) GET_ATTR(max_work_item_dimensions) sycl::id<3> max_work_item_sizes = device.get_info>(); WRITE_ATTR(max_work_item_sizes[0], max_work_item_sizes.get(0)) WRITE_ATTR(max_work_item_sizes[1], max_work_item_sizes.get(1)) WRITE_ATTR(max_work_item_sizes[2], max_work_item_sizes.get(2)) GET_ATTR(max_num_sub_groups) for (size_t sub_group_size : device.get_info()) { WRITE_ATTR(sub_group_size[], sub_group_size) } GET_ATTR(sub_group_independent_forward_progress) GET_ATTR(preferred_vector_width_char) GET_ATTR(preferred_vector_width_short) GET_ATTR(preferred_vector_width_int) GET_ATTR(preferred_vector_width_long) GET_ATTR(preferred_vector_width_float) GET_ATTR(preferred_vector_width_double) GET_ATTR(preferred_vector_width_half) GET_ATTR(address_bits) GET_ATTR(max_mem_alloc_size) GET_ATTR(mem_base_addr_align) GET_ATTR(error_correction_support) GET_ATTR(is_available) GET_ASPECT(cpu) GET_ASPECT(gpu) GET_ASPECT(fp16) GET_ASPECT(atomic64) GET_ASPECT(usm_host_allocations) GET_ASPECT(usm_device_allocations) GET_ASPECT(usm_shared_allocations) GET_ASPECT(usm_system_allocations) # ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_non_uniform_groups__ GET_ASPECT(ext_oneapi_non_uniform_groups) # endif # ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images__ GET_ASPECT(ext_oneapi_bindless_images) # endif # ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__ GET_ASPECT(ext_oneapi_interop_semaphore_import) # endif # ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_export__ GET_ASPECT(ext_oneapi_interop_semaphore_export) # endif # undef GET_INTEL_ATTR # undef GET_ASPECT # undef GET_ATTR # undef WRITE_ATTR capabilities << "\n"; } return ::strdup(capabilities.str().c_str()); } void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) { int num = 0; std::vector devices = available_sycl_devices(); for (sycl::device &device : devices) { const std::string &platform_name = device.get_platform().get_info(); # ifndef WITH_ONEAPI_SYCL_HOST_TASK std::string name = device.get_info(); # else std::string name = "SYCL Host Task (Debug)"; # endif # ifdef WITH_EMBREE_GPU bool hwrt_support = rtcIsSYCLDeviceSupported(device); # else bool hwrt_support = false; # endif # if defined(WITH_OPENIMAGEDENOISE) && OIDN_VERSION >= 20300 bool oidn_support = oidnIsSYCLDeviceSupported(&device); # else bool oidn_support = false; # endif std::string id = "ONEAPI_" + platform_name + "_" + name; if (device.has(sycl::aspect::ext_intel_pci_address)) { id.append("_" + device.get_info()); } (cb)(id.c_str(), name.c_str(), num, hwrt_support, oidn_support, user_ptr); num++; } } size_t OneapiDevice::get_memcapacity() { return reinterpret_cast(device_queue_) ->get_device() .get_info(); } int OneapiDevice::get_num_multiprocessors() { const sycl::device &device = reinterpret_cast(device_queue_)->get_device(); if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { return device.get_info(); } else return 0; } int OneapiDevice::get_max_num_threads_per_multiprocessor() { const sycl::device &device = reinterpret_cast(device_queue_)->get_device(); if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { return device.get_info() * device.get_info(); } else return 0; } CCL_NAMESPACE_END #endif