This PR adds Vulkan/oneAPI graphics interop to Cycles. Just like for CUDA and HIP interop, persistent memory mapping is used, as there could potentially be some overhead of continuously mapping/unmapping buffers. Pull Request: https://projects.blender.org/blender/blender/pulls/144442
175 lines
6.7 KiB
C++
175 lines
6.7 KiB
C++
/* SPDX-FileCopyrightText: 2021-2025 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#ifdef WITH_ONEAPI
|
|
# include "device/device.h"
|
|
# include "device/oneapi/device.h"
|
|
# include "device/oneapi/queue.h"
|
|
# include "kernel/device/oneapi/kernel.h"
|
|
|
|
# include "util/map.h"
|
|
# include "util/unique_ptr.h"
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Forward declaration. Within this header the type is only named as the pointee of the
 * `unique_ptr<DeviceQueue>` returned by `OneapiDevice::gpu_queue_create()`. */
class DeviceQueue;
|
|
|
|
/* Callback type accepted by `OneapiDevice::iterate_devices()`: a plain function pointer that
 * returns nothing and takes two C strings, an int, three bools and a trailing opaque pointer.
 *
 * NOTE(review): the meaning of the individual arguments is not visible in this header. The
 * trailing `void *` presumably receives the `user_ptr` handed to `iterate_devices()` - confirm
 * against the implementation. */
using OneAPIDeviceIteratorCallback =
    void (*)(const char *, const char *, const int, bool, bool, bool, void *);
|
|
|
|
class OneapiDevice : public GPUDevice {
|
|
private:
|
|
SyclQueue *device_queue_ = nullptr;
|
|
# ifdef WITH_EMBREE_GPU
|
|
RTCDevice embree_device = nullptr;
|
|
# if RTC_VERSION >= 40400
|
|
RTCTraversable embree_traversable = nullptr;
|
|
# else
|
|
RTCScene embree_traversable = nullptr;
|
|
# endif
|
|
# if RTC_VERSION >= 40302
|
|
thread_mutex scene_data_mutex;
|
|
vector<RTCScene> all_embree_scenes;
|
|
# endif
|
|
# endif
|
|
using ConstMemMap = map<string, unique_ptr<device_vector<uchar>>>;
|
|
ConstMemMap const_mem_map_;
|
|
void *kg_memory_ = nullptr;
|
|
void *kg_memory_device_ = nullptr;
|
|
size_t kg_memory_size_ = 0;
|
|
size_t max_memory_on_device_ = 0;
|
|
std::string oneapi_error_string_;
|
|
bool use_hardware_raytracing = false;
|
|
unsigned int kernel_features = 0;
|
|
int scene_max_shaders_ = 0;
|
|
/* Currently, there are some functional errors in the different software layers of the DPC++/L0
|
|
* support regarding several Intel's dGPU executions. As a result, to provide proper
|
|
* functionality to Blender users, we need to detect such configurations and enable some
|
|
* workarounds for them. These workarounds don't make sense to enable by default due to a
|
|
* performance impact - which is not as important for the discussed configuration, as without
|
|
* workarounds, the configuration with several dGPUs would simply not be functional, making the
|
|
* performance topic irrelevant anyway. For an example of such issues, see Blender issue #138384.
|
|
*/
|
|
bool is_several_intel_dgpu_devices_detected = false;
|
|
|
|
size_t get_free_mem() const;
|
|
|
|
public:
|
|
BVHLayoutMask get_bvh_layout_mask(const uint requested_features) const override;
|
|
|
|
OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless);
|
|
|
|
~OneapiDevice() override;
|
|
# ifdef WITH_EMBREE_GPU
|
|
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
|
# endif
|
|
bool check_peer_access(Device *peer_device) override;
|
|
|
|
bool load_kernels(const uint requested_features) override;
|
|
|
|
void reserve_private_memory(const uint kernel_features);
|
|
|
|
string oneapi_error_message();
|
|
|
|
int scene_max_shaders();
|
|
|
|
void *kernel_globals_device_pointer();
|
|
|
|
/* All memory types. */
|
|
void mem_alloc(device_memory &mem) override;
|
|
void mem_copy_to(device_memory &mem) override;
|
|
void mem_move_to_host(device_memory &mem) override;
|
|
void mem_copy_from(
|
|
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
|
|
void mem_copy_from(device_memory &mem)
|
|
{
|
|
mem_copy_from(mem, 0, 0, 0, 0);
|
|
}
|
|
void mem_zero(device_memory &mem) override;
|
|
void mem_free(device_memory &mem) override;
|
|
|
|
device_ptr mem_alloc_sub_ptr(device_memory &mem, const size_t offset, size_t /*size*/) override;
|
|
|
|
/* Global memory. */
|
|
void global_alloc(device_memory &mem);
|
|
void global_copy_to(device_memory &mem);
|
|
void global_free(device_memory &mem);
|
|
|
|
/* Texture memory. */
|
|
void tex_alloc(device_texture &mem);
|
|
void tex_copy_to(device_texture &mem);
|
|
void tex_free(device_texture &mem);
|
|
|
|
/* Host side memory, override for more efficient copies. */
|
|
void *host_alloc(const MemoryType type, const size_t size) override;
|
|
void host_free(const MemoryType type, void *host_pointer, const size_t size) override;
|
|
|
|
/* Device side memory. */
|
|
void get_device_memory_info(size_t &total, size_t &free) override;
|
|
bool alloc_device(void *&device_pointer, const size_t size) override;
|
|
void free_device(void *device_pointer) override;
|
|
|
|
/* Shared memory. */
|
|
bool shared_alloc(void *&shared_pointer, const size_t size) override;
|
|
void shared_free(void *shared_pointer) override;
|
|
void *shared_to_device_pointer(const void *shared_pointer) override;
|
|
|
|
/* Memory copy. */
|
|
void copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size) override;
|
|
void const_copy_to(const char *name, void *host, const size_t size) override;
|
|
|
|
/* Graphics resources interoperability. */
|
|
bool should_use_graphics_interop(const GraphicsInteropDevice &interop_device,
|
|
const bool log) override;
|
|
|
|
unique_ptr<DeviceQueue> gpu_queue_create() override;
|
|
|
|
/* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
|
|
* side compilation (MSVC). */
|
|
void *usm_aligned_alloc_host(const size_t memory_size, const size_t alignment);
|
|
void usm_free(void *usm_ptr);
|
|
|
|
static void architecture_information(const SyclDevice *device, string &name, bool &is_optimized);
|
|
static char *device_capabilities();
|
|
static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);
|
|
|
|
size_t get_memcapacity();
|
|
int get_num_multiprocessors();
|
|
int get_max_num_threads_per_multiprocessor();
|
|
bool queue_synchronize(SyclQueue *queue);
|
|
bool kernel_globals_size(size_t &kernel_global_size);
|
|
void set_global_memory(SyclQueue *queue,
|
|
void *kernel_globals,
|
|
const char *memory_name,
|
|
void *memory_device_pointer);
|
|
bool enqueue_kernel(KernelContext *kernel_context,
|
|
const int kernel,
|
|
const size_t global_size,
|
|
const size_t local_size,
|
|
void **args);
|
|
void get_adjusted_global_and_local_sizes(SyclQueue *queue,
|
|
const DeviceKernel kernel,
|
|
size_t &kernel_global_size,
|
|
size_t &kernel_local_size);
|
|
SyclQueue *sycl_queue();
|
|
|
|
protected:
|
|
bool can_use_hardware_raytracing_for_features(const uint requested_features) const;
|
|
void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
|
|
bool create_queue(SyclQueue *&external_queue,
|
|
const int device_index,
|
|
void *embree_device,
|
|
bool *is_several_intel_dgpu_devices_detected_pointer);
|
|
void free_queue(SyclQueue *queue);
|
|
void *usm_aligned_alloc_host(SyclQueue *queue, const size_t memory_size, const size_t alignment);
|
|
void *usm_alloc_device(SyclQueue *queue, const size_t memory_size);
|
|
void usm_free(SyclQueue *queue, void *usm_ptr);
|
|
bool usm_memcpy(SyclQueue *queue, void *dest, void *src, const size_t num_bytes);
|
|
bool usm_memset(SyclQueue *queue, void *usm_ptr, unsigned char value, const size_t num_bytes);
|
|
};
|
|
|
|
CCL_NAMESPACE_END
|
|
|
|
#endif
|