diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61b6bee66d0..e73bf5f526e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -740,6 +740,8 @@ This option is only for debugging purposes."
   # https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
   # The supported devices can be retrieved from `ocloc` output when running
   # `ocloc compile --help`.
+  # If you have completed optimization work and now want to enable AoT for new Intel devices,
+  # update the optimization status in OneapiDevice::architecture_information.
   set(CYCLES_ONEAPI_INTEL_BINARIES_ARCH dg2 mtl lnl bmg CACHE STRING "\
     oneAPI Intel GPU architectures to build binaries for"
   )
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 3552055a5ad..25ed6db3e72 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -269,7 +269,7 @@ def enum_openimagedenoise_denoiser(self, context):


 def enum_optix_denoiser(self, context):
-    if not context or bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX')):
+    if not context or bool(context.preferences.addons[__package__].preferences.get_device_list('OPTIX')):
         return [('OPTIX', "OptiX", n_(
             "Use the OptiX AI denoiser with GPU acceleration, only available on NVIDIA GPUs when configured in the system tab in the user preferences"), 2)]
     return []
@@ -1668,16 +1668,19 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                 # Update name in case it changed
                 entry.name = device[0]

-    # Gets all devices types for a compute device type.
-    def get_devices_for_type(self, compute_device_type):
+    # Gets all device types to display in the preferences for a compute device type.
+    # This includes the CPU device.
+    def get_devices_for_type(self, compute_device_type, device_list=None):
         # Layout of the device tuples: (Name, Type, Persistent ID)
-        device_list = self.get_device_list(compute_device_type)
+        if device_list is None:
+            device_list = self.get_device_list(compute_device_type)

         # Sort entries into lists
         devices = []
         cpu_devices = []
         for device in device_list:
             entry = self.find_existing_device_entry(device)
+            entry.optimized = device[7]
             if entry.type == compute_device_type:
                 devices.append(entry)
             elif entry.type == 'CPU':
@@ -1768,9 +1771,20 @@ class CyclesPreferences(bpy.types.AddonPreferences):

         return False

-    def _draw_devices(self, layout, device_type, devices):
+    @staticmethod
+    def _format_device_name(name):
+        import unicodedata
+        return name.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) \
+                   .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN')) \
+                   .replace('(R)', unicodedata.lookup('REGISTERED SIGN')) \
+                   .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
+
+    def _draw_devices(self, layout, device_type, device_list):
         box = layout.box()
+        # Get preference devices, including CPU.
+        devices = self.get_devices_for_type(device_type, device_list)
+
         found_device = False
         for device in devices:
             if device.type == device_type:
@@ -1839,15 +1853,10 @@ class CyclesPreferences(bpy.types.AddonPreferences):
             return

         for device in devices:
-            import unicodedata
-            box.prop(
-                device, "use", text=device.name
-                .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
-                .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN'))
-                .replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
-                .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')),
-                translate=False
-            )
+            name = self._format_device_name(device.name)
+            if not device.optimized:
+                name += rpt_(" (Unoptimized Performance)")
+            box.prop(device, "use", text=name, translate=False)

     def draw_impl(self, layout, context):
         row = layout.row()
@@ -1857,14 +1866,14 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         if compute_device_type == 'NONE':
             return
         row = layout.row()
-        devices = self.get_devices_for_type(compute_device_type)
+        devices = self.get_device_list(compute_device_type)
         self._draw_devices(row, compute_device_type, devices)

         import _cycles
         has_peer_memory = False
         has_enabled_hardware_rt = False
         has_disabled_hardware_rt = False
-        for device in self.get_device_list(compute_device_type):
+        for device in devices:
             if not self.find_existing_device_entry(device).use:
                 continue
             if device[1] != compute_device_type:
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index dbc87ff3abf..d4a8ade302a 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -440,7 +440,7 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
   for (size_t i = 0; i < devices.size(); i++) {
     const DeviceInfo &device = devices[i];
     const string type_name = Device::string_from_type(device.type);
-    PyObject *device_tuple = PyTuple_New(7);
+    PyObject *device_tuple = PyTuple_New(8);
     PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
     PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
     PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
@@ -449,6 +449,7 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
     PyTuple_SET_ITEM(
         device_tuple, 5, PyBool_FromLong(device.denoisers & DENOISER_OPENIMAGEDENOISE));
     PyTuple_SET_ITEM(device_tuple, 6, PyBool_FromLong(device.denoisers & DENOISER_OPTIX));
+    PyTuple_SET_ITEM(device_tuple, 7, PyBool_FromLong(device.has_execution_optimization));

     PyTuple_SET_ITEM(ret, i, device_tuple);
   }
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 55d1ed0c3b5..bed8bd3d220 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -94,6 +94,10 @@ class DeviceInfo {
   bool has_gpu_queue = false;           /* Device supports GPU queue. */
   bool use_hardware_raytracing = false; /* Use hardware instructions to accelerate ray tracing. */
   bool use_metalrt_by_default = false;  /* Use MetalRT by default. */
+  /* Indicate that device execution has been optimized by Blender or vendor developers.
+   * For LTS versions, this helps communicate that newer versions may have better performance. */
+  bool has_execution_optimization = true;
+
   KernelOptimizationLevel kernel_optimization_level = KERNEL_OPTIMIZATION_LEVEL_FULL;
   /* Optimization level applied to path tracing
    * kernels (Metal only).
    */
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index 15c20a7b7e0..b4d5855bf17 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -99,6 +99,7 @@ static void device_iterator_cb(const char *id,
                                const int num,
                                bool hwrt_support,
                                bool oidn_support,
+                               bool has_execution_optimization,
                                void *user_ptr)
 {
   vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
@@ -140,6 +141,8 @@ static void device_iterator_cb(const char *id,
   (void)hwrt_support;
 # endif

+  info.has_execution_optimization = has_execution_optimization;
+
   devices->push_back(info);

   VLOG_INFO << "Added device \"" << info.description << "\" with id \"" << info.id << "\".";
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index f0614b85bc2..270dfa91025 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -1454,6 +1454,64 @@ std::vector<sycl::device> available_sycl_devices()
   return available_devices;
 }

+void OneapiDevice::architecture_information(const SyclDevice *device,
+                                            string &name,
+                                            bool &is_optimized)
+{
+  const sycl::ext::oneapi::experimental::architecture arch =
+      reinterpret_cast<const sycl::device *>(device)
+          ->get_info<sycl::ext::oneapi::experimental::info::device::architecture>();
+
+# define FILL_ARCH_INFO(architecture_code, is_arch_optimised) \
+    case sycl::ext::oneapi::experimental::architecture::architecture_code: \
+      name = #architecture_code; \
+      is_optimized = is_arch_optimised; \
+      break;
+
+  /* List of architectures that have been optimized by Intel and Blender developers.
+   *
+   * For example, the Intel Rocket Lake iGPU (rkl) is not supported and not optimized,
+   * while the Intel Arc Alchemist dGPU (dg2) has been optimized for.
+   *
+   * Devices can be changed from unoptimized to optimized manually, after DPC++ has
+   * been upgraded to support the architecture and CYCLES_ONEAPI_INTEL_BINARIES_ARCH
+   * in CMake includes the architecture.
+   */
+  switch (arch) {
+    FILL_ARCH_INFO(intel_gpu_bdw, false)
+    FILL_ARCH_INFO(intel_gpu_skl, false)
+    FILL_ARCH_INFO(intel_gpu_kbl, false)
+    FILL_ARCH_INFO(intel_gpu_cfl, false)
+    FILL_ARCH_INFO(intel_gpu_apl, false)
+    FILL_ARCH_INFO(intel_gpu_glk, false)
+    FILL_ARCH_INFO(intel_gpu_whl, false)
+    FILL_ARCH_INFO(intel_gpu_aml, false)
+    FILL_ARCH_INFO(intel_gpu_cml, false)
+    FILL_ARCH_INFO(intel_gpu_icllp, false)
+    FILL_ARCH_INFO(intel_gpu_ehl, false)
+    FILL_ARCH_INFO(intel_gpu_tgllp, false)
+    FILL_ARCH_INFO(intel_gpu_rkl, false)
+    FILL_ARCH_INFO(intel_gpu_adl_s, false)
+    FILL_ARCH_INFO(intel_gpu_adl_p, false)
+    FILL_ARCH_INFO(intel_gpu_adl_n, false)
+    FILL_ARCH_INFO(intel_gpu_dg1, false)
+    FILL_ARCH_INFO(intel_gpu_dg2_g10, true)
+    FILL_ARCH_INFO(intel_gpu_dg2_g11, true)
+    FILL_ARCH_INFO(intel_gpu_dg2_g12, true)
+    FILL_ARCH_INFO(intel_gpu_pvc, false)
+    FILL_ARCH_INFO(intel_gpu_pvc_vg, false)
+    /* intel_gpu_mtl_u == intel_gpu_mtl_s == intel_gpu_arl_u == intel_gpu_arl_s */
+    FILL_ARCH_INFO(intel_gpu_mtl_u, true)
+    FILL_ARCH_INFO(intel_gpu_mtl_h, true)
+    FILL_ARCH_INFO(intel_gpu_bmg_g21, true)
+    FILL_ARCH_INFO(intel_gpu_lnl_m, true)
+
+    default:
+      name = "unknown";
+      is_optimized = false;
+      break;
+  }
+}
+
 char *OneapiDevice::device_capabilities()
 {
   std::stringstream capabilities;
@@ -1470,6 +1528,15 @@ char *OneapiDevice::device_capabilities()
     capabilities << "\t\tsycl::info::platform::name\t\t\t"
                  << device.get_platform().get_info<sycl::info::platform::name>() << "\n";

+    string arch_name;
+    bool is_optimised_for_arch;
+    architecture_information(
+        reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
+    capabilities << "\t\tsycl::info::device::architecture\t\t\t";
+    capabilities << arch_name << "\n";
+    capabilities << "\t\tsycl::info::device::is_cycles_optimized\t\t\t";
+    capabilities << is_optimised_for_arch << "\n";
+
 # define WRITE_ATTR(attribute_name, attribute_variable) \
     capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
                  << "\n";
@@ -1587,10 +1654,22 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
     bool oidn_support = false;
 # endif
     std::string id = "ONEAPI_" + platform_name + "_" + name;
+
+    string arch_name;
+    bool is_optimised_for_arch;
+    architecture_information(
+        reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
+
     if (device.has(sycl::aspect::ext_intel_pci_address)) {
       id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
     }
-    (cb)(id.c_str(), name.c_str(), num, hwrt_support, oidn_support, user_ptr);
+    (cb)(id.c_str(),
+         name.c_str(),
+         num,
+         hwrt_support,
+         oidn_support,
+         is_optimised_for_arch,
+         user_ptr);
     num++;
   }
 }
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index e9c76113cef..5871c3282f2 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN
 class DeviceQueue;

 using OneAPIDeviceIteratorCallback =
-    void (*)(const char *, const char *, const int, bool, bool, void *);
+    void (*)(const char *, const char *, const int, bool, bool, bool, void *);

 class OneapiDevice : public GPUDevice {
  private:
@@ -117,6 +117,7 @@ class OneapiDevice : public GPUDevice {
   void *usm_aligned_alloc_host(const size_t memory_size, const size_t alignment);
   void usm_free(void *usm_ptr);

+  static void architecture_information(const SyclDevice *device, string &name, bool &is_optimized);
   static char *device_capabilities();
   static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index 56653efc0d7..743991c6876 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -25,6 +25,7 @@ enum DeviceKernel : int;
 # endif

 class SyclQueue;
+class SyclDevice;

 typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
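
The diff above widens the device tuple built in `available_devices_func` from 7 to 8 entries, with the new index 7 carrying `has_execution_optimization`; on the Python side, `get_devices_for_type` stores it on the preferences entry as `optimized`, and `_draw_devices` appends an "(Unoptimized Performance)" suffix to the displayed name. The sketch below is a minimal, standalone illustration of consuming that tuple outside Blender. The `example` values are made up, and indices 3 and 4 are assumptions inferred from the surrounding `DeviceInfo` fields rather than confirmed by this diff; only indices 0-2 and 5-7 appear in the hunks above.

```python
import unicodedata

# Assumed tuple layout after this change (indices 3-4 are guesses):
# (description, type, id, has_peer_memory, use_hardware_raytracing,
#  oidn_supported, optix_supported, has_execution_optimization)


def format_device_name(name):
    # Mirrors CyclesPreferences._format_device_name: replace ASCII vendor
    # markers with their Unicode equivalents.
    return name.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) \
               .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN')) \
               .replace('(R)', unicodedata.lookup('REGISTERED SIGN')) \
               .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))


def display_name(device_tuple):
    # Same rule as _draw_devices: flag devices whose architecture has not
    # been optimized for (index 7 is False).
    name = format_device_name(device_tuple[0])
    if not device_tuple[7]:
        name += " (Unoptimized Performance)"
    return name


# Made-up 8-element tuple in the layout produced by available_devices_func().
example = ('Intel(R) Arc(TM) A770 Graphics', 'ONEAPI',
           'ONEAPI_Intel_Level-Zero_0000:03:00.0',
           False, True, True, False, True)
print(display_name(example))
```

Inside Blender the suffix is additionally wrapped in `rpt_()` so it can be translated; the sketch omits that because `rpt_` only exists in the `bpy` runtime.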