Cycles: Show device optimization status in preferences for oneAPI

With these changes, we can now mark devices that are expected to perform as
well as possible, as well as devices that were not optimized for some reason.

For example, a device may have been released after the Blender release,
making it impossible for developers to optimize for it in code that has
already shipped and cannot be changed. This is primarily relevant for the LTS
versions, which are supported for two years and need to clearly communicate
the optimization status of new devices released during that time.

This is implemented for oneAPI devices. Other device types are currently
marked as optimized to preserve the old behavior, but may implement the same
mechanism in the future.

Pull Request: https://projects.blender.org/blender/blender/pulls/139751
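
For illustration, a minimal standalone sketch (hypothetical helper, not the actual Cycles code) of the user-visible rule introduced here: devices that were not optimized for get an explicit suffix in their preferences label.

#include <iostream>
#include <string>

/* Hypothetical helper mirroring how the preferences build a device label from
 * the new per-device flag: unoptimized devices get an explicit suffix. */
static std::string device_label(const std::string &name, const bool has_execution_optimization)
{
  return has_execution_optimization ? name : name + " (Unoptimized Performance)";
}

int main()
{
  std::cout << device_label("Intel Arc A770 Graphics", true) << "\n";
  std::cout << device_label("Hypothetical future Intel GPU", false) << "\n";
}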
Nikita Sirgienko authored on 2025-06-03 20:07:52 +02:00; committed by Brecht Van Lommel.
Parent: 6805385cbe
Commit: 69091c5028
8 changed files with 119 additions and 19 deletions

View File

@@ -740,6 +740,8 @@ This option is only for debugging purposes."
# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
# The supported devices can be retrieved from `ocloc` output when running
# `ocloc compile --help`.
# If you have completed optimization work and now want to enable AoT for new Intel devices,
# update the optimization status in OneapiDevice::architecture_information.
set(CYCLES_ONEAPI_INTEL_BINARIES_ARCH dg2 mtl lnl bmg CACHE STRING "\
oneAPI Intel GPU architectures to build binaries for"
)

View File

@@ -269,7 +269,7 @@ def enum_openimagedenoise_denoiser(self, context):
def enum_optix_denoiser(self, context):
if not context or bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX')):
if not context or bool(context.preferences.addons[__package__].preferences.get_device_list('OPTIX')):
return [('OPTIX', "OptiX", n_(
"Use the OptiX AI denoiser with GPU acceleration, only available on NVIDIA GPUs when configured in the system tab in the user preferences"), 2)]
return []
@@ -1668,16 +1668,19 @@ class CyclesPreferences(bpy.types.AddonPreferences):
# Update name in case it changed
entry.name = device[0]
# Gets all devices types for a compute device type.
def get_devices_for_type(self, compute_device_type):
# Gets all device types to display in the preferences for a compute device type.
# This includes the CPU device.
def get_devices_for_type(self, compute_device_type, device_list=None):
# Layout of the device tuples: (Name, Type, Persistent ID)
device_list = self.get_device_list(compute_device_type)
if device_list is None:
device_list = self.get_device_list(compute_device_type)
# Sort entries into lists
devices = []
cpu_devices = []
for device in device_list:
entry = self.find_existing_device_entry(device)
entry.optimized = device[7]
if entry.type == compute_device_type:
devices.append(entry)
elif entry.type == 'CPU':
@@ -1768,9 +1771,20 @@ class CyclesPreferences(bpy.types.AddonPreferences):
return False
def _draw_devices(self, layout, device_type, devices):
@staticmethod
def _format_device_name(name):
import unicodedata
return name.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) \
.replace('(tm)', unicodedata.lookup('TRADE MARK SIGN')) \
.replace('(R)', unicodedata.lookup('REGISTERED SIGN')) \
.replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
def _draw_devices(self, layout, device_type, device_list):
box = layout.box()
# Get preference devices, including CPU.
devices = self.get_devices_for_type(device_type, device_list)
found_device = False
for device in devices:
if device.type == device_type:
@@ -1839,15 +1853,10 @@ class CyclesPreferences(bpy.types.AddonPreferences):
return
for device in devices:
import unicodedata
box.prop(
device, "use", text=device.name
.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
.replace('(tm)', unicodedata.lookup('TRADE MARK SIGN'))
.replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
.replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')),
translate=False
)
name = self._format_device_name(device.name)
if not device.optimized:
name += rpt_(" (Unoptimized Performance)")
box.prop(device, "use", text=name, translate=False)
def draw_impl(self, layout, context):
row = layout.row()
@@ -1857,14 +1866,14 @@ class CyclesPreferences(bpy.types.AddonPreferences):
if compute_device_type == 'NONE':
return
row = layout.row()
devices = self.get_devices_for_type(compute_device_type)
devices = self.get_device_list(compute_device_type)
self._draw_devices(row, compute_device_type, devices)
import _cycles
has_peer_memory = False
has_enabled_hardware_rt = False
has_disabled_hardware_rt = False
for device in self.get_device_list(compute_device_type):
for device in devices:
if not self.find_existing_device_entry(device).use:
continue
if device[1] != compute_device_type:

View File

@@ -440,7 +440,7 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
for (size_t i = 0; i < devices.size(); i++) {
const DeviceInfo &device = devices[i];
const string type_name = Device::string_from_type(device.type);
PyObject *device_tuple = PyTuple_New(7);
PyObject *device_tuple = PyTuple_New(8);
PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
@@ -449,6 +449,7 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
PyTuple_SET_ITEM(
device_tuple, 5, PyBool_FromLong(device.denoisers & DENOISER_OPENIMAGEDENOISE));
PyTuple_SET_ITEM(device_tuple, 6, PyBool_FromLong(device.denoisers & DENOISER_OPTIX));
PyTuple_SET_ITEM(device_tuple, 7, PyBool_FromLong(device.has_execution_optimization));
PyTuple_SET_ITEM(ret, i, device_tuple);
}

View File

@@ -94,6 +94,10 @@ class DeviceInfo {
bool has_gpu_queue = false; /* Device supports GPU queue. */
bool use_hardware_raytracing = false; /* Use hardware instructions to accelerate ray tracing. */
bool use_metalrt_by_default = false; /* Use MetalRT by default. */
/* Indicate that device execution has been optimized by Blender or vendor developers.
* For LTS versions, this helps communicate that newer versions may have better performance. */
bool has_execution_optimization = true;
KernelOptimizationLevel kernel_optimization_level =
KERNEL_OPTIMIZATION_LEVEL_FULL; /* Optimization level applied to path tracing
* kernels (Metal only). */
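
As a sketch of the default described in the comment above (hypothetical stand-in types, not the real Cycles classes): the flag defaults to true, so backends that never touch it keep reporting every device as optimized, while a backend like oneAPI fills it per device.

#include <string>

/* Simplified stand-in for DeviceInfo: the flag defaults to true, so device
 * types that never touch it behave exactly as before this change. */
struct ExampleDeviceInfo {
  std::string description;
  bool has_execution_optimization = true;
};

/* Hypothetical enumeration in a backend that does not implement the check:
 * the default is kept, i.e. the device is reported as optimized. */
static ExampleDeviceInfo enumerate_legacy_device(const std::string &name)
{
  ExampleDeviceInfo info;
  info.description = name;
  return info;
}

/* Hypothetical oneAPI-style enumeration: the flag is filled from a
 * per-architecture lookup such as OneapiDevice::architecture_information(). */
static ExampleDeviceInfo enumerate_oneapi_device(const std::string &name, const bool arch_optimized)
{
  ExampleDeviceInfo info;
  info.description = name;
  info.has_execution_optimization = arch_optimized;
  return info;
}

int main()
{
  const ExampleDeviceInfo legacy = enumerate_legacy_device("Some legacy-path GPU");
  const ExampleDeviceInfo intel = enumerate_oneapi_device("Some new Intel GPU", false);
  return (legacy.has_execution_optimization && !intel.has_execution_optimization) ? 0 : 1;
}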

View File

@@ -99,6 +99,7 @@ static void device_iterator_cb(const char *id,
const int num,
bool hwrt_support,
bool oidn_support,
bool has_execution_optimization,
void *user_ptr)
{
vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
@@ -140,6 +141,8 @@ static void device_iterator_cb(const char *id,
(void)hwrt_support;
# endif
info.has_execution_optimization = has_execution_optimization;
devices->push_back(info);
VLOG_INFO << "Added device \"" << info.description << "\" with id \"" << info.id << "\".";

View File

@@ -1454,6 +1454,64 @@ std::vector<sycl::device> available_sycl_devices()
return available_devices;
}
void OneapiDevice::architecture_information(const SyclDevice *device,
string &name,
bool &is_optimized)
{
const sycl::ext::oneapi::experimental::architecture arch =
reinterpret_cast<const sycl::device *>(device)
->get_info<sycl::ext::oneapi::experimental::info::device::architecture>();
# define FILL_ARCH_INFO(architecture_code, is_arch_optimised) \
case sycl::ext::oneapi::experimental::architecture ::architecture_code: \
name = #architecture_code; \
is_optimized = is_arch_optimised; \
break;
/* List of architectures that have been optimized by Intel and Blender developers.
*
* For example, Intel Rocket Lake iGPU (rkl) is not supported and not optimized,
* while Intel Arc Alchemist dGPU (dg2) was optimized for.
*
* Devices can be changed from unoptimized to optimized manually, after DPC++ has
* been upgraded to support the architecture and CYCLES_ONEAPI_INTEL_BINARIES_ARCH
* in CMake includes the architecture. */
switch (arch) {
FILL_ARCH_INFO(intel_gpu_bdw, false)
FILL_ARCH_INFO(intel_gpu_skl, false)
FILL_ARCH_INFO(intel_gpu_kbl, false)
FILL_ARCH_INFO(intel_gpu_cfl, false)
FILL_ARCH_INFO(intel_gpu_apl, false)
FILL_ARCH_INFO(intel_gpu_glk, false)
FILL_ARCH_INFO(intel_gpu_whl, false)
FILL_ARCH_INFO(intel_gpu_aml, false)
FILL_ARCH_INFO(intel_gpu_cml, false)
FILL_ARCH_INFO(intel_gpu_icllp, false)
FILL_ARCH_INFO(intel_gpu_ehl, false)
FILL_ARCH_INFO(intel_gpu_tgllp, false)
FILL_ARCH_INFO(intel_gpu_rkl, false)
FILL_ARCH_INFO(intel_gpu_adl_s, false)
FILL_ARCH_INFO(intel_gpu_adl_p, false)
FILL_ARCH_INFO(intel_gpu_adl_n, false)
FILL_ARCH_INFO(intel_gpu_dg1, false)
FILL_ARCH_INFO(intel_gpu_dg2_g10, true)
FILL_ARCH_INFO(intel_gpu_dg2_g11, true)
FILL_ARCH_INFO(intel_gpu_dg2_g12, true)
FILL_ARCH_INFO(intel_gpu_pvc, false)
FILL_ARCH_INFO(intel_gpu_pvc_vg, false)
/* intel_gpu_mtl_u == intel_gpu_mtl_s == intel_gpu_arl_u == intel_gpu_arl_s */
FILL_ARCH_INFO(intel_gpu_mtl_u, true)
FILL_ARCH_INFO(intel_gpu_mtl_h, true)
FILL_ARCH_INFO(intel_gpu_bmg_g21, true)
FILL_ARCH_INFO(intel_gpu_lnl_m, true)
default:
name = "unknown";
is_optimized = false;
break;
}
}
char *OneapiDevice::device_capabilities()
{
std::stringstream capabilities;
@@ -1470,6 +1528,15 @@ char *OneapiDevice::device_capabilities()
capabilities << "\t\tsycl::info::platform::name\t\t\t"
<< device.get_platform().get_info<sycl::info::platform::name>() << "\n";
string arch_name;
bool is_optimised_for_arch;
architecture_information(
reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
capabilities << "\t\tsycl::info::device::architecture\t\t\t";
capabilities << arch_name << "\n";
capabilities << "\t\tsycl::info::device::is_cycles_optimized\t\t\t";
capabilities << is_optimised_for_arch << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
<< "\n";
@@ -1587,10 +1654,22 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
bool oidn_support = false;
# endif
std::string id = "ONEAPI_" + platform_name + "_" + name;
string arch_name;
bool is_optimised_for_arch;
architecture_information(
reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, hwrt_support, oidn_support, user_ptr);
(cb)(id.c_str(),
name.c_str(),
num,
hwrt_support,
oidn_support,
is_optimised_for_arch,
user_ptr);
num++;
}
}
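
To tie this file's changes together, a small self-contained sketch (hypothetical names, not the actual implementation): a per-architecture lookup decides whether a device was optimized for, and the result is forwarded through the widened iterator callback, ending up in DeviceInfo::has_execution_optimization. Marking a newly supported architecture as optimized means flipping its entry in the lookup once AoT binaries are built for it (see CYCLES_ONEAPI_INTEL_BINARIES_ARCH above).

#include <cstdio>
#include <string>

/* Same shape as the widened OneAPIDeviceIteratorCallback: one extra bool for
 * the optimization status. */
using ExampleIteratorCallback = void (*)(const char *id,
                                         const char *name,
                                         int num,
                                         bool hwrt_support,
                                         bool oidn_support,
                                         bool has_execution_optimization,
                                         void *user_ptr);

/* Hypothetical lookup standing in for OneapiDevice::architecture_information():
 * an architecture's entry is switched to true once it has been optimized for. */
static bool architecture_is_optimized(const std::string &arch_name)
{
  return arch_name == "intel_gpu_dg2_g10" || arch_name == "intel_gpu_bmg_g21";
}

static void print_device(const char *id,
                         const char *name,
                         int /*num*/,
                         bool /*hwrt_support*/,
                         bool /*oidn_support*/,
                         bool has_execution_optimization,
                         void * /*user_ptr*/)
{
  std::printf("%s (%s): %s\n", name, id,
              has_execution_optimization ? "optimized" : "unoptimized");
}

int main()
{
  const ExampleIteratorCallback cb = print_device;
  cb("ONEAPI_EXAMPLE_ID", "Example Intel GPU", 0, true, true,
     architecture_is_optimized("intel_gpu_dg2_g10"), nullptr);
}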

View File

@@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
using OneAPIDeviceIteratorCallback =
void (*)(const char *, const char *, const int, bool, bool, void *);
void (*)(const char *, const char *, const int, bool, bool, bool, void *);
class OneapiDevice : public GPUDevice {
private:
@@ -117,6 +117,7 @@ class OneapiDevice : public GPUDevice {
void *usm_aligned_alloc_host(const size_t memory_size, const size_t alignment);
void usm_free(void *usm_ptr);
static void architecture_information(const SyclDevice *device, string &name, bool &is_optimized);
static char *device_capabilities();
static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);

View File

@@ -25,6 +25,7 @@ enum DeviceKernel : int;
# endif
class SyclQueue;
class SyclDevice;
typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);