From 69091c50284f5467d06eb8fe36bc4ed218ec4316 Mon Sep 17 00:00:00 2001
From: Nikita Sirgienko
Date: Tue, 3 Jun 2025 20:07:52 +0200
Subject: [PATCH] Cycles: Show device optimization status in preferences for
 oneAPI

With these changes, we can now mark devices that are expected to
perform as well as possible, as well as devices that were not
optimized for some reason, for example because a device was released
after the Blender release, making it impossible for developers to
optimize for it in already released, unchangeable code. This is
primarily relevant for LTS versions, which are supported for two
years and require clear communication about the optimization status
of new devices released during that time.

This is implemented for oneAPI devices. Other device types are
currently marked as optimized to match the old behavior, but may
implement the same mechanism in the future.

Pull Request: https://projects.blender.org/blender/blender/pulls/139751
---
 CMakeLists.txt                              |  2 +
 intern/cycles/blender/addon/properties.py   | 41 +++++++----
 intern/cycles/blender/python.cpp            |  3 +-
 intern/cycles/device/device.h               |  4 +
 intern/cycles/device/oneapi/device.cpp      |  3 +
 intern/cycles/device/oneapi/device_impl.cpp | 81 ++++++++++++++++++++-
 intern/cycles/device/oneapi/device_impl.h   |  3 +-
 intern/cycles/kernel/device/oneapi/kernel.h |  1 +
 8 files changed, 119 insertions(+), 19 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61b6bee66d0..e73bf5f526e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -740,6 +740,8 @@ This option is only for debugging purposes."
   # https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
   # The supported devices can be retrieved from `ocloc` output when running
   # `ocloc compile --help`.
+  # If you have completed optimization work and now want to enable AoT for new Intel devices,
+  # update the optimization status in OneapiDevice::architecture_information.
   set(CYCLES_ONEAPI_INTEL_BINARIES_ARCH dg2 mtl lnl bmg CACHE STRING "\
 oneAPI Intel GPU architectures to build binaries for"
   )
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 3552055a5ad..25ed6db3e72 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -269,7 +269,7 @@ def enum_openimagedenoise_denoiser(self, context):
 
 
 def enum_optix_denoiser(self, context):
-    if not context or bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX')):
+    if not context or bool(context.preferences.addons[__package__].preferences.get_device_list('OPTIX')):
         return [('OPTIX', "OptiX", n_(
             "Use the OptiX AI denoiser with GPU acceleration, only available on NVIDIA GPUs when configured in the system tab in the user preferences"), 2)]
     return []
@@ -1668,16 +1668,19 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                 # Update name in case it changed
                 entry.name = device[0]
 
-    # Gets all devices types for a compute device type.
-    def get_devices_for_type(self, compute_device_type):
+    # Gets all device types to display in the preferences for a compute device type.
+    # This includes the CPU device.
+    def get_devices_for_type(self, compute_device_type, device_list=None):
         # Layout of the device tuples: (Name, Type, Persistent ID)
-        device_list = self.get_device_list(compute_device_type)
+        if device_list is None:
+            device_list = self.get_device_list(compute_device_type)
 
         # Sort entries into lists
         devices = []
         cpu_devices = []
         for device in device_list:
             entry = self.find_existing_device_entry(device)
+            entry.optimized = device[7]
             if entry.type == compute_device_type:
                 devices.append(entry)
             elif entry.type == 'CPU':
@@ -1768,9 +1771,20 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
         return False
 
-    def _draw_devices(self, layout, device_type, devices):
+    @staticmethod
+    def _format_device_name(name):
+        import unicodedata
+        return name.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) \
+                   .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN')) \
+                   .replace('(R)', unicodedata.lookup('REGISTERED SIGN')) \
+                   .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
+
+    def _draw_devices(self, layout, device_type, device_list):
         box = layout.box()
 
+        # Get preference devices, including CPU.
+        devices = self.get_devices_for_type(device_type, device_list)
+
         found_device = False
         for device in devices:
             if device.type == device_type:
@@ -1839,15 +1853,10 @@ class CyclesPreferences(bpy.types.AddonPreferences):
             return
 
         for device in devices:
-            import unicodedata
-            box.prop(
-                device, "use", text=device.name
-                .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
-                .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN'))
-                .replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
-                .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')),
-                translate=False
-            )
+            name = self._format_device_name(device.name)
+            if not device.optimized:
+                name += rpt_(" (Unoptimized Performance)")
+            box.prop(device, "use", text=name, translate=False)
 
     def draw_impl(self, layout, context):
         row = layout.row()
@@ -1857,14 +1866,14 @@ def draw_impl(self, layout, context):
         if compute_device_type == 'NONE':
             return
         row = layout.row()
-        devices = self.get_devices_for_type(compute_device_type)
+        devices = self.get_device_list(compute_device_type)
         self._draw_devices(row, compute_device_type, devices)
 
         import _cycles
         has_peer_memory = False
         has_enabled_hardware_rt = False
         has_disabled_hardware_rt = False
-        for device in self.get_device_list(compute_device_type):
+        for device in devices:
             if not self.find_existing_device_entry(device).use:
                 continue
             if device[1] != compute_device_type:
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index dbc87ff3abf..d4a8ade302a 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -440,7 +440,7 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
   for (size_t i = 0; i < devices.size(); i++) {
     const DeviceInfo &device = devices[i];
     const string type_name = Device::string_from_type(device.type);
-    PyObject *device_tuple = PyTuple_New(7);
+    PyObject *device_tuple = PyTuple_New(8);
     PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
     PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
     PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
@@ -449,6 +449,7 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
     PyTuple_SET_ITEM(
         device_tuple, 5, PyBool_FromLong(device.denoisers & DENOISER_OPENIMAGEDENOISE));
     PyTuple_SET_ITEM(device_tuple, 6, PyBool_FromLong(device.denoisers & DENOISER_OPTIX));
+    PyTuple_SET_ITEM(device_tuple, 7, PyBool_FromLong(device.has_execution_optimization));
 
     PyTuple_SET_ITEM(ret, i, device_tuple);
   }
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 55d1ed0c3b5..bed8bd3d220 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -94,6 +94,10 @@ class DeviceInfo {
   bool has_gpu_queue = false;           /* Device supports GPU queue. */
   bool use_hardware_raytracing = false; /* Use hardware instructions to accelerate ray tracing. */
   bool use_metalrt_by_default = false;  /* Use MetalRT by default. */
+  /* Indicate that device execution has been optimized by Blender or vendor developers.
+   * For LTS versions, this helps communicate that newer versions may have better performance. */
+  bool has_execution_optimization = true;
+
   KernelOptimizationLevel kernel_optimization_level =
       KERNEL_OPTIMIZATION_LEVEL_FULL; /* Optimization level applied to path tracing
                                        * kernels (Metal only). */
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index 15c20a7b7e0..b4d5855bf17 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -99,6 +99,7 @@ static void device_iterator_cb(const char *id,
                                const int num,
                                bool hwrt_support,
                                bool oidn_support,
+                               bool has_execution_optimization,
                                void *user_ptr)
 {
   vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
@@ -140,6 +141,8 @@ static void device_iterator_cb(const char *id,
   (void)hwrt_support;
 # endif
 
+  info.has_execution_optimization = has_execution_optimization;
+
   devices->push_back(info);
 
   VLOG_INFO << "Added device \"" << info.description << "\" with id \"" << info.id << "\".";
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index f0614b85bc2..270dfa91025 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -1454,6 +1454,64 @@ std::vector<sycl::device> available_sycl_devices()
   return available_devices;
 }
 
+void OneapiDevice::architecture_information(const SyclDevice *device,
+                                            string &name,
+                                            bool &is_optimized)
+{
+  const sycl::ext::oneapi::experimental::architecture arch =
+      reinterpret_cast<const sycl::device *>(device)
+          ->get_info<sycl::ext::oneapi::experimental::info::device::architecture>();
+
+# define FILL_ARCH_INFO(architecture_code, is_arch_optimised) \
+    case sycl::ext::oneapi::experimental::architecture ::architecture_code: \
+      name = #architecture_code; \
+      is_optimized = is_arch_optimised; \
+      break;
+
+  /* List of architectures that have been optimized by Intel and Blender developers.
+   *
+   * For example, the Intel Rocket Lake iGPU (rkl) is supported but was not
+   * optimized for, while the Intel Arc Alchemist dGPU (dg2) was.
+   *
+   * Devices can be changed from unoptimized to optimized manually, after DPC++ has
+   * been upgraded to support the architecture and CYCLES_ONEAPI_INTEL_BINARIES_ARCH
+   * in CMake includes the architecture. */
+  switch (arch) {
+    FILL_ARCH_INFO(intel_gpu_bdw, false)
+    FILL_ARCH_INFO(intel_gpu_skl, false)
+    FILL_ARCH_INFO(intel_gpu_kbl, false)
+    FILL_ARCH_INFO(intel_gpu_cfl, false)
+    FILL_ARCH_INFO(intel_gpu_apl, false)
+    FILL_ARCH_INFO(intel_gpu_glk, false)
+    FILL_ARCH_INFO(intel_gpu_whl, false)
+    FILL_ARCH_INFO(intel_gpu_aml, false)
+    FILL_ARCH_INFO(intel_gpu_cml, false)
+    FILL_ARCH_INFO(intel_gpu_icllp, false)
+    FILL_ARCH_INFO(intel_gpu_ehl, false)
+    FILL_ARCH_INFO(intel_gpu_tgllp, false)
+    FILL_ARCH_INFO(intel_gpu_rkl, false)
+    FILL_ARCH_INFO(intel_gpu_adl_s, false)
+    FILL_ARCH_INFO(intel_gpu_adl_p, false)
+    FILL_ARCH_INFO(intel_gpu_adl_n, false)
+    FILL_ARCH_INFO(intel_gpu_dg1, false)
+    FILL_ARCH_INFO(intel_gpu_dg2_g10, true)
+    FILL_ARCH_INFO(intel_gpu_dg2_g11, true)
+    FILL_ARCH_INFO(intel_gpu_dg2_g12, true)
+    FILL_ARCH_INFO(intel_gpu_pvc, false)
+    FILL_ARCH_INFO(intel_gpu_pvc_vg, false)
+    /* intel_gpu_mtl_u == intel_gpu_mtl_s == intel_gpu_arl_u == intel_gpu_arl_s */
+    FILL_ARCH_INFO(intel_gpu_mtl_u, true)
+    FILL_ARCH_INFO(intel_gpu_mtl_h, true)
+    FILL_ARCH_INFO(intel_gpu_bmg_g21, true)
+    FILL_ARCH_INFO(intel_gpu_lnl_m, true)
+
+    default:
+      name = "unknown";
+      is_optimized = false;
+      break;
+  }
+}
+
 char *OneapiDevice::device_capabilities()
 {
   std::stringstream capabilities;
@@ -1470,6 +1528,15 @@ char *OneapiDevice::device_capabilities()
     capabilities << "\t\tsycl::info::platform::name\t\t\t"
                  << device.get_platform().get_info<sycl::info::platform::name>() << "\n";
 
+    string arch_name;
+    bool is_optimised_for_arch;
+    architecture_information(
+        reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
+    capabilities << "\t\tsycl::info::device::architecture\t\t\t";
+    capabilities << arch_name << "\n";
+    capabilities << "\t\tsycl::info::device::is_cycles_optimized\t\t\t";
+    capabilities << is_optimised_for_arch << "\n";
+
 # define WRITE_ATTR(attribute_name, attribute_variable) \
     capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
                  << "\n";
@@ -1587,10 +1654,22 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
     bool oidn_support = false;
 # endif
     std::string id = "ONEAPI_" + platform_name + "_" + name;
+
+    string arch_name;
+    bool is_optimised_for_arch;
+    architecture_information(
+        reinterpret_cast<const SyclDevice *>(&device), arch_name, is_optimised_for_arch);
+
     if (device.has(sycl::aspect::ext_intel_pci_address)) {
       id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
     }
-    (cb)(id.c_str(), name.c_str(), num, hwrt_support, oidn_support, user_ptr);
+    (cb)(id.c_str(),
+         name.c_str(),
+         num,
+         hwrt_support,
+         oidn_support,
+         is_optimised_for_arch,
+         user_ptr);
     num++;
   }
 }
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index e9c76113cef..5871c3282f2 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN
 class DeviceQueue;
 
 using OneAPIDeviceIteratorCallback =
-    void (*)(const char *, const char *, const int, bool, bool, void *);
+    void (*)(const char *, const char *, const int, bool, bool, bool, void *);
 
 class OneapiDevice : public GPUDevice {
  private:
@@ -117,6 +117,7 @@ class OneapiDevice : public GPUDevice {
   void *usm_aligned_alloc_host(const size_t memory_size, const size_t alignment);
   void usm_free(void *usm_ptr);
 
+  static void architecture_information(const SyclDevice *device, string &name, bool &is_optimized);
   static char *device_capabilities();
   static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr);
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index 56653efc0d7..743991c6876 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -25,6 +25,7 @@ enum DeviceKernel : int;
 # endif
 
 class SyclQueue;
+class SyclDevice;
 
 typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
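
Note for reviewers trying the change locally: the sketch below is illustrative only and is not part of the patch. It assumes only what available_devices_func() above defines (index 0 = description, 1 = type name, 2 = persistent id, 7 = the new has_execution_optimization flag) and uses _cycles.available_devices(), the same internal API the add-on's get_device_list() calls; it must be run from Blender's Python console.

    import _cycles

    # Print each oneAPI device together with its optimization status,
    # mirroring how properties.py reads device[7] into entry.optimized.
    for device in _cycles.available_devices('ONEAPI'):
        name, type_name, device_id = device[0], device[1], device[2]
        status = "optimized" if device[7] else "unoptimized performance"
        print(f"{name} ({type_name}, {device_id}): {status}")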