Compositor: Support GPU OIDN denoising

This patch supports GPU OIDN denoising in the compositor. A new compositor performance option was added to allow choosing between CPU, GPU, and Auto device selection. Auto will use whatever the compositor is using for execution. The change is twofold. First, the denoising code was adapted to use buffers as opposed to passing pointers to filters directly, which is needed to support GPU devices. Second, device creation is now a bit more involved: it tries to choose the device that is being used by the compositor for execution. Matching GPU devices is done by choosing the OIDN device that matches the UUID or LUID of the active GPU platform. We need both UUID and LUID because not all platforms support both. UUID is supported on all platforms except macOS Metal, while LUID is only supported on Windows and macOS Metal. If there is no active GPU device or matching is unsuccessful, we let OIDN choose the best device, which is typically the fastest. To support this case, UUID and LUID identifiers were added to the GPUPlatformGlobal and are initialized by the GPU backend if supported. OpenGL now needs the GL_EXT_memory_object and GL_EXT_memory_object_win32 extensions to support this use case, but it should still function without them. Pull Request: https://projects.blender.org/blender/blender/pulls/136660
2025-04-04 11:17:08 +02:00
parent f17148458d
commit 56b0b709ea
16 changed files with 342 additions and 23 deletions
--- a/scripts/startup/bl_ui/properties_render.py
+++ b/scripts/startup/bl_ui/properties_render.py
@@ -777,6 +777,8 @@ class CompositorDenoisePerformanceButtonsPanel:
        layout.use_property_decorate = False

        col = layout.column()
+        row = col.row()
+        row.prop(rd, "compositor_denoise_device", text="Denoising Device", expand=True)
        col.prop(rd, "compositor_denoise_preview_quality", text="Preview Quality")
        col.prop(rd, "compositor_denoise_final_quality", text="Final Quality")

--- a/source/blender/compositor/CMakeLists.txt
+++ b/source/blender/compositor/CMakeLists.txt
@@ -142,9 +142,11 @@ set(SRC
  derived_resources/COM_denoised_auxiliary_pass.hh

  utilities/intern/gpu_material.cc
+  utilities/intern/oidn.cc

  utilities/COM_utilities_diagonals.hh
  utilities/COM_utilities_gpu_material.hh
+  utilities/COM_utilities_oidn.hh
 )

 set(LIB
--- a/source/blender/compositor/derived_resources/intern/denoised_auxiliary_pass.cc
+++ b/source/blender/compositor/derived_resources/intern/denoised_auxiliary_pass.cc
@@ -9,6 +9,7 @@

 #  include "BLI_assert.h"
 #  include "BLI_hash.hh"
+#  include "BLI_span.hh"

 #  include "MEM_guardedalloc.h"

@@ -18,6 +19,7 @@
 #  include "COM_context.hh"
 #  include "COM_denoised_auxiliary_pass.hh"
 #  include "COM_result.hh"
+#  include "COM_utilities_oidn.hh"

 #  include <OpenImageDenoise/oidn.hpp>

@@ -89,29 +91,31 @@ DenoisedAuxiliaryPass::DenoisedAuxiliaryPass(Context &context,

  /* Float3 results might be stored in 4-component textures due to hardware limitations, so we
   * need to use the pixel stride of the texture. */
-  const int pixel_stride = sizeof(float) *
-                           (context.use_gpu() ?
-                                GPU_texture_component_len(GPU_texture_format(pass)) :
-                                pass.channels_count());
+  const int channels_count = context.use_gpu() ?
+                                 GPU_texture_component_len(GPU_texture_format(pass)) :
+                                 pass.channels_count();
+  const int pixel_stride = sizeof(float) * channels_count;

-  oidn::DeviceRef device = oidn::newDevice(oidn::DeviceType::CPU);
+  oidn::DeviceRef device = create_oidn_device(context);
  device.commit();

+  const int64_t buffer_size = int64_t(width) * height * channels_count;
+  const MutableSpan<float> buffer_span = MutableSpan<float>(this->denoised_buffer, buffer_size);
+  oidn::BufferRef buffer = create_oidn_buffer(device, buffer_span);
+
  /* Denoise the pass in place, so set it to both the input and output. */
  oidn::FilterRef filter = device.newFilter("RT");
-  filter.setImage(get_pass_name(type),
-                  this->denoised_buffer,
-                  oidn::Format::Float3,
-                  width,
-                  height,
-                  0,
-                  pixel_stride);
-  filter.setImage(
-      "output", this->denoised_buffer, oidn::Format::Float3, width, height, 0, pixel_stride);
+  const char *pass_name = get_pass_name(type);
+  filter.setImage(pass_name, buffer, oidn::Format::Float3, width, height, 0, pixel_stride);
+  filter.setImage("output", buffer, oidn::Format::Float3, width, height, 0, pixel_stride);
  filter.set("quality", quality);
  filter.setProgressMonitorFunction(oidn_progress_monitor_function, &context);
  filter.commit();
  filter.execute();
+
+  if (buffer.getStorage() != oidn::Storage::Host) {
+    buffer.read(0, buffer_size * sizeof(float), this->denoised_buffer);
+  }
 }

 DenoisedAuxiliaryPass::~DenoisedAuxiliaryPass()
--- a/source/blender/compositor/utilities/COM_utilities_oidn.hh
+++ b/source/blender/compositor/utilities/COM_utilities_oidn.hh
@@ -0,0 +1,32 @@
+/* SPDX-FileCopyrightText: 2025 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#pragma once
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+#  include "BLI_span.hh"
+
+#  include "COM_context.hh"
+
+#  include <OpenImageDenoise/oidn.hpp>
+
+namespace blender::compositor {
+
+/* Create an appropriate device based on the device preferences in the given context. Special
+ * attention is given to GPU devices, as multiple GPUs could exist, so the same GPU device used in
+ * the active GPU context is chosen. If no GPU context is active, OIDN chooses the best device,
+ * which is typically the fastest in the system. Such device selection makes execution more
+ * predictable and allows interoperability across APIs.
+ *
+ * The returned device is not committed, so callers are expected to set any device parameters
+ * they need and then commit the device before using it. */
+oidn::DeviceRef create_oidn_device(const Context &context);
+
+/* Creates a buffer on the given device that represents the given image. If the device can access
+ * host-side data, the returned buffer is a simple wrapper around the data, otherwise, the data is
+ * copied to a device-only buffer. It is thus expected that the given image data will outlive the
+ * returned buffer.
+ *
+ * When the data was copied to a device-only buffer, anything written to the buffer by a filter
+ * is not visible in the given image, so callers that use the buffer as an output need to read it
+ * back to the host if the storage of the buffer is not host storage. */
+oidn::BufferRef create_oidn_buffer(const oidn::DeviceRef &device, const MutableSpan<float> image);
+
+}  // namespace blender::compositor
+
+#endif
--- a/source/blender/compositor/utilities/intern/oidn.cc
+++ b/source/blender/compositor/utilities/intern/oidn.cc
@@ -0,0 +1,111 @@
+/* SPDX-FileCopyrightText: 2025 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+#  include <cstdint>
+
+#  include "BLI_array.hh"
+#  include "BLI_assert.h"
+#  include "BLI_span.hh"
+
+#  include "GPU_platform.hh"
+
+#  include "COM_context.hh"
+#  include "COM_utilities_oidn.hh"
+
+#  include <OpenImageDenoise/oidn.hpp>
+
+namespace blender::compositor {
+
+/* Create an OIDN device that matches the GPU used by the active GPU context of the compositor.
+ * Matching is attempted using the LUID of the platform first, then using its UUID. If the
+ * compositor is not executing on the GPU, or if no physical device matches, OIDN is left to
+ * choose its default device. This is a file-local helper, so it has internal linkage. */
+static oidn::DeviceRef create_oidn_gpu_device(const Context &context)
+{
+  /* The compositor uses CPU execution and does not have an active GPU context or device, so let
+   * OIDN select the best device, which is typically the fastest. */
+  if (!context.use_gpu()) {
+    return oidn::newDevice(oidn::DeviceType::Default);
+  }
+
+  /* Try to select the device that is used by the currently active GPU context. First, try to
+   * select the device based on the device LUID. A LUID can be shared by multiple linked devices,
+   * so the node mask needs to match as well. */
+  const Span<uint8_t> platform_luid = GPU_platform_luid();
+  const uint32_t platform_luid_node_mask = GPU_platform_luid_node_mask();
+  const int devices_count = oidn::getNumPhysicalDevices();
+  for (int i = 0; i < devices_count; i++) {
+    oidn::PhysicalDeviceRef physical_device(i);
+    if (!physical_device.get<bool>("luidSupported")) {
+      continue;
+    }
+
+    const oidn::LUID luid = physical_device.get<oidn::LUID>("luid");
+    const uint32_t luid_node_mask = physical_device.get<uint32_t>("nodeMask");
+    if (platform_luid == Span<uint8_t>(luid.bytes, sizeof(luid.bytes)) &&
+        platform_luid_node_mask == luid_node_mask)
+    {
+      return physical_device.newDevice();
+    }
+  }
+
+  /* If LUID matching was unsuccessful, try to match based on UUID. We rely on multiple selection
+   * methods because not all platforms support both UUID and LUID, but all platforms support either
+   * one of them. UUID supports all except MacOS Metal, while LUID only supports Windows and MacOS
+   * Metal. Note that we prefer LUID as a first match because UUID is unreliable in practice as
+   * some implementations report the same UUID for different devices in the same machine. */
+  const Span<uint8_t> platform_uuid = GPU_platform_uuid();
+  for (int i = 0; i < devices_count; i++) {
+    oidn::PhysicalDeviceRef physical_device(i);
+    if (!physical_device.get<bool>("uuidSupported")) {
+      continue;
+    }
+
+    const oidn::UUID uuid = physical_device.get<oidn::UUID>("uuid");
+    if (platform_uuid == Span<uint8_t>(uuid.bytes, sizeof(uuid.bytes))) {
+      return physical_device.newDevice();
+    }
+  }
+
+  /* No physical device matched the active GPU, so let OIDN select the best device. */
+  return oidn::newDevice(oidn::DeviceType::Default);
+}
+
+oidn::DeviceRef create_oidn_device(const Context &context)
+{
+  /* The preference is stored as a plain integer in the render data, so convert it to its enum. */
+  const int stored_preference = context.get_render_data().compositor_denoise_device;
+  const eCompositorDenoiseDevice device_preference = eCompositorDenoiseDevice(stored_preference);
+
+  switch (device_preference) {
+    case SCE_COMPOSITOR_DENOISE_DEVICE_AUTO:
+      /* Follow the compositor, match its GPU if it executes on the GPU, use the CPU otherwise. */
+      if (context.use_gpu()) {
+        return create_oidn_gpu_device(context);
+      }
+      return oidn::newDevice(oidn::DeviceType::CPU);
+    case SCE_COMPOSITOR_DENOISE_DEVICE_GPU:
+      return create_oidn_gpu_device(context);
+    case SCE_COMPOSITOR_DENOISE_DEVICE_CPU:
+      return oidn::newDevice(oidn::DeviceType::CPU);
+  }
+
+  /* Every enumerator returns above, so getting here means the stored value is not a valid one. */
+  BLI_assert_unreachable();
+  return oidn::newDevice(oidn::DeviceType::Default);
+}
+
+oidn::BufferRef create_oidn_buffer(const oidn::DeviceRef &device, const MutableSpan<float> image)
+{
+  const auto image_byte_size = image.size_in_bytes();
+
+  /* The device can not access host-side data, so create a device-only buffer and copy the image
+   * data to it. */
+  if (!device.get<bool>("systemMemorySupported")) {
+    oidn::BufferRef device_buffer = device.newBuffer(image_byte_size, oidn::Storage::Device);
+    device_buffer.write(0, image_byte_size, image.data());
+    return device_buffer;
+  }
+
+  /* Otherwise, a shared buffer that merely wraps the host-side image data is sufficient. */
+  return device.newBuffer(image.data(), image_byte_size);
+}
+
+}  // namespace blender::compositor
+
+#endif
--- a/source/blender/gpu/GPU_platform.hh
+++ b/source/blender/gpu/GPU_platform.hh
@@ -8,6 +8,8 @@

 #pragma once

+#include <cstdint>
+#include <optional>
 #include <string>

 #include "BLI_span.hh"
@@ -90,3 +92,11 @@ const char *GPU_platform_support_level_key();
 const char *GPU_platform_gpu_name();
 GPUArchitectureType GPU_platform_architecture();
 blender::Span<GPUDevice> GPU_platform_devices_list();
+
+/* The UUID of the device. Can be an empty array, since it is not supported on all platforms. */
+blender::Span<uint8_t> GPU_platform_uuid();
+/* The LUID of the device. Can be an empty array, since it is not supported on all platforms. */
+blender::Span<uint8_t> GPU_platform_luid();
+/* A bit field with the nth bit active identifying the nth device with the same LUID. Only matters
+ * if LUID is defined. */
+uint32_t GPU_platform_luid_node_mask();
--- a/source/blender/gpu/intern/gpu_platform.cc
+++ b/source/blender/gpu/intern/gpu_platform.cc
@@ -9,6 +9,8 @@
 * with checks for drivers and GPU support.
 */

+#include <cstdint>
+
 #include "MEM_guardedalloc.h"

 #include "BLI_dynstr.h"
@@ -104,6 +106,9 @@ void GPUPlatformGlobal::clear()
  MEM_SAFE_FREE(support_key);
  MEM_SAFE_FREE(gpu_name);
  devices.clear_and_shrink();
+  device_uuid.reinitialize(0);
+  device_luid.reinitialize(0);
+  device_luid_node_mask = 0;
  initialized = false;
 }

@@ -179,4 +184,19 @@ blender::Span<GPUDevice> GPU_platform_devices_list()
  return GPG.devices.as_span();
 }

+/* Returns the UUID stored in the global platform state. The span is empty if no UUID was set. */
+blender::Span<uint8_t> GPU_platform_uuid()
+{
+  return GPG.device_uuid.as_span();
+}
+
+/* Returns the LUID stored in the global platform state. The span is empty if no LUID was set. */
+blender::Span<uint8_t> GPU_platform_luid()
+{
+  return GPG.device_luid.as_span();
+}
+
+/* Returns the LUID node mask stored in the global platform state. */
+uint32_t GPU_platform_luid_node_mask()
+{
+  return GPG.device_luid_node_mask;
+}
+
 /** \} */
--- a/source/blender/gpu/intern/gpu_platform_private.hh
+++ b/source/blender/gpu/intern/gpu_platform_private.hh
@@ -8,6 +8,9 @@

 #pragma once

+#include <cstdint>
+
+#include "BLI_array.hh"
 #include "BLI_vector.hh"

 #include "GPU_platform.hh"
@@ -30,6 +33,14 @@ class GPUPlatformGlobal {
  GPUArchitectureType architecture_type = GPU_ARCHITECTURE_IMR;
  Vector<GPUDevice> devices;

+  /* The UUID of the device. Can be an empty array, since it is not supported on all platforms. */
+  Array<uint8_t, 16> device_uuid;
+  /* The LUID of the device. Can be an empty array, since it is not supported on all platforms. */
+  Array<uint8_t, 8> device_luid;
+  /* A bit field with the nth bit active identifying the nth device with the same LUID. Only
+   * matters if device_luid is defined. Defaults to zero so that the mask has a well defined value
+   * even if it is queried before a backend initializes it. */
+  uint32_t device_luid_node_mask = 0;
+
  void init(eGPUDeviceType gpu_device,
            eGPUOSType os_type,
            eGPUDriverType driver_type,
--- a/source/blender/gpu/metal/mtl_backend.mm
+++ b/source/blender/gpu/metal/mtl_backend.mm
@@ -6,6 +6,8 @@
 * \ingroup gpu
 */

+#include <cstring>
+
 #include "BKE_global.hh"

 #include "gpu_backend.hh"
@@ -243,6 +245,17 @@ void MTLBackend::platform_init(MTLContext *ctx)
           renderer,
           version,
           architecture_type);
+
+  /* UUID is not supported on Metal. */
+  GPG.device_uuid.reinitialize(0);
+
+  /* LUID is registryID on Metal, or at least this is what libraries like OIDN expect. The raw
+   * bytes of the 64-bit identifier are stored as is. */
+  const uint64_t luid = mtl_device.registryID;
+  GPG.device_luid.reinitialize(sizeof(luid));
+  std::memcpy(GPG.device_luid.data(), &luid, sizeof(luid));
+
+  /* Metal only has one device per LUID, so only the first bit will always be active. */
+  GPG.device_luid_node_mask = 1;
 }

 void MTLBackend::platform_exit()
--- a/source/blender/gpu/opengl/gl_backend.cc
+++ b/source/blender/gpu/opengl/gl_backend.cc
@@ -6,12 +6,15 @@
 * \ingroup gpu
 */

+#include <cstdint>
 #include <string>

 #include "BKE_global.hh"
 #if defined(WIN32)
 #  include "BLI_winstuff.h"
 #endif
+#include "BLI_array.hh"
+#include "BLI_span.hh"
 #include "BLI_string_ref.hh"
 #include "BLI_subprocess.hh"
 #include "BLI_threads.h"
@@ -230,6 +233,33 @@ void GLBackend::platform_init()
           renderer,
           version,
           GPU_ARCHITECTURE_IMR);
+
+  GPG.device_uuid.reinitialize(0);
+  GPG.device_luid.reinitialize(0);
+  GPG.device_luid_node_mask = 0;
+
+  if (epoxy_has_gl_extension("GL_EXT_memory_object")) {
+    GLint number_of_devices = 0;
+    glGetIntegerv(GL_NUM_DEVICE_UUIDS_EXT, &number_of_devices);
+    /* Multiple devices could be used by the context if certain extensions like multi-cast is used.
+     * But this is not used by Blender, so this should always be 1. */
+    BLI_assert(number_of_devices == 1);
+
+    GLubyte device_uuid[GL_UUID_SIZE_EXT] = {0};
+    glGetUnsignedBytei_vEXT(GL_DEVICE_UUID_EXT, 0, device_uuid);
+    GPG.device_uuid = Array<uint8_t, 16>(Span<uint8_t>(device_uuid, GL_UUID_SIZE_EXT));
+
+    /* LUID is only supported on Windows. */
+    if (epoxy_has_gl_extension("GL_EXT_memory_object_win32") && (os & GPU_OS_WIN)) {
+      GLubyte device_luid[GL_LUID_SIZE_EXT] = {0};
+      glGetUnsignedBytevEXT(GL_DEVICE_LUID_EXT, device_luid);
+      GPG.device_luid = Array<uint8_t, 8>(Span<uint8_t>(device_luid, GL_LUID_SIZE_EXT));
+
+      GLint node_mask = 0;
+      glGetIntegerv(GL_DEVICE_NODE_MASK_EXT, &node_mask);
+      GPG.device_luid_node_mask = uint32_t(node_mask);
+    }
+  }
 }

 void GLBackend::platform_exit()
--- a/source/blender/gpu/vulkan/vk_backend.cc
+++ b/source/blender/gpu/vulkan/vk_backend.cc
@@ -331,6 +331,19 @@ void VKBackend::platform_init(const VKDevice &device)
           GPU_ARCHITECTURE_IMR);
  GPG.devices = devices;

+  /* Store the identifiers of the physical device, such that other libraries can select the same
+   * device that is used by the GPU backend. */
+  const VkPhysicalDeviceIDProperties &id_properties = device.physical_device_id_properties_get();
+
+  GPG.device_uuid = Array<uint8_t, 16>(Span<uint8_t>(id_properties.deviceUUID, VK_UUID_SIZE));
+
+  /* The LUID and the node mask are only meaningful if they are flagged as valid. Note that the
+   * LUID has to be read from deviceLUID, reading it from deviceUUID would store the first bytes
+   * of the UUID instead, which would never match the LUID reported by other libraries. */
+  if (id_properties.deviceLUIDValid) {
+    GPG.device_luid = Array<uint8_t, 8>(Span<uint8_t>(id_properties.deviceLUID, VK_LUID_SIZE));
+    GPG.device_luid_node_mask = id_properties.deviceNodeMask;
+  }
+  else {
+    GPG.device_luid.reinitialize(0);
+    GPG.device_luid_node_mask = 0;
+  }
+
+
  CLOG_INFO(&LOG,
            0,
            "Using vendor [%s] device [%s] driver version [%s].",
--- a/source/blender/gpu/vulkan/vk_device.cc
+++ b/source/blender/gpu/vulkan/vk_device.cc
@@ -149,7 +149,9 @@ void VKDevice::init_physical_device_properties()
  vk_physical_device_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
  vk_physical_device_driver_properties_.sType =
      VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES;
+  vk_physical_device_id_properties_.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
  vk_physical_device_properties.pNext = &vk_physical_device_driver_properties_;
+  vk_physical_device_driver_properties_.pNext = &vk_physical_device_id_properties_;

  vkGetPhysicalDeviceProperties2(vk_physical_device_, &vk_physical_device_properties);
  vk_physical_device_properties_ = vk_physical_device_properties.properties;
--- a/source/blender/gpu/vulkan/vk_device.hh
+++ b/source/blender/gpu/vulkan/vk_device.hh
@@ -193,6 +193,7 @@ class VKDevice : public NonCopyable {
  /** Limits of the device linked to this context. */
  VkPhysicalDeviceProperties vk_physical_device_properties_ = {};
  VkPhysicalDeviceDriverProperties vk_physical_device_driver_properties_ = {};
+  VkPhysicalDeviceIDProperties vk_physical_device_id_properties_ = {};
  VkPhysicalDeviceMemoryProperties vk_physical_device_memory_properties_ = {};
  /** Features support. */
  VkPhysicalDeviceFeatures vk_physical_device_features_ = {};
@@ -248,6 +249,11 @@ class VKDevice : public NonCopyable {
    return vk_physical_device_properties_;
  }

+  /** Returns the ID properties member of this device, see #VkPhysicalDeviceIDProperties. */
+  const VkPhysicalDeviceIDProperties &physical_device_id_properties_get() const
+  {
+    return vk_physical_device_id_properties_;
+  }
+
  const VkPhysicalDeviceFeatures &physical_device_features_get() const
  {
    return vk_physical_device_features_;
--- a/source/blender/makesdna/DNA_scene_types.h
+++ b/source/blender/makesdna/DNA_scene_types.h
@@ -830,9 +830,14 @@ typedef struct RenderData {
  /** Precision used by the GPU execution of the compositor tree. */
  int compositor_precision; /* eCompositorPrecision */

+  /** Device to use for denoise nodes in the compositor. */
+  int compositor_denoise_device; /* eCompositorDenoiseDevice */
+
  /** Global configuration for denoise compositor nodes. */
  int compositor_denoise_preview_quality; /* eCompositorDenoiseQaulity */
  int compositor_denoise_final_quality;   /* eCompositorDenoiseQaulity */
+
+  char _pad6[4];
 } RenderData;

 /** #RenderData::quality_flag */
@@ -865,6 +870,13 @@ typedef enum eCompositorPrecision {
  SCE_COMPOSITOR_PRECISION_FULL = 1,
 } eCompositorPrecision;

+/** #RenderData::compositor_denoise_device */
+typedef enum eCompositorDenoiseDevice {
+  /** Use the same device that the compositor uses for execution. */
+  SCE_COMPOSITOR_DENOISE_DEVICE_AUTO = 0,
+  /** Use the CPU to process the denoise nodes. */
+  SCE_COMPOSITOR_DENOISE_DEVICE_CPU = 1,
+  /** Use the GPU to process the denoise nodes if available. */
+  SCE_COMPOSITOR_DENOISE_DEVICE_GPU = 2,
+} eCompositorDenoiseDevice;
+
 /** #RenderData::compositor_denoise_preview_quality */
 /** #RenderData::compositor_denoise_final_quality */
 typedef enum eCompositorDenoiseQaulity {
--- a/source/blender/makesrna/intern/rna_scene.cc
+++ b/source/blender/makesrna/intern/rna_scene.cc
@@ -6826,6 +6826,25 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
      {0, nullptr, 0, nullptr, nullptr},
  };

+  static const EnumPropertyItem compositor_denoise_device_items[] = {
+      {SCE_COMPOSITOR_DENOISE_DEVICE_AUTO,
+       "AUTO",
+       0,
+       "Auto",
+       "Use the same device used by the compositor to process the denoise node"},
+      {SCE_COMPOSITOR_DENOISE_DEVICE_CPU,
+       "CPU",
+       0,
+       "CPU",
+       "Use the CPU to process the denoise node"},
+      {SCE_COMPOSITOR_DENOISE_DEVICE_GPU,
+       "GPU",
+       0,
+       "GPU",
+       "Use the GPU to process the denoise node if available, otherwise fallback to CPU"},
+      {0, nullptr, 0, nullptr, nullptr},
+  };
+
  static const EnumPropertyItem compositor_denoise_quality_items[] = {
      {SCE_COMPOSITOR_DENOISE_HIGH, "HIGH", 0, "High", "High quality"},
      {SCE_COMPOSITOR_DENOISE_BALANCED,
@@ -7578,6 +7597,15 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
      prop, "Compositor Precision", "The precision of compositor intermediate result");
  RNA_def_property_update(prop, NC_NODE | ND_DISPLAY, "rna_Scene_compositor_update");

+  prop = RNA_def_property(srna, "compositor_denoise_device", PROP_ENUM, PROP_NONE);
+  RNA_def_property_enum_sdna(prop, nullptr, "compositor_denoise_device");
+  RNA_def_property_enum_items(prop, compositor_denoise_device_items);
+  RNA_def_property_enum_default(prop, SCE_COMPOSITOR_DENOISE_DEVICE_AUTO);
+  RNA_def_property_ui_text(prop,
+                           "Compositor Denoise Node Device",
+                           "The device to use to process the denoise nodes in the compositor");
+  RNA_def_property_update(prop, NC_NODE | ND_DISPLAY, "rna_Scene_compositor_update");
+
  prop = RNA_def_property(srna, "compositor_denoise_preview_quality", PROP_ENUM, PROP_NONE);
  RNA_def_property_enum_sdna(prop, nullptr, "compositor_denoise_preview_quality");
  RNA_def_property_enum_items(prop, compositor_denoise_quality_items);
--- a/source/blender/nodes/composite/nodes/node_composite_denoise.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_denoise.cc
@@ -10,6 +10,8 @@
 #  include "BLI_system.h"
 #endif

+#include "BLI_span.hh"
+
 #include "MEM_guardedalloc.h"

 #include "UI_interface.hh"
@@ -24,6 +26,7 @@
 #include "COM_derived_resources.hh"
 #include "COM_node_operation.hh"
 #include "COM_utilities.hh"
+#include "COM_utilities_oidn.hh"

 #include "node_composite_util.hh"

@@ -126,7 +129,7 @@ class DenoiseOperation : public NodeOperation {
    output_image.allocate_texture(input_image.domain());

 #ifdef WITH_OPENIMAGEDENOISE
-    oidn::DeviceRef device = oidn::newDevice(oidn::DeviceType::CPU);
+    oidn::DeviceRef device = create_oidn_device(this->context());
    device.set("setAffinity", false);
    device.commit();

@@ -151,9 +154,16 @@ class DenoiseOperation : public NodeOperation {
      input_color = const_cast<float *>(static_cast<const float *>(input_image.cpu_data().data()));
      output_color = static_cast<float *>(output_image.cpu_data().data());
    }
+
+    const int64_t buffer_size = int64_t(width) * height * input_image.channels_count();
+    const MutableSpan<float> input_buffer_span = MutableSpan<float>(input_color, buffer_size);
+    oidn::BufferRef input_buffer = create_oidn_buffer(device, input_buffer_span);
+    const MutableSpan<float> output_buffer_span = MutableSpan<float>(output_color, buffer_size);
+    oidn::BufferRef output_buffer = create_oidn_buffer(device, output_buffer_span);
+
    oidn::FilterRef filter = device.newFilter("RT");
-    filter.setImage("color", input_color, oidn::Format::Float3, width, height, 0, pixel_stride);
-    filter.setImage("output", output_color, oidn::Format::Float3, width, height, 0, pixel_stride);
+    filter.setImage("color", input_buffer, oidn::Format::Float3, width, height, 0, pixel_stride);
+    filter.setImage("output", output_buffer, oidn::Format::Float3, width, height, 0, pixel_stride);
    filter.set("hdr", use_hdr());
    filter.set("cleanAux", auxiliary_passes_are_clean());
    this->set_filter_quality(filter);
@@ -183,7 +193,11 @@ class DenoiseOperation : public NodeOperation {
        }
      }

-      filter.setImage("albedo", albedo, oidn::Format::Float3, width, height, 0, pixel_stride);
+      const MutableSpan<float> albedo_buffer_span = MutableSpan<float>(albedo, buffer_size);
+      oidn::BufferRef albedo_buffer = create_oidn_buffer(device, albedo_buffer_span);
+
+      filter.setImage(
+          "albedo", albedo_buffer, oidn::Format::Float3, width, height, 0, pixel_stride);
    }

    /* If the albedo and normal inputs are not single value inputs, set the normal input to the
@@ -213,18 +227,27 @@ class DenoiseOperation : public NodeOperation {

      /* Float3 results might be stored in 4-component textures due to hardware limitations, so we
       * need to use the pixel stride of the texture. */
-      int normal_pixel_stride = sizeof(float) *
-                                (this->context().use_gpu() ?
-                                     GPU_texture_component_len(GPU_texture_format(input_normal)) :
-                                     input_normal.channels_count());
+      const int normal_channels_count = this->context().use_gpu() ?
+                                            GPU_texture_component_len(
+                                                GPU_texture_format(input_normal)) :
+                                            input_normal.channels_count();
+      int normal_pixel_stride = sizeof(float) * normal_channels_count;
+
+      const int64_t normal_buffer_size = int64_t(width) * height * normal_channels_count;
+      const MutableSpan<float> normal_buffer_span = MutableSpan<float>(normal, normal_buffer_size);
+      oidn::BufferRef normal_buffer = create_oidn_buffer(device, normal_buffer_span);

      filter.setImage(
-          "normal", normal, oidn::Format::Float3, width, height, 0, normal_pixel_stride);
+          "normal", normal_buffer, oidn::Format::Float3, width, height, 0, normal_pixel_stride);
    }

    filter.commit();
    filter.execute();

+    if (output_buffer.getStorage() != oidn::Storage::Host) {
+      output_buffer.read(0, buffer_size * sizeof(float), output_color);
+    }
+
    if (this->context().use_gpu()) {
      GPU_texture_update(output_image, data_format, output_color);
    }