Cycles: Use RGBE for denoised guiding buffers to reduce memory usage

Co-authored-by: Brecht Van Lommel <brecht@blender.org>
2025-05-21 12:23:13 +02:00
parent 5cb6014efd
commit a4f8e0bfa2
37 changed files with 284 additions and 41 deletions
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -41,6 +41,7 @@ CPUKernels::CPUKernels()
      REGISTER_KERNEL_FILM_CONVERT(sample_count),
      REGISTER_KERNEL_FILM_CONVERT(float),
      REGISTER_KERNEL_FILM_CONVERT(light_path),
+      REGISTER_KERNEL_FILM_CONVERT(rgbe),
      REGISTER_KERNEL_FILM_CONVERT(float3),
      REGISTER_KERNEL_FILM_CONVERT(motion),
      REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -119,6 +119,7 @@ class CPUKernels {
  KERNEL_FILM_CONVERT_FUNCTION(float)

  KERNEL_FILM_CONVERT_FUNCTION(light_path)
+  KERNEL_FILM_CONVERT_FUNCTION(rgbe)
  KERNEL_FILM_CONVERT_FUNCTION(float3)

  KERNEL_FILM_CONVERT_FUNCTION(motion)
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -126,6 +126,7 @@ const char *device_kernel_as_string(DeviceKernel kernel)
      FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
      FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
      FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
+      FILM_CONVERT_KERNEL_AS_STRING(RGBE, rgbe)
      FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
      FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
      FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
--- a/intern/cycles/integrator/denoiser_gpu.cpp
+++ b/intern/cycles/integrator/denoiser_gpu.cpp
@@ -316,7 +316,7 @@ void DenoiserGPU::denoise_color_read(const DenoiseContext &context, const Denois
  const PassAccessorGPU pass_accessor(
      denoiser_queue_.get(), pass_access_info, 1.0f, context.num_samples);

-  PassAccessor::Destination destination(pass_access_info.type);
+  PassAccessor::Destination destination(pass_access_info.type, pass_access_info.mode);
  destination.d_pixels = context.render_buffers->buffer.device_pointer;
  destination.num_components = 3;
  destination.pixel_offset = pass.denoised_offset;
--- a/intern/cycles/integrator/denoiser_oidn.cpp
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -53,7 +53,7 @@ class OIDNPass {
    offset = buffer_params.get_pass_offset(type, mode);
    need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);

-    const PassInfo pass_info = Pass::get_info(type);
+    const PassInfo pass_info = Pass::get_info(type, mode);
    num_components = pass_info.num_components;
    use_compositing = pass_info.use_compositing;
    use_denoising_albedo = pass_info.use_denoising_albedo;
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -33,9 +33,9 @@ PassAccessor::Destination::Destination(float *pixels, const int num_components)
 {
 }

-PassAccessor::Destination::Destination(const PassType pass_type)
+PassAccessor::Destination::Destination(const PassType pass_type, const PassMode pass_mode)
 {
-  const PassInfo pass_info = Pass::get_info(pass_type);
+  const PassInfo pass_info = Pass::get_info(pass_type, pass_mode);
  num_components = pass_info.num_components;
 }

@@ -126,12 +126,16 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
  const PassType type = pass_access_info_.type;
  const PassMode mode = pass_access_info_.mode;
  const PassInfo pass_info = Pass::get_info(
-      type, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
+      type, mode, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
  int num_written_components = pass_info.num_components;

  if (pass_info.num_components == 1) {
+    if (is_volume_guiding_pass(type)) {
+      get_pass_rgbe(render_buffers, buffer_params, destination);
+      num_written_components = 3;
+    }
    /* Single channel passes. */
-    if (mode == PassMode::DENOISED) {
+    else if (mode == PassMode::DENOISED) {
      /* Denoised passes store their final pixels, no need in special calculation. */
      get_pass_float(render_buffers, buffer_params, destination);
    }
@@ -226,8 +230,10 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
                                            const Destination &destination) const
 {
  const PassMode mode = pass_access_info_.mode;
-  const PassInfo &pass_info = Pass::get_info(
-      pass_access_info_.type, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
+  const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
+                                             mode,
+                                             pass_access_info_.include_albedo,
+                                             pass_access_info_.is_lightgroup);

  kfilm_convert->pass_offset = pass_access_info_.offset;
  kfilm_convert->pass_stride = buffer_params.pass_stride;
@@ -290,8 +296,10 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
    return false;
  }

-  const PassInfo pass_info = Pass::get_info(
-      pass_access_info_.type, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
+  const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
+                                            pass_access_info_.mode,
+                                            pass_access_info_.include_albedo,
+                                            pass_access_info_.is_lightgroup);

  const BufferParams &buffer_params = render_buffers->params;

--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -48,7 +48,7 @@ class PassAccessor {

    /* Destination will be initialized with the number of components which is native for the given
     * pass type. */
-    explicit Destination(const PassType pass_type);
+    explicit Destination(const PassType pass_type, const PassMode pass_mode);

    /* CPU-side pointers. only usable by the `PassAccessorCPU`. */
    float *pixels = nullptr;
@@ -138,6 +138,7 @@ class PassAccessor {
  /* Float3 passes. */
  DECLARE_PASS_ACCESSOR(light_path)
  DECLARE_PASS_ACCESSOR(shadow_catcher)
+  DECLARE_PASS_ACCESSOR(rgbe)
  DECLARE_PASS_ACCESSOR(float3)

  /* Float4 passes. */
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -112,6 +112,7 @@ DEFINE_PASS_ACCESSOR(float)
 /* Float3 passes. */
 DEFINE_PASS_ACCESSOR(light_path)
 DEFINE_PASS_ACCESSOR(shadow_catcher)
+DEFINE_PASS_ACCESSOR(rgbe)
 DEFINE_PASS_ACCESSOR(float3)

 /* Float4 passes. */
--- a/intern/cycles/integrator/pass_accessor_cpu.h
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -47,6 +47,7 @@ class PassAccessorCPU : public PassAccessor {
  /* Float3 passes. */
  DECLARE_PASS_ACCESSOR(light_path)
  DECLARE_PASS_ACCESSOR(shadow_catcher)
+  DECLARE_PASS_ACCESSOR(rgbe)
  DECLARE_PASS_ACCESSOR(float3)

  /* Float4 passes. */
--- a/intern/cycles/integrator/pass_accessor_gpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -96,6 +96,7 @@ DEFINE_PASS_ACCESSOR(float, FLOAT);

 /* Float3 passes. */
 DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
+DEFINE_PASS_ACCESSOR(rgbe, RGBE);
 DEFINE_PASS_ACCESSOR(float3, FLOAT3);

 /* Float4 passes. */
--- a/intern/cycles/integrator/pass_accessor_gpu.h
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -40,6 +40,7 @@ class PassAccessorGPU : public PassAccessor {

  /* Float3 passes. */
  DECLARE_PASS_ACCESSOR(light_path);
+  DECLARE_PASS_ACCESSOR(rgbe);
  DECLARE_PASS_ACCESSOR(float3);

  /* Float4 passes. */
--- a/intern/cycles/integrator/path_trace_work.cpp
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -186,9 +186,9 @@ PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMod
 }

 PassAccessor::Destination PathTraceWork::get_display_destination_template(
-    const PathTraceDisplay *display) const
+    const PathTraceDisplay *display, const PassMode mode) const
 {
-  PassAccessor::Destination destination(film_->get_display_pass());
+  PassAccessor::Destination destination(film_->get_display_pass(), mode);

  const int2 display_texture_size = display->get_texture_size();
  const int texture_x = effective_buffer_params_.full_x - effective_big_tile_params_.full_x +
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -165,8 +165,8 @@ class PathTraceWork {

  /* Get destination which offset and stride are configured so that writing to it will write to a
   * proper location of GPU display texture, taking current tile and device slice into account. */
-  PassAccessor::Destination get_display_destination_template(
-      const PathTraceDisplay *display) const;
+  PassAccessor::Destination get_display_destination_template(const PathTraceDisplay *display,
+                                                             const PassMode mode) const;

  /* Device which will be used for path tracing.
   * Note that it is an actual render device (and never is a multi-device). */
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -197,7 +197,7 @@ void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,

  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);

-  PassAccessor::Destination destination = get_display_destination_template(display);
+  PassAccessor::Destination destination = get_display_destination_template(display, pass_mode);
  destination.pixels_half_rgba = rgba_half;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -1018,7 +1018,7 @@ void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
    queue_->zero_to_device(display_rgba_half_);
  }

-  PassAccessor::Destination destination(film_->get_display_pass());
+  PassAccessor::Destination destination(film_->get_display_pass(), pass_mode);
  destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;

  get_render_tile_film_pixels(destination, pass_mode, num_samples);
@@ -1045,7 +1045,7 @@ bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,
    return false;
  }

-  PassAccessor::Destination destination = get_display_destination_template(display);
+  PassAccessor::Destination destination = get_display_destination_template(display, pass_mode);
  destination.d_pixels_half_rgba = d_rgba_half;

  get_render_tile_film_pixels(destination, pass_mode, num_samples);
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -420,6 +420,7 @@ set(SRC_UTIL_HEADERS
  ../util/types_int4.h
  ../util/types_int8.h
  ../util/types_spectrum.h
+  ../util/types_rgbe.h
  ../util/types_uchar2.h
  ../util/types_uchar3.h
  ../util/types_uchar4.h
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -54,6 +54,7 @@ KERNEL_FILM_CONVERT_FUNCTION(volume_majorant)
 KERNEL_FILM_CONVERT_FUNCTION(float)

 KERNEL_FILM_CONVERT_FUNCTION(light_path)
+KERNEL_FILM_CONVERT_FUNCTION(rgbe)
 KERNEL_FILM_CONVERT_FUNCTION(float3)

 KERNEL_FILM_CONVERT_FUNCTION(motion)
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -374,6 +374,7 @@ KERNEL_FILM_CONVERT_FUNCTION(volume_majorant, true)
 KERNEL_FILM_CONVERT_FUNCTION(float, true)

 KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
+KERNEL_FILM_CONVERT_FUNCTION(rgbe, false)
 KERNEL_FILM_CONVERT_FUNCTION(float3, false)

 KERNEL_FILM_CONVERT_FUNCTION(motion, false)
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -892,6 +892,7 @@ KERNEL_FILM_CONVERT_VARIANT(float, 1)

 /* 3 channel inputs */
 KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(rgbe, 3)
 KERNEL_FILM_CONVERT_VARIANT(float3, 3)

 /* 4 channel inputs */
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -277,6 +277,7 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
 #define coshf(x) cosh(float(x))
 #define tanhf(x) tanh(float(x))
 #define saturatef(x) saturate(float(x))
+#define ldexpf(x, y) ldexp(float(x), int(y))

 /* Use native functions with possibly lower precision for performance,
 * no issues found so far. */
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -224,6 +224,7 @@ ccl_device_forceinline int __float_as_int(const float x)
 #define fminf(x, y) sycl::fmin((x), (y))
 #define fmodf(x, y) sycl::fmod((x), (y))
 #define lgammaf(x) sycl::lgamma((x))
+#define ldexpf(x, y) sycl::ldexp((x), (y))

 #define cosf(x) sycl::native::cos(((float)(x)))
 #define sinf(x) sycl::native::sin(((float)(x)))
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -635,6 +635,7 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
      DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
      DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
      DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
+      DEVICE_KERNEL_FILM_CONVERT(rgbe, RGBE);
      DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
      DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
      DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -171,6 +171,22 @@ ccl_device_inline void film_get_pass_pixel_volume_majorant(
  pixel[0] = (*count != 0.0f) ? expf(-(f * scale_exposure) / *count) : 0.0f;
 }

+ccl_device_inline void film_get_pass_pixel_rgbe(const ccl_global KernelFilmConvert *ccl_restrict
+                                                    kfilm_convert,
+                                                const ccl_global float *ccl_restrict buffer,
+                                                ccl_private float *ccl_restrict pixel)
+{
+  kernel_assert(kfilm_convert->num_components >= 1);
+  kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+  const ccl_global float *in = buffer + kfilm_convert->pass_offset;
+  const float3 f = rgbe_to_rgb(RGBE(*in));
+
+  pixel[0] = f.x;
+  pixel[1] = f.y;
+  pixel[2] = f.z;
+}
+
 ccl_device_inline void film_get_pass_pixel_float(const ccl_global KernelFilmConvert *ccl_restrict
                                                     kfilm_convert,
                                                 const ccl_global float *ccl_restrict buffer,
--- a/intern/cycles/kernel/film/volume_guiding_denoise.h
+++ b/intern/cycles/kernel/film/volume_guiding_denoise.h
@@ -60,15 +60,13 @@ ccl_device void volume_guiding_filter_x(KernelGlobals kg,
    const float weight = gaussian_params[dx] /
                         __float_as_uint(buffer[kernel_data.film.pass_sample_count]);

-    scatter += fabs(kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter)) *
-               weight;
-    transmit += fabs(kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit)) *
-                weight;
+    scatter += kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter) * weight;
+    transmit += kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit) * weight;
  }

  /* Write to the buffer. */
-  film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
-  film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
+  film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
+  film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
 }

 ccl_device void volume_guiding_filter_y(KernelGlobals kg,
@@ -109,9 +107,9 @@ ccl_device void volume_guiding_filter_y(KernelGlobals kg,
    else {
      ccl_global float *buffer = film_pass_pixel_render_buffer(
          kg, x, y, offset, stride, render_buffer);
-      scatter_neighbors[i] = kernel_read_pass_float3(
-          buffer + kernel_data.film.pass_volume_scatter_denoised);
-      transmit_neighbors[i] = kernel_read_pass_float3(
+      scatter_neighbors[i] = kernel_read_pass_rgbe(buffer +
+                                                   kernel_data.film.pass_volume_scatter_denoised);
+      transmit_neighbors[i] = kernel_read_pass_rgbe(
          buffer + kernel_data.film.pass_volume_transmit_denoised);
    }
  }
@@ -128,9 +126,9 @@ ccl_device void volume_guiding_filter_y(KernelGlobals kg,
    else {
      ccl_global float *buffer = film_pass_pixel_render_buffer(
          kg, x, next_y, offset, stride, render_buffer);
-      scatter_neighbors[index] = kernel_read_pass_float3(
+      scatter_neighbors[index] = kernel_read_pass_rgbe(
          buffer + kernel_data.film.pass_volume_scatter_denoised);
-      transmit_neighbors[index] = kernel_read_pass_float3(
+      transmit_neighbors[index] = kernel_read_pass_rgbe(
          buffer + kernel_data.film.pass_volume_transmit_denoised);
    }

@@ -147,8 +145,10 @@ ccl_device void volume_guiding_filter_y(KernelGlobals kg,
    /* Write to the buffers. */
    ccl_global float *buffer = film_pass_pixel_render_buffer(
        kg, x, y, offset, stride, render_buffer);
-    film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
-    film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
+    film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_scatter_denoised,
+                             fabs(scatter));
+    film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_transmit_denoised,
+                             fabs(transmit));
  }
 }

--- a/intern/cycles/kernel/film/write.h
+++ b/intern/cycles/kernel/film/write.h
@@ -10,6 +10,8 @@

 #include "kernel/util/colorspace.h"

+#include "util/types_rgbe.h"
+
 #ifdef __KERNEL_GPU__
 #  include "util/atomic.h"
 #  define __ATOMIC_PASS_WRITE__
@@ -108,6 +110,12 @@ ccl_device_inline void film_write_pass_float4(ccl_global float *ccl_restrict buf
 #endif
 }

+ccl_device_inline void film_overwrite_pass_rgbe(ccl_global float *ccl_restrict buffer,
+                                                const float3 value)
+{
+  *buffer = rgb_to_rgbe(value).f;
+}
+
 /* Overwrite for passes that only write on sample 0. This assumes only a single thread will write
 * to this pixel and no atomics are needed. */

@@ -142,4 +150,9 @@ ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict
  return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
 }

+ccl_device_inline float3 kernel_read_pass_rgbe(const ccl_global float *ccl_restrict buffer)
+{
+  return rgbe_to_rgb(RGBE(*buffer));
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -1012,9 +1012,9 @@ ccl_device_inline void volume_scatter_probability_get(KernelGlobals kg,

  /* Contribution based criterion, see Eq. (15). */
  const float L_scattered = reduce_add(
-      kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised));
+      kernel_read_pass_rgbe(buffer + kernel_data.film.pass_volume_scatter_denoised));
  const float L_transmitted = reduce_add(
-      kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised));
+      kernel_read_pass_rgbe(buffer + kernel_data.film.pass_volume_transmit_denoised));
  const float L_volume = L_transmitted + L_scattered;

  /* Compute guided scattering probability. */
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -1885,6 +1885,7 @@ enum DeviceKernel : int {
  DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
  DECLARE_FILM_CONVERT_KERNEL(FLOAT),
  DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
+  DECLARE_FILM_CONVERT_KERNEL(RGBE),
  DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
  DECLARE_FILM_CONVERT_KERNEL(MOTION),
  DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),
--- a/intern/cycles/scene/pass.cpp
+++ b/intern/cycles/scene/pass.cpp
@@ -143,7 +143,7 @@ Pass::Pass() : Node(get_node_type()), is_auto_(false) {}

 PassInfo Pass::get_info() const
 {
-  return get_info(type, include_albedo, !lightgroup.empty());
+  return get_info(type, mode, include_albedo, !lightgroup.empty());
 }

 bool Pass::is_written() const
@@ -151,7 +151,10 @@ bool Pass::is_written() const
  return get_info().is_written;
 }

-PassInfo Pass::get_info(const PassType type, const bool include_albedo, const bool is_lightgroup)
+PassInfo Pass::get_info(const PassType type,
+                        const PassMode mode,
+                        const bool include_albedo,
+                        const bool is_lightgroup)
 {
  PassInfo pass_info;

@@ -280,10 +283,9 @@ PassInfo Pass::get_info(const PassType type, const bool include_albedo, const bo
      break;
    case PASS_VOLUME_SCATTER:
    case PASS_VOLUME_TRANSMIT:
-      /* TODO(weizhen): Gaussian filter only needs 1 component, but we can have negative pixel
-       * values in some channels, preventing us from simply add them together; besides, using RGB
-       * channels is better for visualization. We can optimize the memory by using RGBE format. */
-      pass_info.num_components = 3;
+      /* The noisy buffer needs higher precision for accumulating contributions; the denoised
+       * buffer is used directly and can be stored at lower precision as one RGBE float. */
+      pass_info.num_components = (mode == PassMode::NOISY) ? 3 : 1;
      pass_info.use_exposure = true;
      pass_info.use_filter = false;
      pass_info.support_denoise = true;
--- a/intern/cycles/scene/pass.h
+++ b/intern/cycles/scene/pass.h
@@ -75,6 +75,7 @@ class Pass : public Node {
  static const NodeEnum *get_mode_enum();

  static PassInfo get_info(PassType type,
+                           const PassMode mode = PassMode::DENOISED,
                           const bool include_albedo = false,
                           const bool is_lightgroup = false);

--- a/intern/cycles/session/buffers.cpp
+++ b/intern/cycles/session/buffers.cpp
@@ -68,7 +68,7 @@ BufferPass::BufferPass(const Pass *scene_pass)

 PassInfo BufferPass::get_info() const
 {
-  return Pass::get_info(type, include_albedo, !lightgroup.empty());
+  return Pass::get_info(type, mode, include_albedo, !lightgroup.empty());
 }

 /* --------------------------------------------------------------------
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -39,6 +39,7 @@ set(SRC
  util_math_fast_test.cpp
  util_math_float3_test.cpp
  util_math_float4_test.cpp
+  util_rgbe_test.cpp
  util_md5_test.cpp
  util_path_test.cpp
  util_string_test.cpp
--- a/intern/cycles/test/util_rgbe_test.cpp
+++ b/intern/cycles/test/util_rgbe_test.cpp
@@ -0,0 +1,65 @@
+/* SPDX-FileCopyrightText: 2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#include <gtest/gtest.h>
+
+#include "util/log.h"
+#include "util/types_rgbe.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(RGBE, round_trip)
+{
+  {
+    const float3 f = make_float3(7.334898f, 5.811583f, 2.414717f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(7.34375f, 5.8125f, 2.40625f));
+  }
+
+  {
+    const float3 f = make_float3(0.08750992f, 0.05150064f, 0.24991725f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(0.087890625f, 0.05078125f, 0.25f));
+  }
+
+  {
+    const float3 f = make_float3(4e-6f, 30257.0f, 1.0f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(0.0f, 30208.0f, 0.0f));
+  }
+
+  {
+    const float3 f = zero_float3();
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), zero_float3());
+  }
+
+  {
+    const float3 f = make_float3(5.9e-8f, 0.0f, 0.0f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), zero_float3());
+  }
+
+  {
+    const float3 f = make_float3(6.0e-8f, 0.0f, 0.0f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(1.1920928955078125e-7f, 0.0f, 0.0f));
+  }
+
+  {
+    const float3 f = make_float3(-0.863880f, 0.558654f, -0.223357f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(-0.86328125f, 0.55859375f, -0.22265625f));
+  }
+
+  {
+    const float3 f = make_float3(-FLT_MAX, FLT_MAX, 0.0f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(-65280.0f, 65280.0f, 0.0f));
+  }
+
+  {
+    const float inf = __uint_as_float(0x7f800000);
+    const float3 f = make_float3(inf, 127.0f, 129.0f);
+    EXPECT_EQ(rgbe_to_rgb(rgb_to_rgbe(f)), make_float3(65280.0f, 0.0f, 256.0f));
+  }
+
+  {
+    /* No test for NaN, undefined behaviour. */
+  }
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -115,6 +115,7 @@ set(SRC_HEADERS
  types_int4.h
  types_int8.h
  types_spectrum.h
+  types_rgbe.h
  types_uchar2.h
  types_uchar3.h
  types_uchar4.h
--- a/intern/cycles/util/math_base.h
+++ b/intern/cycles/util/math_base.h
@@ -668,6 +668,11 @@ ccl_device_inline float xor_mask(const float x, const uint y)
  return __uint_as_float(__float_as_uint(x) ^ y);
 }

+ccl_device_inline float or_mask(const float x, const uint y)
+{
+  return __uint_as_float(__float_as_uint(x) | y);
+}
+
 ccl_device float bits_to_01(const uint bits)
 {
  return bits * (1.0f / (float)0xFFFFFFFF);
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -341,6 +341,11 @@ ccl_device_inline float vector_angle(const float3 a, const float3 b)
  return 2.0f * fast_atan2f(len(a - b), len(a + b));
 }

+ccl_device_inline int floor_log2f(const float x)
+{
+  return (int)(__float_as_uint(x) >> 23) - 127;
+}
+
 /* Based on:
 *
 *   https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
@@ -351,7 +356,7 @@ ccl_device float fast_log2f(float x)
   * negative values/NAN's. */
  x = clamp(x, FLT_MIN, FLT_MAX);
  const unsigned bits = __float_as_uint(x);
-  const int exponent = (int)(bits >> 23) - 127;
+  const int exponent = floor_log2f(x);
  const float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
  /* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
   * 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -26,6 +26,7 @@
 #include "util/types_float4.h"  // IWYU pragma: export
 #include "util/types_float8.h"  // IWYU pragma: export

+#include "util/types_rgbe.h"      // IWYU pragma: export
 #include "util/types_spectrum.h"  // IWYU pragma: export

 #include "util/types_dual.h"  // IWYU pragma: export
--- a/intern/cycles/util/types_rgbe.h
+++ b/intern/cycles/util/types_rgbe.h
@@ -0,0 +1,109 @@
+/* SPDX-FileCopyrightText: 2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+#include "util/math_fast.h"
+#include "util/math_float3.h"
+#include "util/types_base.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct RGBE {
+  union {
+    struct {
+      uint8_t r, g, b, e;
+    };
+    float f;
+  };
+
+  RGBE() = default;
+
+  ccl_device_inline_method RGBE(const float f_) : f(f_) {}
+};
+
+static_assert(sizeof(RGBE) == 4, "RGBE expected to be exactly 4 bytes");
+
+/**
+ * RGBE format represents an RGB value with 4 bytes.
+ *
+ * The original implementation by Greg Ward uses 8 bits for RGB each, plus 8 bits shared exponent.
+ * It has the same relative precision as the 0 to 255 range of standard 24-bit image formats, but
+ * offers extended intensity range:
+ * https://www.graphics.cornell.edu/~bjw/rgbe.html
+ * GL_EXT_texture_shared_exponent uses 9 bits for RGB and 5 bits for exponent instead, with smaller
+ * range but higher precision:
+ * https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
+ *
+ * Our implementation is mostly based on GL_EXT_texture_shared_exponent, but uses 8 bits for RGB
+ * each, and adds 3 sign bits to represent negative values. The memory layout is as follows:
+ *
+ *        xxxxxxxx  xxxxxxxx  xxxxxxxx  xxx  xxxxx
+ *          m(R)      m(G)      m(B)    sgn   exp
+ *
+ * Each float component is interpreted as
+ *                sgn          exp - bias
+ *        f = (-1)    * 0.m * 2
+ *
+ * We choose a bias of 15, so that the largest representable value is
+ *        RGBE_MAX = 0.11111111 * 2^(31 - 15) = 65280,
+ * and the smallest positive representable value is
+ *        RGBE_MIN = 0.00000001 * 2^(0 - 15) = 1.1920929e-7
+ */
+
+#define RGBE_EXP_BIAS 15
+#define RGBE_MANTISSA_BITS 8
+#define RGBE_EXPONENT_BITS 5
+#define RGBE_MAX 65280.0f
+
+ccl_device RGBE rgb_to_rgbe(float3 rgb)
+{
+  const float max_v = min(reduce_max(fabs(rgb)), RGBE_MAX);
+  if (max_v < ldexpf(0.5f, -RGBE_EXP_BIAS - RGBE_MANTISSA_BITS)) {
+    return RGBE(0.0f);
+  }
+
+  int e = max(-RGBE_EXP_BIAS - 1, floor_log2f(max_v)) + 1;
+  float v = ldexpf(1.0f, RGBE_MANTISSA_BITS - e);
+
+  /* The original implementation by Greg Ward uses `floor`, causing systematic bias when
+   * accumulated in a buffer. We use `round` instead, but then the largest component can round
+   * up to 2^RGBE_MANTISSA_BITS, which no longer fits in the mantissa: use the next exponent. */
+  if (int(roundf(max_v * v)) == power_of_2(RGBE_MANTISSA_BITS)) {
+    e += 1;
+    v *= 0.5f;
+  }
+
+  /* Collect the sign bits of R, G and B into bits 7, 6 and 5 of the exponent byte. */
+  const uint sign_bits = ((__float_as_uint(rgb.x) >> 31) << 7) |
+                         ((__float_as_uint(rgb.y) >> 31) << 6) |
+                         ((__float_as_uint(rgb.z) >> 31) << 5);
+
+  RGBE rgbe;
+  rgb = min(round(fabs(rgb) * v), make_float3(255.0f));
+  rgbe.r = uint8_t(rgb.x);
+  rgbe.g = uint8_t(rgb.y);
+  rgbe.b = uint8_t(rgb.z);
+  rgbe.e = uint8_t(((e + RGBE_EXP_BIAS) & 0x1Fu) | sign_bits);
+  return rgbe;
+}
+
+ccl_device_inline float3 rgbe_to_rgb(const RGBE rgbe)
+{
+  if (rgbe.f == 0.0f) {
+    return zero_float3();
+  }
+
+  const int e = rgbe.e & 0x1Fu;
+  const float f = ldexpf(1.0f, e - (int)(RGBE_EXP_BIAS + RGBE_MANTISSA_BITS));
+  float3 result = make_float3(rgbe.r, rgbe.g, rgbe.b) * f;
+
+  /* Restore the signs: move bits 7, 6 and 5 of the exponent byte to each float's sign bit. */
+  result.x = or_mask(result.x, (uint(rgbe.e) & 0x80u) << 24);
+  result.y = or_mask(result.y, (uint(rgbe.e) & 0x40u) << 25);
+  result.z = or_mask(result.z, (uint(rgbe.e) & 0x20u) << 26);
+  return result;
+}
+
+CCL_NAMESPACE_END