Cycles: Volume Scattering Probability Guiding

Guide the probability to scatter in or transmit through the volume. Only applied for primary rays.

Co-authored-by: Brecht Van Lommel <brecht@blender.org>
2025-02-25 19:11:08 +01:00
parent a7283fc1d5
commit 5cb6014efd
50 changed files with 1400 additions and 272 deletions
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -194,6 +194,7 @@ def list_render_passes(scene, srl):
    if srl.use_pass_uv:                    yield ("UV",            "UVA",  'VECTOR')
    if srl.use_pass_object_index:          yield ("IndexOB",       "X",    'VALUE')
    if srl.use_pass_material_index:        yield ("IndexMA",       "X",    'VALUE')
+    if crl.use_pass_volume_majorant:       yield ("Volume Majorant", "Z",  'VALUE')

    # Light passes.
    if srl.use_pass_diffuse_direct:        yield ("DiffDir",       "RGB",  'COLOR')
@@ -207,6 +208,8 @@ def list_render_passes(scene, srl):
    if srl.use_pass_transmission_color:    yield ("TransCol",      "RGB",  'COLOR')
    if crl.use_pass_volume_direct:         yield ("VolumeDir",     "RGB",  'COLOR')
    if crl.use_pass_volume_indirect:       yield ("VolumeInd",     "RGB",  'COLOR')
+    if crl.use_pass_volume_scatter:        yield ("Volume Scatter",     "RGB",  'COLOR')
+    if crl.use_pass_volume_transmit:       yield ("Volume Transmit",     "RGB",  'COLOR')
    if srl.use_pass_emit:                  yield ("Emit",          "RGB",  'COLOR')
    if srl.use_pass_environment:           yield ("Env",           "RGB",  'COLOR')
    if srl.use_pass_ambient_occlusion:     yield ("AO",            "RGB",  'COLOR')
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -245,6 +245,12 @@ enum_view3d_shading_render_pass = (
    ('SAMPLE_COUNT', "Sample Count", "Per-pixel number of samples"),
 )

+enum_view3d_debug_render_pass = (
+    ('VOLUME_SCATTER', "Volume Scatter", "Show the contribution of scattered ray in volume"),
+    ('VOLUME_TRANSMIT', "Volume Transmit", "Show the contribution of transmitted ray in volume"),
+    ('VOLUME_MAJORANT', "Volume Majorant", "Show the majorant transmittance of the volume")
+)
+
 enum_guiding_distribution = (
    ('PARALLAX_AWARE_VMM', "Parallax-Aware VMM", "Use Parallax-aware von Mises-Fisher models as directional distribution", 0),
    ('DIRECTIONAL_QUAD_TREE', "Directional Quad Tree", "Use Directional Quad Trees as directional distribution", 1),
@@ -1485,6 +1491,24 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
        default=False,
        update=update_render_passes,
    )
+    use_pass_volume_scatter: BoolProperty(
+        name="Volume Scatter",
+        description="Contribution of paths that scattered in the volume at the primary ray",
+        default=False,
+        update=update_render_passes,
+    )
+    use_pass_volume_transmit: BoolProperty(
+        name="Volume Transmit",
+        description="Contribution of paths that transmitted through the volume at the primary ray",
+        default=False,
+        update=update_render_passes,
+    )
+    use_pass_volume_majorant: BoolProperty(
+        name="Volume Majorant",
+        description="Majorant transmittance of the volume",
+        default=False,
+        update=update_render_passes,
+    )

    use_pass_shadow_catcher: BoolProperty(
        name="Shadow Catcher",
@@ -1909,10 +1933,14 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 class CyclesView3DShadingSettings(bpy.types.PropertyGroup):
    __slots__ = ()

+    prefs = bpy.context.preferences
+    use_debug = prefs.experimental.use_cycles_debug and prefs.view.show_developer_ui
+
    render_pass: EnumProperty(
        name="Render Pass",
        description="Render pass to show in the 3D Viewport",
-        items=enum_view3d_shading_render_pass,
+        items=enum_view3d_shading_render_pass +
+        enum_view3d_debug_render_pass if use_debug else enum_view3d_shading_render_pass,
        default='COMBINED',
    )
    show_active_pixels: BoolProperty(
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1054,6 +1054,13 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
        col.prop(cycles_view_layer, "use_pass_volume_direct", text="Direct")
        col.prop(cycles_view_layer, "use_pass_volume_indirect", text="Indirect")

+        prefs = context.preferences
+        use_debug = prefs.experimental.use_cycles_debug and prefs.view.show_developer_ui
+        if use_debug:
+            col.prop(cycles_view_layer, "use_pass_volume_scatter", text="Scatter")
+            col.prop(cycles_view_layer, "use_pass_volume_transmit", text="Transmit")
+            col.prop(cycles_view_layer, "use_pass_volume_majorant", text="Majorant")
+
        col = layout.column(heading="Other", align=True)
        col.prop(view_layer, "use_pass_emit", text="Emission")
        col.prop(view_layer, "use_pass_environment")
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -686,6 +686,9 @@ static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode
  MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT, false);
  MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT, false);
  MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT, false);
+  MAP_PASS("Volume Scatter", PASS_VOLUME_SCATTER, false);
+  MAP_PASS("Volume Transmit", PASS_VOLUME_TRANSMIT, false);
+  MAP_PASS("Volume Majorant", PASS_VOLUME_MAJORANT, false);

  MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR, false);
  MAP_PASS("GlossCol", PASS_GLOSSY_COLOR, false);
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -29,11 +29,15 @@ CPUKernels::CPUKernels()
      REGISTER_KERNEL(adaptive_sampling_convergence_check),
      REGISTER_KERNEL(adaptive_sampling_filter_x),
      REGISTER_KERNEL(adaptive_sampling_filter_y),
+      /* Volume Scattering Probability Guiding. */
+      REGISTER_KERNEL(volume_guiding_filter_x),
+      REGISTER_KERNEL(volume_guiding_filter_y),
      /* Cryptomatte. */
      REGISTER_KERNEL(cryptomatte_postprocess),
      /* Film Convert. */
      REGISTER_KERNEL_FILM_CONVERT(depth),
      REGISTER_KERNEL_FILM_CONVERT(mist),
+      REGISTER_KERNEL_FILM_CONVERT(volume_majorant),
      REGISTER_KERNEL_FILM_CONVERT(sample_count),
      REGISTER_KERNEL_FILM_CONVERT(float),
      REGISTER_KERNEL_FILM_CONVERT(light_path),
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -54,28 +54,38 @@ class CPUKernels {
                                 const int offset,
                                 int stride)>;

-  using AdaptiveSamplingFilterXFunction =
-      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
-                                 ccl_global float *render_buffer,
-                                 const int y,
-                                 const int start_x,
-                                 const int width,
-                                 const int offset,
-                                 int stride)>;
+  using FilterXFunction = CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
+                                                     ccl_global float *render_buffer,
+                                                     const int y,
+                                                     const int start_x,
+                                                     const int width,
+                                                     const int offset,
+                                                     int stride)>;

-  using AdaptiveSamplingFilterYFunction =
-      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
-                                 ccl_global float *render_buffer,
-                                 const int x,
-                                 const int start_y,
-                                 const int height,
-                                 const int offset,
-                                 int stride)>;
+  using FilterYFunction = CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
+                                                     ccl_global float *render_buffer,
+                                                     const int x,
+                                                     const int start_y,
+                                                     const int height,
+                                                     const int offset,
+                                                     int stride)>;

  AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check;

-  AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
-  AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
+  FilterXFunction adaptive_sampling_filter_x;
+  FilterYFunction adaptive_sampling_filter_y;
+
+  /* Volume Scattering Probability Guiding. */
+  CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
+                             ccl_global float *render_buffer,
+                             const int y,
+                             const int center_x,
+                             const int min_x,
+                             const int max_x,
+                             const int offset,
+                             int stride)>
+      volume_guiding_filter_x;
+  FilterYFunction volume_guiding_filter_y;

  /* Cryptomatte. */

@@ -104,6 +114,7 @@ class CPUKernels {

  KERNEL_FILM_CONVERT_FUNCTION(depth)
  KERNEL_FILM_CONVERT_FUNCTION(mist)
+  KERNEL_FILM_CONVERT_FUNCTION(volume_majorant)
  KERNEL_FILM_CONVERT_FUNCTION(sample_count)
  KERNEL_FILM_CONVERT_FUNCTION(float)

--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -122,6 +122,7 @@ const char *device_kernel_as_string(DeviceKernel kernel)

      FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
      FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+      FILM_CONVERT_KERNEL_AS_STRING(VOLUME_MAJORANT, volume_majorant)
      FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
      FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
      FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
@@ -154,6 +155,12 @@ const char *device_kernel_as_string(DeviceKernel kernel)
    case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
      return "filter_color_postprocess";

+    /* Volume Scattering Probability Guiding. */
+    case DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X:
+      return "volume_guiding_filter_x";
+    case DEVICE_KERNEL_VOLUME_GUIDING_FILTER_Y:
+      return "volume_guiding_filter_y";
+
    /* Cryptomatte. */
    case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
      return "cryptomatte_postprocess";
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -4,6 +4,7 @@

 #include "integrator/pass_accessor.h"

+#include "kernel/types.h"
 #include "session/buffers.h"

 #include "util/log.h"
@@ -140,6 +141,9 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
    else if (type == PASS_MIST) {
      get_pass_mist(render_buffers, buffer_params, destination);
    }
+    else if (type == PASS_VOLUME_MAJORANT) {
+      get_pass_volume_majorant(render_buffers, buffer_params, destination);
+    }
    else if (type == PASS_SAMPLE_COUNT) {
      get_pass_sample_count(render_buffers, buffer_params, destination);
    }
--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -131,6 +131,7 @@ class PassAccessor {
  /* Float (scalar) passes. */
  DECLARE_PASS_ACCESSOR(depth)
  DECLARE_PASS_ACCESSOR(mist)
+  DECLARE_PASS_ACCESSOR(volume_majorant)
  DECLARE_PASS_ACCESSOR(sample_count)
  DECLARE_PASS_ACCESSOR(float)

--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -105,6 +105,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
 /* Float (scalar) passes. */
 DEFINE_PASS_ACCESSOR(depth)
 DEFINE_PASS_ACCESSOR(mist)
+DEFINE_PASS_ACCESSOR(volume_majorant)
 DEFINE_PASS_ACCESSOR(sample_count)
 DEFINE_PASS_ACCESSOR(float)

--- a/intern/cycles/integrator/pass_accessor_cpu.h
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -40,6 +40,7 @@ class PassAccessorCPU : public PassAccessor {
  /* Float (scalar) passes. */
  DECLARE_PASS_ACCESSOR(depth)
  DECLARE_PASS_ACCESSOR(mist)
+  DECLARE_PASS_ACCESSOR(volume_majorant)
  DECLARE_PASS_ACCESSOR(sample_count)
  DECLARE_PASS_ACCESSOR(float)

--- a/intern/cycles/integrator/pass_accessor_gpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -90,6 +90,7 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
 /* Float (scalar) passes. */
 DEFINE_PASS_ACCESSOR(depth, DEPTH);
 DEFINE_PASS_ACCESSOR(mist, MIST);
+DEFINE_PASS_ACCESSOR(volume_majorant, VOLUME_MAJORANT);
 DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT);
 DEFINE_PASS_ACCESSOR(float, FLOAT);

--- a/intern/cycles/integrator/pass_accessor_gpu.h
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -34,6 +34,7 @@ class PassAccessorGPU : public PassAccessor {
  /* Float (scalar) passes. */
  DECLARE_PASS_ACCESSOR(depth);
  DECLARE_PASS_ACCESSOR(mist);
+  DECLARE_PASS_ACCESSOR(volume_majorant);
  DECLARE_PASS_ACCESSOR(sample_count);
  DECLARE_PASS_ACCESSOR(float);

--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -196,6 +196,9 @@ void PathTrace::render_pipeline(RenderWork render_work)

  rebalance(render_work);

+  /* Reset sample limit. */
+  render_scheduler_.set_limit_samples_per_update(0);
+
  /* Prepare all per-thread guiding structures before we start with the next rendering
   * iteration/progression. */
  const bool use_guiding = device_scene_->data.integrator.use_guiding;
@@ -203,6 +206,13 @@ void PathTrace::render_pipeline(RenderWork render_work)
    guiding_prepare_structures();
  }

+  const bool has_volume = device_scene_->data.integrator.use_volumes;
+  if (has_volume) {
+    const uint num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+    const uint limit = next_power_of_two(num_rendered_samples) - num_rendered_samples;
+    render_scheduler_.set_limit_samples_per_update(limit);
+  }
+
  path_trace(render_work);
  if (render_cancel_.is_requested) {
    return;
@@ -230,6 +240,11 @@ void PathTrace::render_pipeline(RenderWork render_work)
    return;
  }

+  denoise_volume_guiding_buffers(render_work, has_volume);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
  write_tile_buffer(render_work);
  update_display(render_work);

@@ -634,6 +649,26 @@ void PathTrace::denoise(const RenderWork &render_work)
  render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
 }

+void PathTrace::denoise_volume_guiding_buffers(const RenderWork &render_work,
+                                               const bool has_volume)
+{
+  if (!has_volume || !render_scheduler_.volume_guiding_need_denoise()) {
+    return;
+  }
+
+  LOG_WORK << "Denoise volume guiding buffers.";
+
+  const double start_time = time_dt();
+
+  /* TODO: in the multi-GPU case, we can denoise on one device and copy to the rest, instead of
+   * denoising on each device separately. */
+  parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    path_trace_work->denoise_volume_guiding_buffers();
+  });
+
+  render_scheduler_.report_volume_guiding_denoise_time(render_work, time_dt() - start_time);
+}
+
 void PathTrace::set_output_driver(unique_ptr<OutputDriver> driver)
 {
  output_driver_ = std::move(driver);
@@ -714,10 +749,12 @@ void PathTrace::update_display(const RenderWork &render_work)
      return;
    }

-    const PassMode pass_mode = render_work.display.use_denoised_result &&
-                                       render_state_.has_denoised_result ?
-                                   PassMode::DENOISED :
-                                   PassMode::NOISY;
+    const PassType pass_type = film_->get_display_pass();
+    const bool show_denoised = (render_work.display.use_denoised_result &&
+                                has_denoised_result()) ||
+                               is_volume_guiding_pass(pass_type);
+
+    const PassMode pass_mode = show_denoised ? PassMode::DENOISED : PassMode::NOISY;

    /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
     * all works in parallel. */
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -210,6 +210,7 @@ class PathTrace {
  void path_trace(RenderWork &render_work);
  void adaptive_sample(RenderWork &render_work);
  void denoise(const RenderWork &render_work);
+  void denoise_volume_guiding_buffers(const RenderWork &render_work, const bool has_volume);
  void cryptomatte_postprocess(const RenderWork &render_work);
  void update_display(const RenderWork &render_work);
  void rebalance(const RenderWork &render_work);
--- a/intern/cycles/integrator/path_trace_tile.cpp
+++ b/intern/cycles/integrator/path_trace_tile.cpp
@@ -42,7 +42,8 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
    return false;
  }

-  const bool has_denoised_result = path_trace_.has_denoised_result();
+  const bool has_denoised_result = path_trace_.has_denoised_result() ||
+                                   is_volume_guiding_pass(pass->type);
  if (pass->mode == PassMode::DENOISED && !has_denoised_result) {
    pass = buffer_params.find_pass(pass->type);
    if (pass == nullptr) {
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -125,6 +125,9 @@ class PathTraceWork {
  virtual int adaptive_sampling_converge_filter_count_active(const float threshold,
                                                             bool reset) = 0;

+  /* Denoise Volume Scattering Probability Guiding buffers. */
+  virtual void denoise_volume_guiding_buffers() = 0;
+
  /* Run cryptomatte pass post-processing kernels. */
  virtual void cryptomatte_postproces() = 0;

--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -303,6 +303,45 @@ void PathTraceWorkCPU::cryptomatte_postproces()
  });
 }

+void PathTraceWorkCPU::denoise_volume_guiding_buffers()
+{
+  const int min_x = effective_buffer_params_.full_x;
+  const int min_y = effective_buffer_params_.full_y;
+  const int max_x = effective_buffer_params_.width + min_x;
+  const int max_y = effective_buffer_params_.height + min_y;
+  const int offset = effective_buffer_params_.offset;
+  const int stride = effective_buffer_params_.stride;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  const blocked_range2d<int> range(min_x, max_x, min_y, max_y);
+
+  /* Filter in x direction. */
+  local_arena.execute([&]() {
+    parallel_for(range, [&](const blocked_range2d<int> r) {
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
+      for (int y = r.cols().begin(); y < r.cols().end(); ++y) {
+        for (int x = r.rows().begin(); x < r.rows().end(); ++x) {
+          kernels_.volume_guiding_filter_x(
+              kernel_globals, render_buffer, y, x, min_x, max_x, offset, stride);
+        }
+      }
+    });
+  });
+
+  /* Filter in y direction. Unlike `filter_x`, the inner loop of `filter_y` runs serially inside
+   * the kernel, which avoids the need for intermediate buffers. */
+  local_arena.execute([&]() {
+    parallel_for(min_x, max_x, [&](int x) {
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
+      kernels_.volume_guiding_filter_y(
+          kernel_globals, render_buffer, x, min_y, max_y, offset, stride);
+    });
+  });
+}
+
 #if defined(WITH_PATH_GUIDING)
 /* NOTE: It seems that this is called before every rendering iteration/progression and not once per
 * rendering. May be we find a way to call it only once per rendering. */
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -51,6 +51,7 @@ class PathTraceWorkCPU : public PathTraceWork {

  int adaptive_sampling_converge_filter_count_active(const float threshold, bool reset) override;
  void cryptomatte_postproces() override;
+  void denoise_volume_guiding_buffers() override;

 #if defined(WITH_PATH_GUIDING)
  /* Initializes the per-thread guiding kernel data. The function sets the pointers to the
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -1176,6 +1176,29 @@ void PathTraceWorkGPU::cryptomatte_postproces()
  queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
 }

+void PathTraceWorkGPU::denoise_volume_guiding_buffers()
+{
+  const DeviceKernelArguments args(&buffers_->buffer.device_pointer,
+                                   &effective_buffer_params_.full_x,
+                                   &effective_buffer_params_.full_y,
+                                   &effective_buffer_params_.width,
+                                   &effective_buffer_params_.height,
+                                   &effective_buffer_params_.offset,
+                                   &effective_buffer_params_.stride);
+
+  {
+    const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+    DCHECK_GT(work_size, 0);
+    queue_->enqueue(DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X, work_size, args);
+  }
+
+  {
+    const int work_size = effective_buffer_params_.width;
+    DCHECK_GT(work_size, 0);
+    queue_->enqueue(DEVICE_KERNEL_VOLUME_GUIDING_FILTER_Y, work_size, args);
+  }
+}
+
 bool PathTraceWorkGPU::copy_render_buffers_from_device()
 {
  /* May not exist if cancelled before rendering started. */
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -48,6 +48,7 @@ class PathTraceWorkGPU : public PathTraceWork {

  int adaptive_sampling_converge_filter_count_active(const float threshold, bool reset) override;
  void cryptomatte_postproces() override;
+  void denoise_volume_guiding_buffers() override;

 protected:
  void alloc_integrator_soa();
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -55,7 +55,12 @@ bool RenderScheduler::is_denoiser_gpu_used() const

 void RenderScheduler::set_limit_samples_per_update(const int limit_samples)
 {
-  limit_samples_per_update_ = limit_samples;
+  if (limit_samples_per_update_) {
+    limit_samples_per_update_ = min(limit_samples_per_update_, limit_samples);
+  }
+  else {
+    limit_samples_per_update_ = limit_samples;
+  }
 }

 void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
@@ -169,6 +174,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params)
  adaptive_filter_time_.reset();
  display_update_time_.reset();
  rebalance_time_.reset();
+  volume_guiding_denoise_time_.reset();
 }

 void RenderScheduler::reset_for_next_tile()
@@ -547,6 +553,23 @@ void RenderScheduler::report_denoise_time(const RenderWork &render_work, const d
  LOG_WORK << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
 }

+void RenderScheduler::report_volume_guiding_denoise_time(const RenderWork &render_work,
+                                                         const double time)
+{
+  volume_guiding_denoise_time_.add_wall(time);
+
+  const double final_time_approx = approximate_final_time(render_work, time);
+
+  if (work_report_reset_average(render_work)) {
+    volume_guiding_denoise_time_.reset_average();
+  }
+
+  volume_guiding_denoise_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+  LOG_WORK << "Average volume guiding denoising time: "
+           << volume_guiding_denoise_time_.get_average() << " seconds.";
+}
+
 void RenderScheduler::report_display_update_time(const RenderWork &render_work, const double time)
 {
  display_update_time_.add_wall(time);
@@ -963,6 +986,20 @@ float RenderScheduler::work_adaptive_threshold() const
  return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
 }

+bool RenderScheduler::volume_guiding_need_denoise() const
+{
+  if (!is_power_of_two(get_num_rendered_samples())) {
+    return false;
+  }
+
+  if (done()) {
+    /* No need to denoise after the last sample. */
+    return false;
+  }
+
+  return true;
+}
+
 bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
 {
  delayed = false;
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -207,6 +207,9 @@ class RenderScheduler {
  void report_rebalance_time(const RenderWork &render_work,
                             const double time,
                             bool balance_changed);
+  void report_volume_guiding_denoise_time(const RenderWork &render_work, const double time);
+
+  bool volume_guiding_need_denoise() const;

  /* Generate full multi-line report of the rendering process, including rendering parameters,
   * times, and so on. */
@@ -435,6 +438,7 @@ class RenderScheduler {
  TimeWithAverage denoise_time_;
  TimeWithAverage display_update_time_;
  TimeWithAverage rebalance_time_;
+  TimeWithAverage volume_guiding_denoise_time_;

  /* Whether cryptomatte-related work will be scheduled. */
  bool need_schedule_cryptomatte_ = false;
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -283,6 +283,7 @@ set(SRC_KERNEL_FILM_HEADERS
  film/aov_passes.h
  film/data_passes.h
  film/denoising_passes.h
+  film/volume_guiding_denoise.h
  film/cryptomatte_passes.h
  film/light_passes.h
  film/read.h
--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -103,6 +103,12 @@ KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_direct)
 KERNEL_STRUCT_MEMBER(film, int, pass_glossy_direct)
 KERNEL_STRUCT_MEMBER(film, int, pass_transmission_direct)
 KERNEL_STRUCT_MEMBER(film, int, pass_volume_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_scatter)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_scatter_denoised)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_transmit)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_transmit_denoised)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_majorant)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_majorant_sample_count)
 KERNEL_STRUCT_MEMBER(film, int, pass_emission)
 KERNEL_STRUCT_MEMBER(film, int, pass_background)
 KERNEL_STRUCT_MEMBER(film, int, pass_ao)
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -50,6 +50,7 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
 KERNEL_FILM_CONVERT_FUNCTION(depth)
 KERNEL_FILM_CONVERT_FUNCTION(mist)
 KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+KERNEL_FILM_CONVERT_FUNCTION(volume_majorant)
 KERNEL_FILM_CONVERT_FUNCTION(float)

 KERNEL_FILM_CONVERT_FUNCTION(light_path)
@@ -123,4 +124,24 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobal
                                                        ccl_global float *render_buffer,
                                                        int pixel_index);

+/* --------------------------------------------------------------------
+ * Volume Scattering Probability Guiding: denoising filters, applied along x and then along y.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(volume_guiding_filter_x)(const ThreadKernelGlobalsCPU *kg,
+                                                        ccl_global float *render_buffer,
+                                                        const int y,
+                                                        const int center_x,
+                                                        const int min_x,
+                                                        const int max_x,
+                                                        const int offset,
+                                                        int stride);
+void KERNEL_FUNCTION_FULL_NAME(volume_guiding_filter_y)(const ThreadKernelGlobalsCPU *kg,
+                                                        ccl_global float *render_buffer,
+                                                        const int x,
+                                                        const int min_y,
+                                                        const int max_y,
+                                                        const int offset,
+                                                        int stride);
+
 #undef KERNEL_ARCH
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -29,6 +29,7 @@
 #    include "kernel/film/adaptive_sampling.h"
 #    include "kernel/film/cryptomatte_passes.h"
 #    include "kernel/film/read.h"
+#    include "kernel/film/volume_guiding_denoise.h"

 #    include "kernel/bake/bake.h"

@@ -243,6 +244,56 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobal
 #endif
 }

+/* --------------------------------------------------------------------
+ * Volume Scattering Probability Guiding.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(volume_guiding_filter_x)(const ThreadKernelGlobalsCPU *kg,
+                                                        ccl_global float *render_buffer,
+                                                        const int y,
+                                                        const int center_x,
+                                                        const int min_x,
+                                                        const int max_x,
+                                                        const int offset,
+                                                        const int stride)
+{
+#ifdef KERNEL_STUB
+  STUB_ASSERT(KERNEL_ARCH, volume_guiding_filter_x);
+  (void)kg;
+  (void)render_buffer;
+  (void)y;
+  (void)center_x;
+  (void)min_x;
+  (void)max_x;
+  (void)offset;
+  (void)stride;
+#else
+  volume_guiding_filter_x(kg, render_buffer, y, center_x, min_x, max_x, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(volume_guiding_filter_y)(const ThreadKernelGlobalsCPU *kg,
+                                                        ccl_global float *render_buffer,
+                                                        const int x,
+                                                        const int min_y,
+                                                        const int max_y,
+                                                        const int offset,
+                                                        const int stride)
+{
+#ifdef KERNEL_STUB
+  STUB_ASSERT(KERNEL_ARCH, volume_guiding_filter_y);
+  (void)kg;
+  (void)render_buffer;
+  (void)x;
+  (void)min_y;
+  (void)max_y;
+  (void)offset;
+  (void)stride;
+#else
+  volume_guiding_filter_y(kg, render_buffer, x, min_y, max_y, offset, stride);
+#endif
+}
+
 /* --------------------------------------------------------------------
 * Film Convert.
 */
@@ -319,6 +370,7 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobal
 KERNEL_FILM_CONVERT_FUNCTION(depth, true)
 KERNEL_FILM_CONVERT_FUNCTION(mist, true)
 KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
+KERNEL_FILM_CONVERT_FUNCTION(volume_majorant, true)
 KERNEL_FILM_CONVERT_FUNCTION(float, true)

 KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -42,6 +42,7 @@
 #include "kernel/bake/bake.h"

 #include "kernel/film/adaptive_sampling.h"
+#include "kernel/film/volume_guiding_denoise.h"

 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_end.h"
@@ -885,6 +886,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
 /* 1 channel inputs */
 KERNEL_FILM_CONVERT_VARIANT(depth, 1)
 KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(volume_majorant, 1)
 KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
 KERNEL_FILM_CONVERT_VARIANT(float, 1)

@@ -1199,3 +1201,47 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  }
 }
 ccl_gpu_kernel_postfix
+
+/* --------------------------------------------------------------------
+ * Volume Scattering Probability Guiding.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    ccl_gpu_kernel_signature(volume_guiding_filter_x,
+                             ccl_global float *render_buffer,
+                             const int sx,
+                             const int sy,
+                             const int sw,
+                             const int sh,
+                             const int offset,
+                             const int stride)
+{
+  const int work_index = ccl_gpu_global_id_x(); /* Linear index into the sw x sh region. */
+  const int y = work_index / sw;
+  const int x = work_index % sw;
+  /* Work items beyond the last row have no pixel to filter. */
+  if (y < sh) {
+    ccl_gpu_kernel_call(volume_guiding_filter_x(
+        nullptr, render_buffer, sy + y, sx + x, sx, sx + sw, offset, stride));
+  }
+}
+ccl_gpu_kernel_postfix
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    ccl_gpu_kernel_signature(volume_guiding_filter_y,
+                             ccl_global float *render_buffer,
+                             const int sx,
+                             const int sy,
+                             const int sw,
+                             const int sh,
+                             const int offset,
+                             const int stride)
+{
+  const int x = ccl_gpu_global_id_x(); /* One work item per column of the region. */
+  /* Each in-range work item filters its column over rows [sy, sy + sh). */
+  if (x < sw) {
+    ccl_gpu_kernel_call(
+        volume_guiding_filter_y(nullptr, render_buffer, sx + x, sy, sy + sh, offset, stride));
+  }
+}
+ccl_gpu_kernel_postfix
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -603,6 +603,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum);
          break;
        }
+        case DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_volume_guiding_filter_x);
+          break;
+        }
+        case DEVICE_KERNEL_VOLUME_GUIDING_FILTER_Y: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_volume_guiding_filter_y);
+          break;
+        }

        /* clang-format off */
    #  define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
@@ -621,6 +631,7 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,

      DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH);
      DEVICE_KERNEL_FILM_CONVERT(mist, MIST);
+      DEVICE_KERNEL_FILM_CONVERT(volume_majorant, VOLUME_MAJORANT);
      DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
      DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
      DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
--- a/intern/cycles/kernel/film/light_passes.h
+++ b/intern/cycles/kernel/film/light_passes.h
@@ -197,6 +197,26 @@ ccl_device void film_write_adaptive_buffer(KernelGlobals kg,
  }
 }

+/* Write `contribution` to the VSPG transmit or scatter pass selected by `path_flag`. */
+ccl_device_inline void film_write_volume_scattering_guiding_pass(KernelGlobals kg,
+                                                                 ccl_global float *ccl_restrict
+                                                                     buffer,
+                                                                 const uint32_t path_flag,
+                                                                 const Spectrum contribution)
+{
+  /* The primary transmit flag takes precedence over the scatter flag. */
+  const bool is_transmit = (path_flag & PATH_RAY_VOLUME_PRIMARY_TRANSMIT) != 0;
+  const bool is_scatter = (path_flag & PATH_RAY_VOLUME_SCATTER) != 0;
+  if (!(is_transmit || is_scatter)) {
+    return;
+  }
+  const int target_pass = is_transmit ? kernel_data.film.pass_volume_transmit :
+                                        kernel_data.film.pass_volume_scatter;
+  if (target_pass != PASS_UNUSED) {
+    film_write_pass_spectrum(buffer + target_pass, contribution);
+  }
+}
+
 /* --------------------------------------------------------------------
 * Shadow catcher.
 */
@@ -337,6 +357,7 @@ ccl_device_inline void film_write_combined_pass(KernelGlobals kg,
  }

  film_write_adaptive_buffer(kg, sample, contribution, buffer);
+  film_write_volume_scattering_guiding_pass(kg, buffer, path_flag, contribution);
 }

 /* Write combined pass with transparency. */
@@ -361,6 +382,7 @@ ccl_device_inline void film_write_combined_transparent_pass(KernelGlobals kg,
  }

  film_write_adaptive_buffer(kg, sample, contribution, buffer);
+  film_write_volume_scattering_guiding_pass(kg, buffer, path_flag, contribution);
 }

 /* Write background or emission to appropriate pass. */
@@ -575,6 +597,12 @@ ccl_device_inline void film_write_transparent(KernelGlobals kg,
 #ifdef __SHADOW_CATCHER__
  film_write_shadow_catcher_transparent_only(kg, path_flag, transparent, buffer);
 #endif
+
+  if (path_flag & PATH_RAY_VOLUME_PRIMARY_TRANSMIT) {
+    kernel_assert(kernel_data.film.pass_volume_transmit != PASS_UNUSED);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_volume_transmit,
+                             make_spectrum(transparent));
+  }
 }

 /* Write holdout to render buffer. */
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -154,6 +154,23 @@ ccl_device_inline void film_get_pass_pixel_sample_count(
  pixel[0] = __float_as_uint(f) * kfilm_convert->scale;
 }

+ccl_device_inline void film_get_pass_pixel_volume_majorant(
+    const ccl_global KernelFilmConvert *ccl_restrict kfilm_convert,
+    const ccl_global float *ccl_restrict buffer,
+    ccl_private float *ccl_restrict pixel)
+{
+  kernel_assert(kfilm_convert->num_components >= 1);
+  kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+  /* Shown as exp(-(value * exposure scale) / divisor), or zero when the divisor pass is zero. */
+  const float exposure_scale = film_get_scale_exposure(kfilm_convert, buffer);
+  const float depth_sum = buffer[kfilm_convert->pass_offset];
+  const float divisor = buffer[kfilm_convert->pass_divide];
+  const bool has_divisor = (divisor != 0.0f);
+
+  pixel[0] = has_divisor ? expf(-(depth_sum * exposure_scale) / divisor) : 0.0f;
+}
+
 ccl_device_inline void film_get_pass_pixel_float(const ccl_global KernelFilmConvert *ccl_restrict
                                                     kfilm_convert,
                                                 const ccl_global float *ccl_restrict buffer,
--- a/intern/cycles/kernel/film/volume_guiding_denoise.h
+++ b/intern/cycles/kernel/film/volume_guiding_denoise.h
@@ -0,0 +1,155 @@
+/* SPDX-FileCopyrightText: 2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#pragma once
+
+#include "kernel/film/write.h"
+
+/* Denoise volume scattering probability guiding buffers. */
+
+CCL_NAMESPACE_BEGIN
+
+/* Two-pass Gaussian filter, x pass: blur per-sample |scatter| and |transmit| along row y. */
+ccl_device void volume_guiding_filter_x(KernelGlobals kg,
+                                        ccl_global float *render_buffer,
+                                        const int y,
+                                        const int center_x,
+                                        const int min_x,
+                                        const int max_x,
+                                        const int offset,
+                                        const int stride)
+{
+  kernel_assert(kernel_data.film.pass_volume_scatter != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+  const int radius = 5;
+  const int filter_width = radius * 2 + 1;
+
+  /* sigma = 1.5 with integral according to
+   * https://lisyarus.github.io/blog/posts/blur-coefficients-generator.html
+   * https://bartwronski.com/2021/10/31/practical-gaussian-filter-binomial-filter-and-small-sigma-gaussians/
+   */
+  const float gaussian_params[filter_width] = {0.0012273699895602f,
+                                               0.0084674212370284f,
+                                               0.0379843612914121f,
+                                               0.1108921888487800f,
+                                               0.2108379677336155f,
+                                               0.2611813817992076f,
+                                               0.2108379677336155f,
+                                               0.1108921888487800f,
+                                               0.0379843612914121f,
+                                               0.0084674212370284f,
+                                               0.0012273699895602f};
+
+  ccl_global float *buffer = film_pass_pixel_render_buffer(
+      kg, center_x, y, offset, stride, render_buffer);
+
+  /* Apply Gaussian filter in x direction; `buffer` stays the center pixel for the final write. */
+  float3 scatter = zero_float3(), transmit = zero_float3();
+  for (int dx = 0; dx < filter_width; dx++) {
+    const int x = center_x + dx - radius;
+    if (x < min_x || x >= max_x) {
+      /* Ignore boundary pixels; the remaining weights are not renormalized. */
+      continue;
+    }
+
+    const ccl_global float *neighbor = film_pass_pixel_render_buffer(
+        kg, x, y, offset, stride, render_buffer);
+
+    const float weight = gaussian_params[dx] /
+                         __float_as_uint(neighbor[kernel_data.film.pass_sample_count]);
+
+    scatter += fabs(kernel_read_pass_float3(neighbor + kernel_data.film.pass_volume_scatter)) *
+               weight;
+    transmit += fabs(kernel_read_pass_float3(neighbor + kernel_data.film.pass_volume_transmit)) *
+                weight;
+  }
+
+  /* Write the filtered values to the denoised passes of the center pixel. */
+  film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
+  film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
+}
+
+ccl_device void volume_guiding_filter_y(KernelGlobals kg,
+                                        ccl_global float *render_buffer,
+                                        const int x,
+                                        const int min_y,
+                                        const int max_y,
+                                        const int offset,
+                                        const int stride)
+{
+  kernel_assert(kernel_data.film.pass_volume_scatter != PASS_UNUSED);
+
+  const int radius = 5;
+  const int filter_width = radius * 2 + 1;
+
+  const float gaussian_params[filter_width] = {0.0012273699895602f,
+                                               0.0084674212370284f,
+                                               0.0379843612914121f,
+                                               0.1108921888487800f,
+                                               0.2108379677336155f,
+                                               0.2611813817992076f,
+                                               0.2108379677336155f,
+                                               0.1108921888487800f,
+                                               0.0379843612914121f,
+                                               0.0084674212370284f,
+                                               0.0012273699895602f};
+
+  /* Ring buffer of x-filtered neighbors, needed because column x is overwritten in place. */
+  float3 scatter_neighbors[filter_width], transmit_neighbors[filter_width];
+
+  /* Initialize the first `radius` slots with rows min_y .. min_y + radius - 1. */
+  for (int i = 0; i < filter_width; i++) {
+    const int y = min_y + i;
+    if (i >= radius || y < min_y || y >= max_y) {
+      /* Remaining slots, and rows at or past max_y, start as zero. */
+      scatter_neighbors[i] = transmit_neighbors[i] = zero_float3();
+    }
+    else {
+      ccl_global float *buffer = film_pass_pixel_render_buffer(
+          kg, x, y, offset, stride, render_buffer);
+      scatter_neighbors[i] = kernel_read_pass_float3(
+          buffer + kernel_data.film.pass_volume_scatter_denoised);
+      transmit_neighbors[i] = kernel_read_pass_float3(
+          buffer + kernel_data.film.pass_volume_transmit_denoised);
+    }
+  }
+
+  /* Apply Gaussian filter in y direction. */
+  int index = radius;
+  for (int y = min_y; y < max_y; y++) {
+    /* Fetch the furthest neighbor ahead of the current row (y + radius), or zero if outside. */
+    const int next_y = y + radius;
+    if (next_y < min_y || next_y >= max_y) {
+      scatter_neighbors[index] = zero_float3();
+      transmit_neighbors[index] = zero_float3();
+    }
+    else {
+      ccl_global float *buffer = film_pass_pixel_render_buffer(
+          kg, x, next_y, offset, stride, render_buffer);
+      scatter_neighbors[index] = kernel_read_pass_float3(
+          buffer + kernel_data.film.pass_volume_scatter_denoised);
+      transmit_neighbors[index] = kernel_read_pass_float3(
+          buffer + kernel_data.film.pass_volume_transmit_denoised);
+    }
+
+    /* Advance the ring buffer so that `index` is the slot of the oldest row (y - radius). */
+    index = (index + 1) % filter_width;
+
+    /* Apply convolution. */
+    float3 scatter = zero_float3(), transmit = zero_float3();
+    for (int i = 0; i < filter_width; i++) {
+      scatter += gaussian_params[i] * scatter_neighbors[(index + i) % filter_width];
+      transmit += gaussian_params[i] * transmit_neighbors[(index + i) % filter_width];
+    }
+
+    /* Overwrite row y; its unfiltered value is already held in the ring buffer. */
+    ccl_global float *buffer = film_pass_pixel_render_buffer(
+        kg, x, y, offset, stride, render_buffer);
+    film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
+    film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
+  }
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/film/write.h
+++ b/intern/cycles/kernel/film/write.h
@@ -39,6 +39,18 @@ ccl_device_forceinline ccl_global float *film_pass_pixel_render_buffer_shadow(
  return render_buffer + render_buffer_offset;
 }

+ccl_device_forceinline ccl_global float *film_pass_pixel_render_buffer(
+    KernelGlobals kg,
+    const int x,
+    const int y,
+    const int offset,
+    const int stride,
+    ccl_global float *ccl_restrict render_buffer)
+{
+  /* Pixel index is computed in `int`, then widened before scaling by the pass stride. */
+  return render_buffer + (uint64_t)(offset + x + y * stride) * kernel_data.film.pass_stride;
+}
+
 /* Accumulate in passes. */

 ccl_device_inline void film_write_pass_float(ccl_global float *ccl_restrict buffer,
@@ -120,7 +132,7 @@ ccl_device_inline float kernel_read_pass_float(const ccl_global float *ccl_restr
  return *buffer;
 }

-ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer)
+ccl_device_inline float3 kernel_read_pass_float3(const ccl_global float *ccl_restrict buffer)
 {
  return make_float3(buffer[0], buffer[1], buffer[2]);
 }
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -236,7 +236,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
      integrator_path_next(state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
    }
    else {
-      integrator_path_terminate(state, current_kernel);
+      integrator_path_terminate(kg, state, render_buffer, current_kernel);
    }
    return;
  }
@@ -276,14 +276,14 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
 #endif
      }
      else {
-        integrator_path_terminate(state, current_kernel);
+        integrator_path_terminate(kg, state, render_buffer, current_kernel);
      }
    }
  }
  else {
    /* Nothing hit, continue with background kernel. */
    if (integrator_intersect_skip_lights(kg, state)) {
-      integrator_path_terminate(state, current_kernel);
+      integrator_path_terminate(kg, state, render_buffer, current_kernel);
    }
    else {
      integrator_path_next(state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
@@ -338,7 +338,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
  }
  /* Nothing hit, continue with background kernel. */
  if (integrator_intersect_skip_lights(kg, state)) {
-    integrator_path_terminate(state, current_kernel);
+    integrator_path_terminate(kg, state, render_buffer, current_kernel);
  }
  else {
    integrator_path_next(state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
--- a/intern/cycles/kernel/integrator/intersect_subsurface.h
+++ b/intern/cycles/kernel/integrator/intersect_subsurface.h
@@ -18,7 +18,7 @@ ccl_device void integrator_intersect_subsurface(KernelGlobals kg, IntegratorStat
  }
 #endif

-  integrator_path_terminate(state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+  integrator_path_terminate(kg, state, nullptr, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -62,6 +62,7 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
  INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX;
  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
  INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput;
+  INTEGRATOR_STATE_WRITE(state, path, optical_depth) = 0.0f;
 #if defined(__PATH_GUIDING__)
  if ((kernel_data.kernel_features & KERNEL_FEATURE_PATH_GUIDING)) {
    INTEGRATOR_STATE_WRITE(state, path, unguided_throughput) = 1.0f;
@@ -159,6 +160,10 @@ ccl_device_inline void path_state_next(KernelGlobals kg,
    if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
      flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
    }
+
+    if (bounce == 1) {
+      flag &= ~PATH_RAY_VOLUME_PRIMARY_TRANSMIT;
+    }
  }
  else
 #endif
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -204,7 +204,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
  }
 #endif

-  integrator_path_terminate(state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+  integrator_path_terminate(kg, state, render_buffer, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -80,7 +80,7 @@ ccl_device void integrator_shade_light(KernelGlobals kg,
  INTEGRATOR_STATE_WRITE(state, path, transparent_bounce) = transparent_bounce;

  if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
-    integrator_path_terminate(state, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+    integrator_path_terminate(kg, state, render_buffer, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
    return;
  }

--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -847,7 +847,7 @@ ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg,
 {
  const int continue_path_label = integrate_surface<node_feature_mask>(kg, state, render_buffer);
  if (continue_path_label == LABEL_NONE) {
-    integrator_path_terminate(state, current_kernel);
+    integrator_path_terminate(kg, state, render_buffer, current_kernel);
    return;
  }

--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -458,19 +458,13 @@ ccl_device_inline bool volume_octree_advance(KernelGlobals kg,
                                             const IntegratorGenericState state,
                                             const ccl_private RNGState *rng_state,
                                             const uint32_t path_flag,
-                                             ccl_private OctreeTracing &octree,
-                                             const int step)
+                                             ccl_private OctreeTracing &octree)
 {
  if (octree.t.max >= ray->tmax) {
    /* Reached the last segment. */
    return false;
  }

-  if (step >= VOLUME_MAX_STEPS) {
-    /* Exceeds maximal steps. */
-    return false;
-  }
-
  if (octree.next_scale > MANTISSA_BITS) {
    if (fabsf(octree.t.max - ray->tmax) <= OVERLAP_EXP) {
      /* This could happen due to numerical issues, when the bounding box overlaps with a
@@ -530,7 +524,7 @@ ccl_device_inline bool volume_octree_advance_shadow(KernelGlobals kg,
  const float tmin = octree.t.min;

  while (octree.t.is_empty() || sigma.range() * octree.t.length() < 1.0f) {
-    if (!volume_octree_advance<true>(kg, ray, sd, state, rng_state, path_flag, octree, 0)) {
+    if (!volume_octree_advance<true>(kg, ray, sd, state, rng_state, path_flag, octree)) {
      return !octree.t.is_empty();
    }

@@ -766,24 +760,37 @@ ccl_device_inline bool volume_valid_direct_ray_segment(KernelGlobals kg,
 /* Volume Integration */

 struct VolumeIntegrateState {
-  /* Random numbers for scattering. */
+  /* Random number. */
  float rscatter;
-  float rchannel;

-  /* Multiple importance sampling. */
+  /* Method used for sampling direct scatter position. */
  VolumeSampleMethod direct_sample_method;
-  bool use_mis;
  /* Probability of sampling the scatter position using null scattering. */
  float distance_pdf;
  /* Probability of sampling the scatter position using equiangular sampling. */
  float equiangular_pdf;
+  /* Majorant density at the equiangular scatter position. Used to compute the pdf. */
+  float sigma_max;

  /* Ratio tracking estimator of the volume transmittance, with MIS applied. */
  float transmittance;
+  /* Current shading position. */
+  float t;
+  /* Majorant optical depth until now. */
+  float optical_depth;
  /* Steps taken while tracking. Should not exceed `VOLUME_MAX_STEPS`. */
-  int step;
+  uint16_t step;
+  /* Multiple importance sampling. */
+  bool use_mis;

-  bool stop;
+  /* Volume scattering probability guiding. */
+  bool vspg;
+  /* The guided probability that the ray is scattered in the volume. `P_vol` in the paper. */
+  float scatter_prob;
+  /* Minimal scale of majorant for achieving the desired scatter probability. */
+  float majorant_scale;
+  /* Scale to apply after direct throughput due to Russian Roulette. */
+  float direct_rr_scale;

  /* Extra fields for path guiding and denoising. */
  Spectrum emission;
@@ -792,14 +799,432 @@ struct VolumeIntegrateState {
 #  endif
 };

-ccl_device bool volume_integrate_should_stop(const ccl_private VolumeIntegrateResult &result,
-                                             const ccl_private VolumeIntegrateState &vstate)
+/* Accumulate transmittance for equiangular distance sampling without MIS. Using telescoping to
+ * reduce noise. */
+ccl_device_inline void volume_equiangular_transmittance(
+    KernelGlobals kg,
+    const IntegratorState state,
+    const ccl_private Ray *ccl_restrict ray,
+    const ccl_private Extrema<float> &sigma,
+    const ccl_private Interval<float> &interval,
+    ccl_private ShaderData *ccl_restrict sd,
+    const ccl_private RNGState *rng_state,
+    const ccl_private VolumeIntegrateState &ccl_restrict vstate,
+    ccl_private VolumeIntegrateResult &ccl_restrict result)
 {
-  if (result.indirect_scatter && result.direct_scatter) {
+  if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR || vstate.use_mis ||
+      result.direct_scatter)
+  {
+    return;
+  }
+
+  Interval<float> t;
+  if (interval.contains(result.direct_t)) {
+    /* Compute transmittance until the direct scatter position. */
+    t = {interval.min, result.direct_t};
+    result.direct_scatter = true;
+  }
+  else {
+    /* Compute transmittance of the whole segment. */
+    t = interval;
+  }
+
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+  result.direct_throughput *= volume_transmittance<false>(
+      kg, state, ray, sd, sigma.range(), t, rng_state, path_flag);
+}
+
+/* Sample the next candidate indirect scatter position following exponential distribution,
+ * and compute the direct throughput for equiangular sampling if using MIS.
+ * Returns true if should continue advancing. */
+ccl_device_inline bool volume_indirect_scatter_advance(const ccl_private OctreeTracing &octree,
+                                                       const bool equiangular,
+                                                       ccl_private float &residual_optical_depth,
+                                                       ccl_private VolumeIntegrateState &vstate,
+                                                       ccl_private VolumeIntegrateResult &result)
+{
+  const float sigma_max = octree.sigma.max * vstate.majorant_scale;
+  residual_optical_depth = (octree.t.max - vstate.t) * sigma_max;
+  if (sigma_max == 0.0f) {
    return true;
  }

-  return vstate.stop;
+  vstate.t += sample_exponential_distribution(vstate.rscatter, 1.0f / sigma_max);
+
+  const bool segment_has_equiangular = equiangular && octree.t.contains(result.direct_t);
+  if (segment_has_equiangular && vstate.t > result.direct_t && !result.direct_scatter) {
+    /* Stepped beyond the equiangular scatter position, compute direct throughput. */
+    result.direct_scatter = true;
+    result.direct_throughput = result.indirect_throughput * vstate.transmittance *
+                               vstate.direct_rr_scale;
+    vstate.distance_pdf = vstate.transmittance * sigma_max;
+    vstate.sigma_max = sigma_max;
+  }
+
+  /* Sampled a position outside the current voxel. */
+  return vstate.t > octree.t.max;
+}
+
+/* Advance to the next candidate indirect scatter position, and compute the direct throughput. */
+ccl_device_inline bool volume_integrate_advance(KernelGlobals kg,
+                                                const ccl_private Ray *ccl_restrict ray,
+                                                ccl_private ShaderData *ccl_restrict sd,
+                                                const IntegratorState state,
+                                                ccl_private RNGState *rng_state,
+                                                const uint32_t path_flag,
+                                                ccl_private OctreeTracing &octree,
+                                                ccl_private VolumeIntegrateState &vstate,
+                                                ccl_private VolumeIntegrateResult &result)
+{
+  if (vstate.step++ > VOLUME_MAX_STEPS) {
+    /* Exceeds maximal steps, stop advancing. */
+    return false;
+  }
+
+  float residual_optical_depth; /* Set by `volume_indirect_scatter_advance()`. */
+  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SCATTER_DISTANCE);
+  const bool equiangular = (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) &&
+                           vstate.use_mis;
+
+  while (
+      volume_indirect_scatter_advance(octree, equiangular, residual_optical_depth, vstate, result))
+  {
+    /* Advance to the next voxel if the sampled distance is beyond the current voxel. */
+    if (!volume_octree_advance<false>(kg, ray, sd, state, rng_state, path_flag, octree)) {
+      return false;
+    }
+
+    vstate.optical_depth += octree.sigma.max * octree.t.length();
+    vstate.t = octree.t.min;
+    volume_equiangular_transmittance(
+        kg, state, ray, octree.sigma, octree.t, sd, rng_state, vstate, result);
+
+    /* Rescale the random number by the optical depth left in the previous voxel, to reuse it. */
+    vstate.rscatter = saturatef(1.0f - (1.0f - vstate.rscatter) * expf(residual_optical_depth));
+  }
+
+  /* Advance random number offset. */
+  rng_state->rng_offset += PRNG_BOUNCE_NUM;
+
+  return true;
+}
+
+/* -------------------------------------------------------------------- */
+/** \name Volume Scattering Probability Guiding
+ *
+ * Following https://kehanxuuu.github.io/vspg-website/ by Kehan Xu et al.
+ *
+ * Instead of stopping at the first real scatter event, we step through the entire ray to gather
+ * candidate scatter positions, and guide the probability of scattering inside a volume or
+ * transmitting through the volume by the contribution of both types of events.
+ *
+ * We only guide primary rays; secondary rays could be supported through OpenPGL in the future.
+ * \{ */
+
+/* Candidate scatter position for VSPG, as stored in `VolumeSampleReservoir`. */
+struct VolumeSampleCandidate {
+  PackedSpectrum emission;
+  float t; /* Copied to `result.indirect_t` when the candidate is used. */
+  PackedSpectrum throughput;
+  float distance_pdf;
+#  ifdef __DENOISING_FEATURES__
+  PackedSpectrum albedo;
+#  endif
+  /* Remember the random number state so stochastic evaluation can reproduce the sample point. */
+  uint lcg_state;
+};
+
+/* Reservoir for VSPG that keeps one candidate, picked by comparing `rand` to weight / total. */
+struct VolumeSampleReservoir {
+  float total_weight = 0.0f;
+  float rand;
+  VolumeSampleCandidate candidate;
+
+  ccl_device_inline_method VolumeSampleReservoir(const float rand_) : rand(rand_) {}
+
+  /* Stream a candidate through the reservoir; non-positive and NaN weights are ignored. */
+  ccl_device_inline_method void add_sample(const float weight,
+                                           const VolumeSampleCandidate new_candidate)
+  {
+    if (!(weight > 0.0f)) {
+      return;
+    }
+
+    total_weight += weight;
+    const float thresh = weight / total_weight;
+
+    if ((rand <= thresh) || (total_weight == weight)) {
+      /* Explicitly select the first candidate in case of numerical issues. */
+      candidate = new_candidate;
+      rand /= thresh; /* Stretch the accepted range back to 0..1 for reuse. */
+    }
+    else {
+      rand = (rand - thresh) / (1.0f - thresh); /* Same for the rejected range. */
+    }
+
+    /* Ensure the `rand` is always within 0..1 range, which could be violated above when
+     * `-ffast-math` is used. */
+    rand = saturatef(rand);
+  }
+
+  ccl_device_inline_method bool is_empty() const
+  {
+    return total_weight == 0.0f;
+  }
+};
+
+/* Mean majorant optical depth `\sum\sigma_{max}t` along the ray, computed from the sum and the
+ * sample count that previous samples accumulated in this pixel of the render buffer. */
+ccl_device_inline float volume_majorant_optical_depth(KernelGlobals kg,
+                                                      const ccl_global float *buffer)
+{
+  kernel_assert(kernel_data.film.pass_volume_majorant != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_volume_majorant_sample_count != PASS_UNUSED);
+
+  const float num_samples = buffer[kernel_data.film.pass_volume_majorant_sample_count];
+  if (num_samples == 0.0f) {
+    /* Nothing recorded yet: assume `FLT_MAX` as the unknown optical depth. */
+    return FLT_MAX;
+  }
+  return buffer[kernel_data.film.pass_volume_majorant] / num_samples;
+}
+
+/* Compute guided volume scatter probability and the majorant scale needed for achieving the
+ * scatter probability, for heterogeneous volume. Results go to `vstate`. */
+ccl_device_inline void volume_scatter_probability_get(KernelGlobals kg,
+                                                      const IntegratorState state,
+                                                      ccl_global float *ccl_restrict render_buffer,
+                                                      ccl_private VolumeIntegrateState &vstate)
+{
+  /* Only guide primary rays. */
+  vstate.vspg = (INTEGRATOR_STATE(state, path, bounce) == 0);
+
+  if (!vstate.vspg) {
+    vstate.scatter_prob = 1.0f;
+    vstate.majorant_scale = 1.0f;
+    return;
+  }
+
+  const ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+
+  kernel_assert(kernel_data.film.pass_volume_scatter_denoised != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_volume_transmit_denoised != PASS_UNUSED);
+
+  /* Contribution based criterion, see Eq. (15). Channels of each denoised pass are summed. */
+  const float L_scattered = reduce_add(
+      kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised));
+  const float L_transmitted = reduce_add(
+      kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised));
+  const float L_volume = L_transmitted + L_scattered;
+
+  /* Compute guided scattering probability. */
+  if (L_volume == 0.0f) {
+    /* Equal probability if no information gathered yet. */
+    vstate.scatter_prob = 0.5f;
+  }
+  else {
+    /* Exponential distribution has non-zero probability beyond the boundary, so the scatter
+     * probability can never reach 1. Clamp to avoid scaling the majorant to infinity. */
+    vstate.scatter_prob = fminf(L_scattered / L_volume, 0.9999f);
+  }
+
+  const float optical_depth = volume_majorant_optical_depth(kg, buffer);
+
+  /* With majorant optical depth `tau`, no scatter event is sampled with probability `exp(-tau)`.
+   * If reaching the target needs a majorant scale >= 1, scale the majorant and use probability 1.
+   * Otherwise keep the majorant and divide the target by `1 - exp(-tau)`. See Eq (25,26). */
+  vstate.majorant_scale = (optical_depth == 0.0f) ?
+                              1.0f :
+                              -fast_logf(1.0f - vstate.scatter_prob) / optical_depth;
+  if (vstate.majorant_scale < 1.0f) {
+    vstate.majorant_scale = 1.0f;
+    vstate.scatter_prob = safe_divide(vstate.scatter_prob, 1.0f - fast_expf(-optical_depth));
+  }
+  else {
+    vstate.scatter_prob = 1.0f;
+  }
+}
+
+/* Final guiding decision on sampling scatter or transmit event.
+ *
+ * Does nothing if the reservoir is empty.
+ *
+ * Without guiding (`vstate.vspg` is false), the reservoir candidate is copied to `result` and
+ * `vstate` as is.
+ *
+ * With guiding, the scatter candidate is resampled against a transmitted candidate, using a
+ * scatter probability blended from the reservoir weight and `vstate.scatter_prob`. The indirect
+ * throughput of the chosen candidate is rescaled by the ratio of the unguided to the guided
+ * probability of that choice. If a scatter position is needed, the shader is evaluated again
+ * there to copy the phase functions into `result`. */
+ccl_device_inline void volume_distance_sampling_finalize(
+    KernelGlobals kg,
+    const IntegratorState state,
+    const ccl_private Ray *ccl_restrict ray,
+    ccl_private ShaderData *ccl_restrict sd,
+    ccl_private VolumeIntegrateState &ccl_restrict vstate,
+    ccl_private VolumeIntegrateResult &ccl_restrict result,
+    ccl_private VolumeSampleReservoir &reservoir)
+{
+  if (reservoir.is_empty()) {
+    return;
+  }
+
+  const bool sample_distance = !(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_TERMINATE) &&
+                               (vstate.direct_sample_method == VOLUME_SAMPLE_DISTANCE);
+
+  if (!vstate.vspg) {
+    result.indirect_throughput = reservoir.candidate.throughput;
+    vstate.emission = reservoir.candidate.emission;
+#  ifdef __DENOISING_FEATURES__
+    vstate.albedo = reservoir.candidate.albedo;
+#  endif
+    result.indirect_t = reservoir.candidate.t;
+
+    if (sample_distance) {
+      /* If using distance sampling for direct light, just copy parameters of indirect light
+       * since we scatter at the same point. */
+      result.direct_scatter = true;
+      result.direct_t = result.indirect_t;
+      result.direct_throughput = result.indirect_throughput;
+      if (vstate.use_mis) {
+        vstate.distance_pdf = reservoir.candidate.distance_pdf;
+      }
+    }
+    return;
+  }
+
+  const uint lcg_state = reservoir.candidate.lcg_state;
+
+  if (sample_distance) {
+    /* Always sample direct scatter, regardless of indirect scatter guiding decision. The direct
+     * throughput is the candidate throughput weighted by the unguided reservoir weight. */
+    result.direct_throughput = reservoir.candidate.throughput * reservoir.total_weight;
+    vstate.distance_pdf = reservoir.candidate.distance_pdf;
+  }
+
+  /* We only guide scatter decisions, no need to apply on emission and albedo. Both are mixed
+   * with the candidate values using the unguided reservoir weight as the factor. */
+  vstate.emission = mix(vstate.emission, reservoir.candidate.emission, reservoir.total_weight);
+#  ifdef __DENOISING_FEATURES__
+  vstate.albedo = mix(vstate.albedo, reservoir.candidate.albedo, reservoir.total_weight);
+#  endif
+
+  const float unguided_scatter_prob = reservoir.total_weight;
+  float guided_scatter_prob;
+  if (is_zero(result.indirect_throughput)) {
+    /* Always sample scatter event if the contribution of transmitted event is zero. No
+     * transmitted candidate is added in this case. */
+    guided_scatter_prob = 1.0f;
+  }
+  else {
+    /* Defensive resampling: the reservoir weight is mixed with the guided scatter probability,
+     * with `alpha` as the mix factor, and the result replaces the reservoir weight. */
+    const float alpha = 0.75f;
+    reservoir.total_weight = mix(reservoir.total_weight, vstate.scatter_prob, alpha);
+    guided_scatter_prob = reservoir.total_weight;
+
+    /* Add transmitted candidate. It reuses the `t` of the scatter candidate and carries the
+     * current indirect throughput. Its fourth field is 0, presumably the distance pdf that is
+     * tested below to tell a scatter candidate from the transmitted one. */
+    reservoir.add_sample(
+        1.0f - guided_scatter_prob,
+#  ifdef __DENOISING_FEATURES__
+        {vstate.emission, reservoir.candidate.t, result.indirect_throughput, 0.0f, vstate.albedo}
+#  else
+        {vstate.emission, reservoir.candidate.t, result.indirect_throughput, 0.0f}
+#  endif
+    );
+  }
+
+  const bool scatter = (reservoir.candidate.distance_pdf > 0.0f);
+  const float scale = scatter ? unguided_scatter_prob / guided_scatter_prob :
+                                (1.0f - unguided_scatter_prob) / (1.0f - guided_scatter_prob);
+  result.indirect_throughput = reservoir.candidate.throughput * scale;
+
+  if (!scatter && !sample_distance) {
+    /* No scatter event sampled. */
+    return;
+  }
+
+  /* Recover the volume coefficients at the scatter position. The LCG state was read from the
+   * scatter candidate before the transmitted candidate was added, and is restored on `sd`
+   * before the shader is evaluated. */
+  sd->P = ray->P + ray->D * reservoir.candidate.t;
+  sd->lcg_state = lcg_state;
+  VolumeShaderCoefficients coeff ccl_optional_struct_init;
+  if (!volume_shader_sample(kg, state, sd, &coeff)) {
+    kernel_assert(false);
+    return;
+  }
+
+  kernel_assert(sd->flag & SD_SCATTER);
+  if (sample_distance) {
+    /* Direct scatter. */
+    result.direct_scatter = true;
+    result.direct_t = reservoir.candidate.t;
+    volume_shader_copy_phases(&result.direct_phases, sd);
+  }
+
+  if (scatter) {
+    /* Indirect scatter. */
+    result.indirect_scatter = true;
+    result.indirect_t = reservoir.candidate.t;
+    volume_shader_copy_phases(&result.indirect_phases, sd);
+  }
+}
+
+/** \} */
+
+ccl_device bool volume_integrate_should_stop(const ccl_private VolumeIntegrateResult &result)
+{
+  if (is_zero(result.indirect_throughput) && is_zero(result.direct_throughput)) {
+    /* Stopped during Russian Roulette: neither the direct nor the indirect throughput is left,
+     * so there is no reason to keep stepping. */
+    return true;
+  }
+
+  /* If we have scattering data for both direct and indirect, we're done. Otherwise the caller
+   * keeps stepping through the volume. */
+  return (result.direct_scatter && result.indirect_scatter);
+}
+
+/* Perform Russian Roulette termination to avoid drawing too many samples for indirect scatter, but
+ * only if both direct and indirect scatter positions are available, or if no scattering is needed.
+ *
+ * Returns true if the current step should be skipped: either both scatter positions were already
+ * found, or the indirect throughput has just been terminated. Returns false otherwise. In that
+ * case surviving throughputs have been divided by the survival probability, and
+ * `reservoir.rand` has been rescaled so that it can be reused.
+ */
+ccl_device_inline bool volume_russian_roulette_termination(
+    const IntegratorState state,
+    ccl_private VolumeSampleReservoir &reservoir,
+    ccl_private VolumeIntegrateResult &ccl_restrict result,
+    ccl_private VolumeIntegrateState &ccl_restrict vstate)
+{
+  if (result.direct_scatter && result.indirect_scatter) {
+    return true;
+  }
+
+  const float thresh = reduce_max(fabs(result.indirect_throughput));
+  if (thresh > 0.05f) {
+    /* Only stop if contribution is low enough, i.e. the largest absolute component of the
+     * indirect throughput is at most 0.05. */
+    return false;
+  }
+
+  /* Whether equiangular estimator of the direct throughput depends on the indirect throughput. */
+  const bool equiangular = (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) &&
+                           vstate.use_mis && !result.direct_scatter;
+  /* Whether both indirect and direct scatter are possible. */
+  const bool has_scatter_samples = !reservoir.is_empty() && !equiangular;
+  /* The path is to be terminated, no scatter position is needed along the ray. */
+  const bool absorption_only = INTEGRATOR_STATE(state, path, flag) & PATH_RAY_TERMINATE;
+
+  /* Randomly stop indirect scatter, using `thresh` as the survival probability. */
+  if (absorption_only || has_scatter_samples) {
+    if (reservoir.rand > thresh) {
+      result.indirect_throughput = zero_spectrum();
+      if (equiangular || (vstate.direct_sample_method == VOLUME_SAMPLE_DISTANCE)) {
+        /* Direct throughput depends on the indirect throughput, set to 0 for early termination. */
+        result.direct_throughput = zero_spectrum();
+      }
+      return true;
+    }
+
+    reservoir.rand = saturatef(reservoir.rand / thresh);
+    result.indirect_throughput /= thresh;
+  }
+
+  /* Randomly stop direct scatter, with the same survival probability. When terminated, the
+   * direct scatter is flagged as found with zero throughput. When it survives, the
+   * compensation is accumulated in `direct_rr_scale`. The random number is rescaled in both
+   * cases. */
+  if (equiangular) {
+    if (reservoir.rand > thresh) {
+      result.direct_scatter = true;
+      result.direct_throughput = zero_spectrum();
+      reservoir.rand = (reservoir.rand - thresh) / (1.0f - thresh);
+    }
+    else {
+      reservoir.rand /= thresh;
+      vstate.direct_rr_scale /= thresh;
+    }
+    reservoir.rand = saturatef(reservoir.rand);
+  }
+
+  return false;
 }

 /* -------------------------------------------------------------------- */
@@ -854,8 +1279,65 @@ ccl_device_inline float volume_scatter_probability(
  return dot(coeff.sigma_s / sigma_c, channel_pdf);
 }

+/* Decide between real and null scatter events at the current position.
+ *
+ * With guiding, every position is added to the reservoir as a scatter candidate and the actual
+ * decision is left to the reservoir. Without guiding, the decision is taken right away by
+ * comparing `reservoir.rand` against `prob_s`, but only while no indirect scatter position has
+ * been found yet.
+ *
+ * The candidate weight is `vstate.transmittance * prob_s`. The value `weight * sigma_max` is
+ * stored with the candidate, presumably as its distance pdf.
+ *
+ * \param sigma_max: majorant used for the current step.
+ * \param prob_s: probability of a real scatter event at this position.
+ * \param sigma_s: scattering coefficient at this position.
+ * \param lcg_state: LCG state to store with the candidate. */
+ccl_device_inline void volume_sample_indirect_scatter(
+    const float sigma_max,
+    const float prob_s,
+    const Spectrum sigma_s,
+    ccl_private ShaderData *ccl_restrict sd,
+    ccl_private VolumeIntegrateState &ccl_restrict vstate,
+    ccl_private VolumeIntegrateResult &ccl_restrict result,
+    const uint lcg_state,
+    ccl_private VolumeSampleReservoir &reservoir)
+{
+  const float weight = vstate.transmittance * prob_s;
+  const Spectrum throughput = result.indirect_throughput * sigma_s / (prob_s * sigma_max);
+
+  if (vstate.vspg) {
+    /* If we guide the scatter probability, simply put the candidate in the reservoir. */
+    reservoir.add_sample(
+#  ifdef __DENOISING_FEATURES__
+        weight,
+        {vstate.emission, vstate.t, throughput, weight * sigma_max, vstate.albedo, lcg_state}
+#  else
+        weight, {vstate.emission, vstate.t, throughput, weight * sigma_max, lcg_state}
+#  endif
+    );
+  }
+  else if (!result.indirect_scatter) {
+    /* If no guiding and indirect scatter position has not been found, decide between real and null
+     * scatter events. */
+    if (reservoir.rand <= prob_s) {
+      /* Rescale random number for reusing. */
+      reservoir.rand /= prob_s;
+
+      /* Sampled scatter event. The phase functions are copied from `sd` right away. */
+      result.indirect_scatter = true;
+      volume_shader_copy_phases(&result.indirect_phases, sd);
+      reservoir.add_sample(
+#  ifdef __DENOISING_FEATURES__
+          weight,
+          {vstate.emission, vstate.t, throughput, weight * sigma_max, vstate.albedo, lcg_state}
+#  else
+          weight, {vstate.emission, vstate.t, throughput, weight * sigma_max, lcg_state}
+#  endif
+      );
+
+      if (vstate.direct_sample_method == VOLUME_SAMPLE_DISTANCE) {
+        result.direct_scatter = true;
+        volume_shader_copy_phases(&result.direct_phases, sd);
+      }
+    }
+    else {
+      /* Rescale random number for reusing. */
+      reservoir.rand = (reservoir.rand - prob_s) / (1.0f - prob_s);
+    }
+    reservoir.rand = saturatef(reservoir.rand);
+  }
+}
+
 /**
- * Sample indirect scatter position along the ray based on weighted delta tracking, from
+ * Integrate volume based on weighted delta tracking, from
 * [Spectral and Decomposition Tracking for Rendering Heterogeneous Volumes]
 * (https://disneyanimation.com/publications/spectral-and-decomposition-tracking-for-rendering-heterogeneous-volumes)
 * by Peter Kutz et. al.
@@ -869,200 +1351,100 @@ ccl_device_inline float volume_scatter_probability(
 * - If ξ < sigma_s / (sigma_s + |sigma_n|), we sample scatter event and evaluate L_s.
 * - Otherwise, no real collision happens and we continue the recursive process.
 * The emission L_e is evaluated at each step.
- *
- * \param sigma_max: majorant volume density inside the current octree node
- * \param interval: interval of t along the ray.
 */
-ccl_device void volume_sample_indirect_scatter(
-    KernelGlobals kg,
-    const IntegratorState state,
-    const ccl_private Ray *ccl_restrict ray,
-    ccl_private ShaderData *ccl_restrict sd,
-    const float sigma_max,
-    const Interval<float> interval,
-    ccl_private RNGState *rng_state,
-    ccl_private VolumeIntegrateState &ccl_restrict vstate,
-    ccl_private VolumeIntegrateResult &ccl_restrict result)
-{
-  if (result.indirect_scatter) {
-    /* Already sampled indirect scatter position. */
-    return;
-  }
-
-  /* Initialization. */
-  float t = interval.min;
-  const float inv_maj = (sigma_max == 0.0f) ? FLT_MAX : 1.0f / sigma_max;
-  const bool segment_has_equiangular = vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR &&
-                                       interval.contains(result.direct_t) && vstate.use_mis;
-  bool direct_scatter = false;
-  while (vstate.step++ < VOLUME_MAX_STEPS) {
-    if (reduce_max(fabs(result.indirect_throughput)) < VOLUME_THROUGHPUT_EPSILON) {
-      /* TODO(weizhen): terminate using Russian Roulette. */
-      /* TODO(weizhen): deal with negative transmittance. */
-      /* TODO(weizhen): should we stop if direct_scatter not yet found? */
-      vstate.stop = true;
-      result.indirect_throughput = zero_spectrum();
-      return;
-    }
-
-    /* Generate the next distance using random walk. */
-    const float rand = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SCATTER_DISTANCE);
-    t += sample_exponential_distribution(rand, inv_maj);
-
-    /* Advance random number offset. */
-    rng_state->rng_offset += PRNG_BOUNCE_NUM;
-
-    if (segment_has_equiangular && t > result.direct_t && !direct_scatter) {
-      /* Stepped beyond the equiangular scatter position, compute direct throughput. */
-      direct_scatter = true;
-      result.direct_throughput = result.indirect_throughput * vstate.transmittance;
-      vstate.distance_pdf = vstate.transmittance * sigma_max;
-    }
-
-    if (t > interval.max) {
-      break;
-    }
-
-    sd->P = ray->P + ray->D * t;
-    VolumeShaderCoefficients coeff ccl_optional_struct_init;
-    if (!volume_shader_sample(kg, state, sd, &coeff)) {
-      continue;
-    }
-
-    /* Emission. */
-    if (sd->flag & SD_EMISSION) {
-      /* Emission = inv_sigma * (L_e + sigma_n * (inv_sigma * (L_e + sigma_n * ···))). */
-      const Spectrum emission = inv_maj * coeff.emission;
-      vstate.emission += result.indirect_throughput * emission;
-      guiding_record_volume_emission(kg, state, emission);
-    }
-
-    /* Null scattering coefficients. */
-    const Spectrum sigma_n = volume_null_event_coefficients(kg, coeff, sigma_max);
-
-    if (reduce_add(coeff.sigma_s) == 0.0f) {
-      /* Absorption only. Deterministically choose null scattering and estimate the transmittance
-       * of the current ray segment. */
-      result.indirect_throughput *= sigma_n * inv_maj;
-      continue;
-    }
-
-#  ifdef __DENOISING_FEATURES__
-    if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_DENOISING_FEATURES) {
-      /* Albedo = inv_sigma * (sigma_s + sigma_n * (inv_sigma * (sigma_s + sigma_n * ···))). */
-      vstate.albedo += result.indirect_throughput * coeff.sigma_s * inv_maj;
-    }
-#  endif
-
-    const float prob_s = volume_scatter_probability(coeff, sigma_n, result.indirect_throughput);
-    if (vstate.rchannel < prob_s) {
-      /* Sampled scatter event. */
-      result.indirect_throughput *= coeff.sigma_s * inv_maj / prob_s;
-      result.indirect_t = t;
-      result.indirect_scatter = true;
-      volume_shader_copy_phases(&result.indirect_phases, sd);
-
-      if (vstate.direct_sample_method == VOLUME_SAMPLE_DISTANCE) {
-        /* If using distance sampling for direct light, just copy parameters of indirect light
-         * since we scatter at the same point. */
-        result.direct_scatter = true;
-        result.direct_t = result.indirect_t;
-        result.direct_throughput = result.indirect_throughput;
-        volume_shader_copy_phases(&result.direct_phases, sd);
-        if (vstate.use_mis) {
-          vstate.distance_pdf = vstate.transmittance * prob_s * sigma_max;
-        }
-      }
-      return;
-    }
-
-    /* Null scattering. Accumulate weight and continue. */
-    const float prob_n = 1.0f - prob_s;
-    result.indirect_throughput *= sigma_n * inv_maj / prob_n;
-
-    if (vstate.use_mis) {
-      vstate.transmittance *= prob_n;
-    }
-
-    /* Rescale random number for reusing. */
-    vstate.rchannel = (vstate.rchannel - prob_s) / prob_n;
-  }
-
-  /* No scatter event sampled in the interval. */
-}
-
-/* Throughput and pdf for equiangular sampling.
- * If MIS is used with transmittance-based distance sampling, we compute the direct throughput from
- * the indirect throughput in the function above. Otherwise, we use telescoping for higher quality.
- */
-ccl_device_inline void volume_equiangular_direct_scatter(
-    KernelGlobals kg,
-    const IntegratorState state,
-    const ccl_private Ray *ccl_restrict ray,
-    const ccl_private Extrema<float> &sigma,
-    const ccl_private Interval<float> &t,
-    ccl_private ShaderData *ccl_restrict sd,
-    ccl_private RNGState *rng_state,
-    ccl_private VolumeIntegrateState &vstate,
-    ccl_private VolumeIntegrateResult &ccl_restrict result)
-{
-  if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
-    return;
-  }
-
-  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-
-  if (t.contains(result.direct_t)) {
-    /* Equiangular scatter position is inside the current segment. */
-    sd->P = ray->P + ray->D * result.direct_t;
-    VolumeShaderCoefficients coeff ccl_optional_struct_init;
-    if (volume_shader_sample(kg, state, sd, &coeff) && (sd->flag & SD_SCATTER)) {
-      volume_shader_copy_phases(&result.direct_phases, sd);
-      result.direct_scatter = true;
-
-      if (vstate.use_mis) {
-        /* Compute distance pdf for multiple importance sampling. */
-        const Spectrum sigma_n = volume_null_event_coefficients(kg, coeff, sigma.max);
-
-        vstate.distance_pdf *= volume_scatter_probability(
-            coeff, sigma_n, result.direct_throughput);
-      }
-      else {
-        /* Compute transmittance until the direct scatter position. */
-        const Interval<float> t_ = {t.min, result.direct_t};
-        result.direct_throughput *= volume_transmittance<false>(
-            kg, state, ray, sd, sigma.range(), t_, rng_state, path_flag);
-      }
-
-      result.direct_throughput *= coeff.sigma_s / vstate.equiangular_pdf;
-    }
-  }
-  else if (result.direct_t > t.max && !vstate.use_mis) {
-    /* Accumulate transmittance. */
-    result.direct_throughput *= volume_transmittance<false>(
-        kg, state, ray, sd, sigma.range(), t, rng_state, path_flag);
-  }
-}
-
-/* Find direct and indirect scatter positions inside the current active octree leaf node. */
 ccl_device void volume_integrate_step_scattering(
    KernelGlobals kg,
    const IntegratorState state,
    const ccl_private Ray *ccl_restrict ray,
-    const ccl_private Extrema<float> &sigma,
-    const ccl_private Interval<float> &interval,
+    const float sigma_max,
    ccl_private ShaderData *ccl_restrict sd,
-    ccl_private RNGState *rng_state,
    ccl_private VolumeIntegrateState &ccl_restrict vstate,
+    ccl_private VolumeIntegrateResult &ccl_restrict result,
+    ccl_private VolumeSampleReservoir &reservoir)
+{
+  if (volume_russian_roulette_termination(state, reservoir, result, vstate)) {
+    return;
+  }
+
+  sd->P = ray->P + ray->D * vstate.t;
+  VolumeShaderCoefficients coeff ccl_optional_struct_init;
+  const uint lcg_state = sd->lcg_state;
+  if (!volume_shader_sample(kg, state, sd, &coeff)) {
+    return;
+  }
+
+  kernel_assert(sigma_max != 0.0f);
+  const float inv_maj = 1.0f / sigma_max;
+
+  /* Emission, accumulated at every step and weighted by the current indirect throughput. */
+  if (sd->flag & SD_EMISSION) {
+    /* Emission = inv_sigma * (L_e + sigma_n * (inv_sigma * (L_e + sigma_n * ···))). */
+    const Spectrum emission = inv_maj * coeff.emission;
+    vstate.emission += result.indirect_throughput * emission;
+    if (!result.indirect_scatter) {
+      /* Record emission until scatter position. */
+      guiding_record_volume_emission(kg, state, emission);
+    }
+  }
+
+  /* Null scattering coefficients. */
+  const Spectrum sigma_n = volume_null_event_coefficients(kg, coeff, sigma_max);
+
+  if (reduce_add(coeff.sigma_s) == 0.0f) {
+    /* Absorption only. Deterministically choose null scattering and estimate the transmittance
+     * of the current ray segment. */
+    result.indirect_throughput *= sigma_n * inv_maj;
+    return;
+  }
+
+#  ifdef __DENOISING_FEATURES__
+  if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_DENOISING_FEATURES) {
+    /* Albedo = inv_sigma * (sigma_s + sigma_n * (inv_sigma * (sigma_s + sigma_n * ···))). */
+    vstate.albedo += result.indirect_throughput * coeff.sigma_s * inv_maj;
+  }
+#  endif
+
+  /* Indirect scatter. The LCG state passed along is the one `sd` had before the shader was
+   * evaluated at this position. */
+  const float prob_s = volume_scatter_probability(coeff, sigma_n, result.indirect_throughput);
+  volume_sample_indirect_scatter(
+      sigma_max, prob_s, coeff.sigma_s, sd, vstate, result, lcg_state, reservoir);
+
+  /* Null scattering. Accumulate weight and continue. The throughput and the transmittance are
+   * always updated as if the null event had been chosen, whatever was decided above. */
+  const float prob_n = 1.0f - prob_s;
+  result.indirect_throughput *= safe_divide(sigma_n * inv_maj, prob_n);
+  vstate.transmittance *= prob_n;
+}
+
+/* Evaluate coefficients at the equiangular scatter position, and update the direct throughput.
+ *
+ * Only acts when equiangular sampling is the direct sample method and `result.direct_scatter`
+ * is set. If the shader evaluation fails or reports no scattering at `result.direct_t`,
+ * `result.direct_scatter` is cleared instead. */
+ccl_device_inline void volume_equiangular_direct_scatter(
    KernelGlobals kg,
    const IntegratorState state,
    const ccl_private Ray *ccl_restrict ray,
    ccl_private ShaderData *ccl_restrict sd,
+    ccl_private VolumeIntegrateState &vstate,
    ccl_private VolumeIntegrateResult &ccl_restrict result)
 {
-  /* Distance sampling for indirect and optional direct lighting. */
-  volume_sample_indirect_scatter(
-      kg, state, ray, sd, sigma.max, interval, rng_state, vstate, result);
+  if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR || !result.direct_scatter) {
+    return;
+  }

-  /* Equiangular sampling for direct lighting. */
-  volume_equiangular_direct_scatter(
-      kg, state, ray, sigma, interval, sd, rng_state, vstate, result);
+  sd->P = ray->P + ray->D * result.direct_t;
+  VolumeShaderCoefficients coeff ccl_optional_struct_init;
+  if (volume_shader_sample(kg, state, sd, &coeff) && (sd->flag & SD_SCATTER)) {
+    volume_shader_copy_phases(&result.direct_phases, sd);
+
+    if (vstate.use_mis) {
+      /* Compute distance pdf for multiple importance sampling. The pdf accumulated so far is
+       * multiplied by the scatter probability at this position. */
+      const Spectrum sigma_n = volume_null_event_coefficients(kg, coeff, vstate.sigma_max);
+      vstate.distance_pdf *= volume_scatter_probability(coeff, sigma_n, result.direct_throughput);
+    }
+
+    result.direct_throughput *= coeff.sigma_s / vstate.equiangular_pdf;
+  }
+  else {
+    /* Scattering coefficient is zero at the sampled position. */
+    result.direct_scatter = false;
+  }
 }

 /* Multiple Importance Sampling between equiangular sampling and distance sampling.
@@ -1103,7 +1485,7 @@ ccl_device_inline void volume_direct_scatter_mis(
    const ccl_private EquiangularCoefficients &equiangular_coeffs,
    ccl_private VolumeIntegrateResult &ccl_restrict result)
 {
-  if (!vstate.use_mis || vstate.direct_sample_method == VOLUME_SAMPLE_NONE) {
+  if (!vstate.use_mis || !result.direct_scatter) {
    return;
  }

@@ -1120,23 +1502,28 @@ ccl_device_inline void volume_direct_scatter_mis(
  result.direct_throughput *= 2.0f * mis_weight;
 }

+/** \} */
+
 ccl_device_inline void volume_integrate_state_init(KernelGlobals kg,
+                                                   const IntegratorState state,
                                                   const VolumeSampleMethod direct_sample_method,
+                                                   ccl_global float *ccl_restrict render_buffer,
+                                                   const ccl_private OctreeTracing &octree,
                                                   const ccl_private RNGState *rng_state,
                                                   ccl_private VolumeIntegrateState &vstate)
 {
  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SCATTER_DISTANCE);
-  vstate.rchannel = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_COLOR_CHANNEL);

  /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
  vstate.direct_sample_method = direct_sample_method;
  vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS);
  if (vstate.use_mis) {
    if (vstate.rscatter < 0.5f) {
-      vstate.rscatter *= 2.0f;
      vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE;
+      vstate.rscatter *= 2.0f;
    }
    else {
+      /* Rescale for equiangular distance sampling. */
      vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f;
      vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR;
    }
@@ -1146,8 +1533,10 @@ ccl_device_inline void volume_integrate_state_init(KernelGlobals kg,
  vstate.equiangular_pdf = 0.0f;
  vstate.transmittance = 1.0f;
  vstate.step = 0;
-  vstate.stop = false;
-
+  vstate.t = octree.t.min;
+  vstate.optical_depth = octree.sigma.max * octree.t.length();
+  volume_scatter_probability_get(kg, state, render_buffer, vstate);
+  vstate.direct_rr_scale = 1.0f;
  vstate.emission = zero_spectrum();
 #  ifdef __DENOISING_FEATURES__
  vstate.albedo = zero_spectrum();
@@ -1162,8 +1551,7 @@ ccl_device_inline void volume_integrate_result_init(
    ccl_private VolumeIntegrateResult &result)
 {
  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
-  result.direct_throughput = (vstate.use_mis ||
-                              (vstate.direct_sample_method == VOLUME_SAMPLE_NONE)) ?
+  result.direct_throughput = (vstate.direct_sample_method == VOLUME_SAMPLE_NONE) ?
                                 zero_spectrum() :
                                 throughput;
  result.indirect_throughput = throughput;
@@ -1263,36 +1651,46 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 {
  PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE);

-  EquiangularCoefficients equiangular_coeffs = {zero_float3(), {ray->tmin, ray->tmax}};
-  const VolumeSampleMethod direct_sample_method = volume_direct_sample_method(
-      kg, state, ray, sd, rng_state, &equiangular_coeffs, ls);
-
-  VolumeIntegrateState vstate ccl_optional_struct_init;
-  volume_integrate_state_init(kg, direct_sample_method, rng_state, vstate);
-
-  /* Initialize volume integration result. */
-  volume_integrate_result_init(state, ray, vstate, equiangular_coeffs, result);
-
  OctreeTracing octree(ray->tmin);
  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
  if (!volume_octree_setup<false>(kg, ray, sd, state, rng_state, path_flag, octree)) {
    return;
  }

+  EquiangularCoefficients equiangular_coeffs = {zero_float3(), {ray->tmin, ray->tmax}};
+  const VolumeSampleMethod direct_sample_method = volume_direct_sample_method(
+      kg, state, ray, sd, rng_state, &equiangular_coeffs, ls);
+
+  /* Initialize reservoir for sampling scatter position. */
+  VolumeSampleReservoir reservoir = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_RESERVOIR);
+
+  /* Initialize volume integration state. */
+  VolumeIntegrateState vstate ccl_optional_struct_init;
+  volume_integrate_state_init(
+      kg, state, direct_sample_method, render_buffer, octree, rng_state, vstate);
+
+  /* Initialize volume integration result. */
+  volume_integrate_result_init(state, ray, vstate, equiangular_coeffs, result);
+
  /* Scramble for stepping through volume. */
  path_state_rng_scramble(rng_state, 0xe35fad82);

-  do {
-    volume_integrate_step_scattering(
-        kg, state, ray, octree.sigma, octree.t, sd, rng_state, vstate, result);
+  volume_equiangular_transmittance(
+      kg, state, ray, octree.sigma, octree.t, sd, rng_state, vstate, result);

-    if (volume_integrate_should_stop(result, vstate)) {
+  while (
+      volume_integrate_advance(kg, ray, sd, state, rng_state, path_flag, octree, vstate, result))
+  {
+    const float sigma_max = octree.sigma.max * vstate.majorant_scale;
+    volume_integrate_step_scattering(kg, state, ray, sigma_max, sd, vstate, result, reservoir);
+
+    if (volume_integrate_should_stop(result)) {
      break;
    }
+  }

-  } while (
-      volume_octree_advance<false>(kg, ray, sd, state, rng_state, path_flag, octree, vstate.step));
-
+  volume_distance_sampling_finalize(kg, state, ray, sd, vstate, result, reservoir);
+  volume_equiangular_direct_scatter(kg, state, ray, sd, vstate, result);
  volume_direct_scatter_mis(ray, vstate, equiangular_coeffs, result);

  /* Write accumulated emission. */
@@ -1310,6 +1708,10 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
        kg, state, vstate.albedo, result.indirect_scatter, render_buffer);
  }
 #  endif /* __DENOISING_FEATURES__ */
+
+  if (INTEGRATOR_STATE(state, path, bounce) == 0) {
+    INTEGRATOR_STATE_WRITE(state, path, optical_depth) += vstate.optical_depth;
+  }
 }

 /* Path tracing: sample point on light and evaluate light shader, then
@@ -1417,6 +1819,11 @@ ccl_device_forceinline void integrate_volume_direct_light(
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
  }

+  if (bounce == 0) {
+    shadow_flag |= PATH_RAY_VOLUME_SCATTER;
+    shadow_flag &= ~PATH_RAY_VOLUME_PRIMARY_TRANSMIT;
+  }
+
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
      state, path, render_pixel_index);
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_offset) = INTEGRATOR_STATE(
@@ -1752,10 +2159,15 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
    volume_stack_clean(kg, state);
  }

+  /* Assign flag to transmitted volume rays for scattering probability guiding. */
+  if (INTEGRATOR_STATE(state, path, bounce) == 0) {
+    INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_VOLUME_PRIMARY_TRANSMIT;
+  }
+
  const VolumeIntegrateEvent event = volume_integrate(kg, state, &ray, render_buffer);
  if (event == VOLUME_PATH_MISSED) {
    /* End path. */
-    integrator_path_terminate(state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+    integrator_path_terminate(kg, state, render_buffer, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
    return;
  }

--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -7,6 +7,8 @@
 #include "kernel/globals.h"
 #include "kernel/types.h"

+#include "kernel/film/write.h"
+
 #include "kernel/integrator/state.h"

 #ifdef __KERNEL_GPU__
@@ -46,6 +48,24 @@ ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegrator
  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
 }

+ccl_device_inline void write_optical_depth(KernelGlobals kg,
+                                           IntegratorState state,
+                                           ccl_global float *ccl_restrict render_buffer)
+{
+  if (!render_buffer) {
+    return;
+  }
+  /* Write the optical depth stored in the path state to the volume majorant pass, together
+   * with a value of 1 in the matching sample count pass. This is only done for paths that
+   * carry PATH_RAY_VOLUME_PRIMARY_TRANSMIT, and never without a render buffer. */
+  if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_VOLUME_PRIMARY_TRANSMIT) {
+    kernel_assert(kernel_data.film.pass_volume_majorant != PASS_UNUSED);
+
+    const float optical_depth = INTEGRATOR_STATE(state, path, optical_depth);
+    ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+    film_write_pass_float(buffer + kernel_data.film.pass_volume_majorant, optical_depth);
+    film_write_pass_float(buffer + kernel_data.film.pass_volume_majorant_sample_count, 1.0f);
+  }
+}
+
 #ifdef __KERNEL_GPU__

 ccl_device_forceinline void integrator_path_init(IntegratorState state,
@@ -65,9 +85,13 @@ ccl_device_forceinline void integrator_path_next(IntegratorState state,
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
 }

-ccl_device_forceinline void integrator_path_terminate(IntegratorState state,
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      ccl_global float *ccl_restrict render_buffer,
                                                      const DeviceKernel current_kernel)
 {
+  write_optical_depth(kg, state, render_buffer);
+
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
@@ -176,9 +200,13 @@ ccl_device_forceinline void integrator_path_next(IntegratorState state,
  (void)current_kernel;
 }

-ccl_device_forceinline void integrator_path_terminate(IntegratorState state,
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      ccl_global float *ccl_restrict render_buffer,
                                                      const DeviceKernel current_kernel)
 {
+  write_optical_depth(kg, state, render_buffer);
+
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
  (void)current_kernel;
 }
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -39,6 +39,8 @@ KERNEL_STRUCT_MEMBER(path, uint16_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* enum PathRayMNEE */
 KERNEL_STRUCT_MEMBER(path, uint8_t, mnee, KERNEL_FEATURE_PATH_TRACING)
+/* Majorant volume optical depth. */
+KERNEL_STRUCT_MEMBER(path, float, optical_depth, KERNEL_FEATURE_PATH_TRACING)
 /* Multiple importance sampling
 * The PDF of BSDF sampling at the last scatter point, which is at ray distance
 * zero and distance. Note that transparency and volume attenuation increase
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -291,7 +291,7 @@ enum PathTraceDimension {

  /* Volume */
  PRNG_VOLUME_PHASE = 3,
-  PRNG_VOLUME_COLOR_CHANNEL = 4,
+  PRNG_VOLUME_RESERVOIR = 4,
  PRNG_VOLUME_SCATTER_DISTANCE = 5,
  PRNG_VOLUME_EXPANSION_ORDER = 6,
  PRNG_VOLUME_SHADE_OFFSET = 7,
@@ -437,6 +437,13 @@ enum PathRayFlag : uint32_t {

  /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
+
+  /* TODO(weizhen): should add another flag to record only the primary scatter, but then we need to
+     change the flag to 64 bits or split path_flags in two. Right now we also write volume scatter
+     if the primary hit is a surface, but that seems fine. */
+  /* Volume scattering probability guiding. This flag is added to paths where the primary ray
+     passed through the volume without scattering. */
+  PATH_RAY_VOLUME_PRIMARY_TRANSMIT = (1U << 23U),
 };

 // 8bit enum, just in case we need to move more variables in it
@@ -505,6 +512,8 @@ enum PassType {
  PASS_VOLUME,
  PASS_VOLUME_DIRECT,
  PASS_VOLUME_INDIRECT,
+  PASS_VOLUME_SCATTER,
+  PASS_VOLUME_TRANSMIT,
  PASS_CATEGORY_LIGHT_END = 31,

  /* Data passes */
@@ -554,6 +563,10 @@ enum PassType {
  PASS_GUIDING_PROBABILITY,
  /* The avg. roughness at the first bounce. */
  PASS_GUIDING_AVG_ROUGHNESS,
+  /* The majorant optical depth along the ray, for volume scattering probability guiding.
+   * When reading this pass, it is converted to majorant transmittance. */
+  PASS_VOLUME_MAJORANT,
+  PASS_VOLUME_MAJORANT_SAMPLE_COUNT,
  PASS_CATEGORY_DATA_END = 63,

  PASS_BAKE_PRIMITIVE,
@@ -1868,6 +1881,7 @@ enum DeviceKernel : int {

  DECLARE_FILM_CONVERT_KERNEL(DEPTH),
  DECLARE_FILM_CONVERT_KERNEL(MIST),
+  DECLARE_FILM_CONVERT_KERNEL(VOLUME_MAJORANT),
  DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
  DECLARE_FILM_CONVERT_KERNEL(FLOAT),
  DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
@@ -1890,6 +1904,9 @@ enum DeviceKernel : int {
  DEVICE_KERNEL_FILTER_COLOR_PREPROCESS,
  DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS,

+  DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X,
+  DEVICE_KERNEL_VOLUME_GUIDING_FILTER_Y,
+
  DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS,

  DEVICE_KERNEL_PREFIX_SUM,
--- a/intern/cycles/scene/film.cpp
+++ b/intern/cycles/scene/film.cpp
@@ -187,6 +187,11 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
  kfilm->pass_transmission_indirect = PASS_UNUSED;
  kfilm->pass_volume_direct = PASS_UNUSED;
  kfilm->pass_volume_indirect = PASS_UNUSED;
+  kfilm->pass_volume_scatter = PASS_UNUSED;
+  kfilm->pass_volume_transmit = PASS_UNUSED;
+  kfilm->pass_volume_scatter_denoised = PASS_UNUSED;
+  kfilm->pass_volume_transmit_denoised = PASS_UNUSED;
+  kfilm->pass_volume_majorant = PASS_UNUSED;
  kfilm->pass_lightgroup = PASS_UNUSED;

  /* Mark passes as unused so that the kernel knows the pass is inaccessible. */
@@ -218,6 +223,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
    if (pass->get_mode() == PassMode::DENOISED) {
      /* Generally we only storing offsets of the noisy passes. The display pass is an exception
       * since it is a read operation and not a write. */
+      if (pass->get_type() == PASS_VOLUME_TRANSMIT) {
+        kfilm->pass_volume_transmit_denoised = kfilm->pass_stride;
+      }
+      else if (pass->get_type() == PASS_VOLUME_SCATTER) {
+        kfilm->pass_volume_scatter_denoised = kfilm->pass_stride;
+      }
      kfilm->pass_stride += pass->get_info().num_components;
      continue;
    }
@@ -328,6 +339,18 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
      case PASS_VOLUME_DIRECT:
        kfilm->pass_volume_direct = kfilm->pass_stride;
        break;
+      case PASS_VOLUME_SCATTER:
+        kfilm->pass_volume_scatter = kfilm->pass_stride;
+        break;
+      case PASS_VOLUME_TRANSMIT:
+        kfilm->pass_volume_transmit = kfilm->pass_stride;
+        break;
+      case PASS_VOLUME_MAJORANT:
+        kfilm->pass_volume_majorant = kfilm->pass_stride;
+        break;
+      case PASS_VOLUME_MAJORANT_SAMPLE_COUNT:
+        kfilm->pass_volume_majorant_sample_count = kfilm->pass_stride;
+        break;

      case PASS_BAKE_PRIMITIVE:
        kfilm->pass_bake_primitive = kfilm->pass_stride;
@@ -478,8 +501,8 @@ void Film::update_passes(Scene *scene)
  const ObjectManager *object_manager = scene->object_manager.get();
  Integrator *integrator = scene->integrator;

-  if (!is_modified() && !object_manager->need_update() && !integrator->is_modified() &&
-      !background->is_modified())
+  if (!object_manager->need_update() && !integrator->is_modified() && !background->is_modified() &&
+      !scene->has_volume_modified())
  {
    return;
  }
@@ -571,6 +594,20 @@ void Film::update_passes(Scene *scene)
    }
  }

+  if (scene->has_volume()) {
+    add_auto_pass(scene, PASS_VOLUME_SCATTER);
+    add_auto_pass(scene, PASS_VOLUME_SCATTER, PassMode::DENOISED, "Volume Scatter");
+    add_auto_pass(scene, PASS_VOLUME_TRANSMIT);
+    add_auto_pass(scene, PASS_VOLUME_TRANSMIT, PassMode::DENOISED, "Volume Transmit");
+    if (!Pass::contains(scene->passes, PASS_SAMPLE_COUNT)) {
+      add_auto_pass(scene, PASS_SAMPLE_COUNT);
+    }
+    if (!Pass::contains(scene->passes, PASS_VOLUME_MAJORANT)) {
+      add_auto_pass(scene, PASS_VOLUME_MAJORANT, "Volume Majorant");
+    }
+    add_auto_pass(scene, PASS_VOLUME_MAJORANT_SAMPLE_COUNT);
+  }
+
  /* Remove duplicates and initialize internal pass info. */
  finalize_passes(scene, use_denoise);

@@ -669,8 +706,9 @@ void Film::finalize_passes(Scene *scene, const bool use_denoise)

    /* Disable denoising on passes if denoising is disabled, or if the
     * pass does not support it. */
-    pass->set_mode((use_denoise && pass->get_info().support_denoise) ? pass->get_mode() :
-                                                                       PassMode::NOISY);
+    const bool need_denoise = pass->get_info().support_denoise &&
+                              (use_denoise || is_volume_guiding_pass(pass->get_type()));
+    pass->set_mode(need_denoise ? pass->get_mode() : PassMode::NOISY);

    /* Merge duplicate passes. */
    bool duplicate_found = false;
@@ -722,13 +760,16 @@ uint Film::get_kernel_features(const Scene *scene) const
    const PassType pass_type = pass->get_type();
    const PassMode pass_mode = pass->get_mode();

-    if (pass_mode == PassMode::DENOISED || pass_type == PASS_DENOISING_NORMAL ||
+    const bool has_denoise_pass = (pass_mode == PassMode::DENOISED) &&
+                                  !is_volume_guiding_pass(pass_type);
+
+    if (has_denoise_pass || pass_type == PASS_DENOISING_NORMAL ||
        pass_type == PASS_DENOISING_ALBEDO || pass_type == PASS_DENOISING_DEPTH)
    {
      kernel_features |= KERNEL_FEATURE_DENOISING;
    }

-    if (pass_type >= PASS_DIFFUSE && pass_type <= PASS_VOLUME_INDIRECT) {
+    if (pass_type >= PASS_DIFFUSE && pass_type <= PASS_VOLUME_TRANSMIT) {
      kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
    }

--- a/intern/cycles/scene/pass.cpp
+++ b/intern/cycles/scene/pass.cpp
@@ -64,6 +64,8 @@ const NodeEnum *Pass::get_type_enum()
    pass_type_enum.insert("volume", PASS_VOLUME);
    pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
    pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
+    pass_type_enum.insert("volume_scatter", PASS_VOLUME_SCATTER);
+    pass_type_enum.insert("volume_transmit", PASS_VOLUME_TRANSMIT);

    /* Data passes. */
    pass_type_enum.insert("depth", PASS_DEPTH);
@@ -88,6 +90,8 @@ const NodeEnum *Pass::get_type_enum()
    pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO);
    pass_type_enum.insert("denoising_depth", PASS_DENOISING_DEPTH);
    pass_type_enum.insert("denoising_previous", PASS_DENOISING_PREVIOUS);
+    pass_type_enum.insert("volume_majorant", PASS_VOLUME_MAJORANT);
+    pass_type_enum.insert("volume_majorant_sample_count", PASS_VOLUME_MAJORANT_SAMPLE_COUNT);

    pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
    pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
@@ -274,6 +278,25 @@ PassInfo Pass::get_info(const PassType type, const bool include_albedo, const bo
      pass_info.num_components = 3;
      pass_info.use_exposure = true;
      break;
+    case PASS_VOLUME_SCATTER:
+    case PASS_VOLUME_TRANSMIT:
+      /* TODO(weizhen): Gaussian filter only needs 1 component, but we can have negative pixel
+       * values in some channels, preventing us from simply adding them up; besides, using RGB
+       * channels is better for visualization. We can optimize the memory by using RGBE format. */
+      pass_info.num_components = 3;
+      pass_info.use_exposure = true;
+      pass_info.use_filter = false;
+      pass_info.support_denoise = true;
+      break;
+    case PASS_VOLUME_MAJORANT:
+      pass_info.num_components = 1;
+      pass_info.use_filter = false;
+      pass_info.divide_type = PASS_VOLUME_MAJORANT_SAMPLE_COUNT;
+      break;
+    case PASS_VOLUME_MAJORANT_SAMPLE_COUNT:
+      pass_info.num_components = 1;
+      pass_info.use_filter = false;
+      break;

    case PASS_CRYPTOMATTE:
      pass_info.num_components = 4;
@@ -438,4 +461,9 @@ std::ostream &operator<<(std::ostream &os, const Pass &pass)
  return os;
 }

+/* Whether `pass_type` is one of the volume guiding passes, i.e. PASS_VOLUME_SCATTER or
+ * PASS_VOLUME_TRANSMIT. */
+bool is_volume_guiding_pass(const PassType pass_type)
+{
+  return (pass_type == PASS_VOLUME_SCATTER) || (pass_type == PASS_VOLUME_TRANSMIT);
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/scene/pass.h
+++ b/intern/cycles/scene/pass.h
@@ -95,4 +95,6 @@ class Pass : public Node {

 std::ostream &operator<<(std::ostream &os, const Pass &pass);

+bool is_volume_guiding_pass(const PassType pass_type);
+
 CCL_NAMESPACE_END
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -789,6 +789,22 @@ void Scene::tag_shadow_catcher_modified()
  shadow_catcher_modified_ = true;
 }

+/* Whether the scene uses volumes, as stored in the kernel integrator data
+ * (`dscene.data.integrator.use_volumes`).
+ *
+ * NOTE: this is not a pure query. Calling it also clears the flag raised by
+ * tag_has_volume_modified(), so has_volume_modified() returns false afterwards. */
+bool Scene::has_volume()
+{
+  has_volume_modified_ = false;
+  return dscene.data.integrator.use_volumes;
+}
+
+/* Whether the volume-modified flag is currently set. The flag is raised by
+ * tag_has_volume_modified() and cleared by has_volume(). */
+bool Scene::has_volume_modified() const
+{
+  return has_volume_modified_;
+}
+
+/* Raise the volume-modified flag. It stays set until the next call to has_volume(). */
+void Scene::tag_has_volume_modified()
+{
+  has_volume_modified_ = true;
+}
+
 template<> Light *Scene::create_node<Light>()
 {
  unique_ptr<Light> node = make_unique<Light>();
--- a/intern/cycles/scene/scene.h
+++ b/intern/cycles/scene/scene.h
@@ -201,6 +201,9 @@ class Scene : public NodeOwner {

  bool has_shadow_catcher();
  void tag_shadow_catcher_modified();
+  bool has_volume();
+  bool has_volume_modified() const;
+  void tag_has_volume_modified();

  /* This function is used to create a node of a specified type instead of
   * calling 'new', and sets the scene as the owner of the node.
@@ -245,6 +248,7 @@ class Scene : public NodeOwner {

  bool has_shadow_catcher_ = false;
  bool shadow_catcher_modified_ = true;
+  bool has_volume_modified_ = true;

  /* Maximum number of closure during session lifetime. */
  int max_closure_global;
--- a/intern/cycles/scene/shader.cpp
+++ b/intern/cycles/scene/shader.cpp
@@ -546,7 +546,10 @@ void ShaderManager::device_update_pre(Device * /*device*/,

  /* Set this early as it is needed by volume rendering passes. */
  KernelIntegrator *kintegrator = &dscene->data.integrator;
-  kintegrator->use_volumes = has_volumes;
+  if (kintegrator->use_volumes != has_volumes) {
+    scene->tag_has_volume_modified();
+    kintegrator->use_volumes = has_volumes;
+  }
 }

 void ShaderManager::device_update_post(Device *device,
--- a/intern/cycles/util/tbb.h
+++ b/intern/cycles/util/tbb.h
@@ -10,6 +10,7 @@
 #  include "util/windows.h"
 #endif

+#include <tbb/blocked_range2d.h>
 #include <tbb/blocked_range3d.h>
 #include <tbb/enumerable_thread_specific.h>
 #include <tbb/parallel_for.h>
@@ -27,6 +28,7 @@
 CCL_NAMESPACE_BEGIN

 using tbb::blocked_range;
+using tbb::blocked_range2d;
 using tbb::blocked_range3d;
 using tbb::enumerable_thread_specific;
 using tbb::parallel_for;