From df496eb894d700e7b30ad177db931168bd7df82e Mon Sep 17 00:00:00 2001
From: Weizhen Huang <weizhen@blender.org>
Date: Mon, 11 Aug 2025 16:31:34 +0200
Subject: [PATCH] Cycles: use one-tap stochastic interpolation for volume

This gives a ~1.2x speed-up on CPU and a ~1.5x speed-up on GPU (tested
with Metal on an M2 Ultra).

Individual samples are noisier, but equal-time renders are mostly
better.

Note that volume emission renders differently than before.

Pull Request: https://projects.blender.org/blender/blender/pulls/144451
---
 intern/cycles/kernel/geom/volume.h            |   3 +-
 intern/cycles/kernel/osl/services.cpp         |   2 +-
 intern/cycles/kernel/osl/services_gpu.h       |   3 +-
 intern/cycles/kernel/sample/lcg.h             |   9 +
 intern/cycles/kernel/util/texture_3d.h        | 196 +++++-------------
 intern/cycles/util/math_float3.h              |   9 +
 .../cycles_renders/openvdb_overlap.png        |   4 +-
 .../cycles_renders/principled_blackbody.png   |   4 +-
 .../openvdb/cycles_renders/smoke_color.png    |   4 +-
 .../openvdb/cycles_renders/smoke_fire.png     |   4 +-
 .../cycles_renders/overlapping_octrees.png    |   4 +-
 11 files changed, 83 insertions(+), 159 deletions(-)
diff --git a/intern/cycles/kernel/geom/volume.h b/intern/cycles/kernel/geom/volume.h
index 1b3a3f1f625..90c3a835ca8 100644
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -94,8 +94,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
     object_inverse_position_transform(kg, sd, &P);
     const InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
                                                                     INTERPOLATION_NONE;
-    return kernel_tex_image_interp_3d(
-        kg, desc.offset, P, interp, (stochastic) ? lcg_step_float(&sd->lcg_state) : -1.0f);
+    return kernel_tex_image_interp_3d(kg, sd, desc.offset, P, interp, stochastic);
   }
   return zero_float4();
 }
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index 4a4c4b51313..ad1cd5339b9 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -1286,7 +1286,7 @@ bool OSLRenderServices::texture3d(OSLUStringHash filename,
       const int slot = handle->svm_slots[0].y;
       const float3 P_float3 = make_float3(P.x, P.y, P.z);
       float4 rgba = kernel_tex_image_interp_3d(
-          kernel_globals, slot, P_float3, INTERPOLATION_NONE, -1.0f);
+          kernel_globals, globals->sd, slot, P_float3, INTERPOLATION_NONE, false);
 
       result[0] = rgba[0];
       if (nchannels > 1) {
diff --git a/intern/cycles/kernel/osl/services_gpu.h b/intern/cycles/kernel/osl/services_gpu.h
index 21ae8ddc82a..8ac3cc41c81 100644
--- a/intern/cycles/kernel/osl/services_gpu.h
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -1072,7 +1072,8 @@ ccl_device_extern bool rs_texture3d(ccl_private ShaderGlobals *sg,
 
   switch (type) {
     case OSL_TEXTURE_HANDLE_TYPE_SVM: {
-      const float4 rgba = kernel_tex_image_interp_3d(nullptr, slot, *P, INTERPOLATION_NONE, -1.0f);
+      const float4 rgba = kernel_tex_image_interp_3d(
+          nullptr, sg->sd, slot, *P, INTERPOLATION_NONE, false);
       if (nchannels > 0) {
         result[0] = rgba.x;
       }
diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h
index 39fde424835..298dfdfe609 100644
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -26,6 +26,15 @@ template<class T> ccl_device float lcg_step_float(T rng)
   return (float)*rng * (1.0f / (float)0xFFFFFFFF);
 }
 
+template<class T> ccl_device float3 lcg_step_float3(T rng)
+{
+  /* Three consecutive draws, one statement each so the x, y, z order is well-defined. */
+  const float rand_x = lcg_step_float(rng);
+  const float rand_y = lcg_step_float(rng);
+  const float rand_z = lcg_step_float(rng);
+  return make_float3(rand_x, rand_y, rand_z);
+}
+
 ccl_device uint lcg_init(const uint seed)
 {
   uint rng = seed;
diff --git a/intern/cycles/kernel/util/texture_3d.h b/intern/cycles/kernel/util/texture_3d.h
index 865db289024..5fa149b49bb 100644
--- a/intern/cycles/kernel/util/texture_3d.h
+++ b/intern/cycles/kernel/util/texture_3d.h
@@ -5,6 +5,8 @@
 #pragma once
 
 #include "kernel/globals.h"
+#include "kernel/sample/lcg.h"
+
 #include "util/texture.h"
 
 #if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
@@ -22,163 +24,61 @@ namespace {
 #endif
 
 #ifdef WITH_NANOVDB
-/* Stochastically turn a tricubic filter into a trilinear filter. */
-ccl_device_inline float3 interp_tricubic_to_trilinear_stochastic(const float3 P, float randu)
+/* -------------------------------------------------------------------- */
+/** Return the sample position for stochastic one-tap sampling.
+ * From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810
+ * \{ */
+ccl_device_inline float3 interp_tricubic_stochastic(const float3 P, ccl_private float3 &rand)
 {
-  /* Some optimizations possible:
-   * - Could use select() for SIMD if we split the random number into 10
-   *   bits each and use that for each dimensions.
-   * - For GPU would be better not to compute P0 and P1 for all dimensions
-   *   in advance?
-   * - 1/g0 and 1/(1 - g0) are computed twice.
-   */
-
   const float3 p = floor(P);
   const float3 t = P - p;
 
-  /* Cubic weights. */
-  const float3 w0 = (1.0f / 6.0f) * (t * (t * (-t + 3.0f) - 3.0f) + 1.0f);
-  const float3 w1 = (1.0f / 6.0f) * (t * t * (3.0f * t - 6.0f) + 4.0f);
-  //    float3 w2 = (1.0f / 6.0f) * (t * (t * (-3.0f * t + 3.0f) + 3.0f) + 1.0f);
-  const float3 w3 = (1.0f / 6.0f) * (t * t * t);
+  /* Cubic interpolation weights. */
+  const float3 w[4] = {(((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f),
+                       ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f),
+                       ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f),
+                       (1.0f / 6.0f) * t * t * t};
 
-  const float3 g0 = w0 + w1;
-  const float3 P0 = p + (w1 / g0) - 1.0f;
-  const float3 P1 = p + (w3 / (make_float3(1.0f) - g0)) + 1.0f;
+  /* For reservoir sampling, always accept the first in the stream. */
+  float3 total_weight = w[0];
+  float3 offset = make_float3(-1.0f);
 
-  float3 Pnew = P0;
-
-  if (randu < g0.x) {
-    randu /= g0.x;
-  }
-  else {
-    Pnew.x = P1.x;
-    randu = (randu - g0.x) / (1 - g0.x);
+  for (int j = 1; j < 4; j++) {
+    total_weight += w[j];
+    const float3 thresh = w[j] / total_weight;
+    const auto mask = rand < thresh;
+    offset = select(mask, make_float3(float(j) - 1.0f), offset);
+    rand = select(mask, safe_divide(rand, thresh), safe_divide(rand - thresh, 1.0f - thresh));
   }
 
-  if (randu < g0.y) {
-    randu /= g0.y;
-  }
-  else {
-    Pnew.y = P1.y;
-    randu = (randu - g0.y) / (1 - g0.y);
-  }
-
-  if (randu < g0.z) {
-  }
-  else {
-    Pnew.z = P1.z;
-  }
-
-  return Pnew;
+  return p + offset;
 }
 
-/* From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810
- *
- * Could be used in specific situations where we are certain a single
- * tap is enough. Maybe better to try optimizing bilinear lookups in
- * NanoVDB (detect when fully inside a single leaf) than deal with this. */
-
-#  if 0
-ccl_device int3 interp_tricubic_stochastic(const float3 P, float randu)
+ccl_device_inline float3 interp_trilinear_stochastic(const float3 P, const float3 rand)
 {
-  const float ix = floorf(P.x);
-  const float iy = floorf(P.y);
-  const float iz = floorf(P.z);
-  const float deltas[3] = {P.x - ix, P.y - iy, P.z - iz};
-  int idx[3] = {(int)ix - 1, (int)iy - 1, (int)iz - 1};
-
-  for (int i = 0; i < 3; i++) {
-    const float t = deltas[i];
-    const float t2 = t * t;
-
-    /* Weighted reservoir sampling, first tap always accepted */
-    const float w0 = (1.0f / 6.0f) * (-t * t2 + 3 * t2 - 3 * t + 1);
-    float sumWt = w0;
-    int index = 0;
-
-    /* TODO: reduce number of divisions? */
-
-    /* Sample the other 3 filter taps. */
-    {
-      const float w1 = (1.0f / 6.0f) * (3 * t * t2 - 6 * t2 + 4);
-      sumWt += w1;
-      const float p = w1 / sumWt;
-      if (randu < p) {
-        index = 1;
-        randu /= p;
-      }
-      else {
-        randu = (randu - p) / (1 - p);
-      }
-    }
-
-    {
-      const float w2 = (1.0f / 6.0f) * (-3 * t * t2 + 3 * t2 + 3 * t + 1);
-      sumWt += w2;
-      const float p = w2 / sumWt;
-      if (randu < p) {
-        index = 2;
-        randu /= p;
-      }
-      else {
-        randu = (randu - p) / (1 - p);
-      }
-    }
-
-    {
-      const float w3 = (1.0f / 6.0f) * t * t2;
-      sumWt += w3;
-      const float p = w3 / sumWt;
-      if (randu < p) {
-        index = 3;
-        randu /= p;
-      }
-      else {
-        randu = (randu - p) / (1 - p);
-      }
-    }
-
-    idx[i] += index;
-  }
-
-  return make_int3(idx[0], idx[1], idx[2]);
+  const float3 p = floor(P);
+  const float3 t = P - p;
+  return select(rand < t, p + 1.0f, p);
 }
 
-ccl_device int3 interp_trilinear_stochastic(const float3 P, float randu)
+ccl_device_inline float3 interp_stochastic(const float3 P,
+                                           ccl_private InterpolationType &interpolation,
+                                           ccl_private float3 &rand)
 {
-  const float ix = floorf(P.x);
-  const float iy = floorf(P.y);
-  const float iz = floorf(P.z);
-  int idx[3] = {(int)ix, (int)iy, (int)iz};
-
-  const float tx = P.x - ix;
-  const float ty = P.y - iy;
-  const float tz = P.z - iz;
-
-  if (randu < tx) {
-    idx[0]++;
-    randu /= tx;
+  float3 P_new = P;
+  if (interpolation == INTERPOLATION_CUBIC) {
+    P_new = interp_tricubic_stochastic(P, rand);
+  }
+  else if (interpolation == INTERPOLATION_LINEAR) {
+    P_new = interp_trilinear_stochastic(P, rand);
   }
   else {
-    randu = (randu - tx) / (1 - tx);
+    kernel_assert(interpolation == INTERPOLATION_CLOSEST);
   }
-
-  if (randu < ty) {
-    idx[1]++;
-    randu /= ty;
-  }
-  else {
-    randu = (randu - ty) / (1 - ty);
-  }
-
-  if (randu < tz) {
-    idx[2]++;
-  }
-
-  return make_int3(idx[0], idx[1], idx[2]);
+  interpolation = INTERPOLATION_CLOSEST;
+  return P_new;
 }
-#  endif
+/** \} */
 
 template<typename OutT, typename Acc>
 ccl_device OutT kernel_tex_image_interp_trilinear_nanovdb(ccl_private Acc &acc, const float3 P)
@@ -278,8 +178,12 @@ OutT kernel_tex_image_interp_nanovdb(const ccl_global TextureInfo &info,
 }
 #endif /* WITH_NANOVDB */
 
-ccl_device float4 kernel_tex_image_interp_3d(
-    KernelGlobals kg, const int id, float3 P, InterpolationType interp, const float randu)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             const int id,
+                                             float3 P,
+                                             InterpolationType interp,
+                                             const bool stochastic)
 {
 #ifdef WITH_NANOVDB
   const ccl_global TextureInfo &info = kernel_data_fetch(texture_info, id);
@@ -294,9 +198,10 @@ ccl_device float4 kernel_tex_image_interp_3d(
 
   /* A -0.5 offset is used to center the cubic samples around the sample point. */
   P = P - make_float3(0.5f);
-  if (interpolation == INTERPOLATION_CUBIC && randu >= 0.0f) {
-    P = interp_tricubic_to_trilinear_stochastic(P, randu);
-    interpolation = INTERPOLATION_LINEAR;
+
+  if (stochastic) {
+    float3 rand = lcg_step_float3(&sd->lcg_state);
+    P = interp_stochastic(P, interpolation, rand);
   }
 
   const ImageDataType data_type = (ImageDataType)info.data_type;
@@ -325,10 +230,11 @@ ccl_device float4 kernel_tex_image_interp_3d(
   }
 #else
   (void)kg;
+  (void)sd;
   (void)id;
   (void)P;
   (void)interp;
-  (void)randu;
+  (void)stochastic;
 #endif
 
   return make_float4(
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 40a4029d07c..780281d0c40 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -236,6 +236,15 @@ ccl_device_inline int3 operator>=(const float3 a, const float3 b)
 #  endif
 }
 
+ccl_device_inline int3 operator<(const float3 a, const float3 b)
+{
+#  ifdef __KERNEL_SSE__
+  return int3(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
+#  else
+  return make_int3(a.x < b.x, a.y < b.y, a.z < b.z); /* Lanes are 0/1 here, all-ones on SSE. */
+#  endif
+}
+
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
diff --git a/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png b/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
index 885913eb6a7..2f166b6283f 100644
--- a/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
+++ b/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:08f142a43bceeff0f7bce685b94b9e8941a63324b5133be08e521f25c7721dab
-size 26173
+oid sha256:97155e9c640a07366af930273f1766cf0b8687eedb11c974a00a2aced3456cc9
+size 26382
diff --git a/tests/files/render/openvdb/cycles_renders/principled_blackbody.png b/tests/files/render/openvdb/cycles_renders/principled_blackbody.png
index bcb2aa768a1..394601a3ca9 100644
--- a/tests/files/render/openvdb/cycles_renders/principled_blackbody.png
+++ b/tests/files/render/openvdb/cycles_renders/principled_blackbody.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b993264dab9cbcd26e21d6f145d4aa0e3a5773172d5b686e303c50b5cd42ca83
-size 38458
+oid sha256:f06fbabaf4959472a6c7862aab505aacbf2a0b093372bf88c8f86c1b159bbfb0
+size 35796
diff --git a/tests/files/render/openvdb/cycles_renders/smoke_color.png b/tests/files/render/openvdb/cycles_renders/smoke_color.png
index 71599564f60..ca6a92217b1 100644
--- a/tests/files/render/openvdb/cycles_renders/smoke_color.png
+++ b/tests/files/render/openvdb/cycles_renders/smoke_color.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f4ac8dc3388072549929d870c646a615ca01ac6c0f120385847fdb96fafd1eb
-size 30556
+oid sha256:ec2faf2e398b066a6ff3b4913e0710527e150dc5233c35e2d9b802799a130049
+size 30329
diff --git a/tests/files/render/openvdb/cycles_renders/smoke_fire.png b/tests/files/render/openvdb/cycles_renders/smoke_fire.png
index 5c005f2e1f1..f5cf387e5ca 100644
--- a/tests/files/render/openvdb/cycles_renders/smoke_fire.png
+++ b/tests/files/render/openvdb/cycles_renders/smoke_fire.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7f26f3e9e623841cd2e72957014ff4940c36067010cacdfeb4ee2ffda2925b9
-size 40947
+oid sha256:24ac21680f4a64e5b81263e13ac5d82391fa74acfc14068beb0298a033167c66
+size 41059
diff --git a/tests/files/render/volume/cycles_renders/overlapping_octrees.png b/tests/files/render/volume/cycles_renders/overlapping_octrees.png
index ec190c36d4c..d12a416f278 100644
--- a/tests/files/render/volume/cycles_renders/overlapping_octrees.png
+++ b/tests/files/render/volume/cycles_renders/overlapping_octrees.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab15d155b1c86e4664d7c12fe5dcea3020c6ba8669b95a40ebcfaeb5b9b4c0c2
-size 28176
+oid sha256:2470c3e7e5d52702c6962489ad585539c11474e1dc84fd90c1ca006406ec0f65
+size 28287