Cycles: use one-tap stochastic interpolation for volume

This gives a ~1.2x speed-up on CPU and a ~1.5x speed-up on GPU (tested with Metal on an M2 Ultra). Individual samples are noisier, but equal-time renders are mostly better. Note that volume emission renders differently than before. Pull Request: https://projects.blender.org/blender/blender/pulls/144451
2025-08-11 16:31:34 +02:00
parent 0c371ca3c5
commit df496eb894
11 changed files with 83 additions and 159 deletions
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -94,8 +94,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
    object_inverse_position_transform(kg, sd, &P);
    const InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
                                                                    INTERPOLATION_NONE;
-    return kernel_tex_image_interp_3d(
+    return kernel_tex_image_interp_3d(kg, sd, desc.offset, P, interp, stochastic);
        kg, desc.offset, P, interp, (stochastic) ? lcg_step_float(&sd->lcg_state) : -1.0f);
  }
  return zero_float4();
 }
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -1286,7 +1286,7 @@ bool OSLRenderServices::texture3d(OSLUStringHash filename,
      const int slot = handle->svm_slots[0].y;
      const float3 P_float3 = make_float3(P.x, P.y, P.z);
      float4 rgba = kernel_tex_image_interp_3d(
-          kernel_globals, slot, P_float3, INTERPOLATION_NONE, -1.0f);
+          kernel_globals, globals->sd, slot, P_float3, INTERPOLATION_NONE, false);
      result[0] = rgba[0];
      if (nchannels > 1) {
--- a/intern/cycles/kernel/osl/services_gpu.h
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -1072,7 +1072,8 @@ ccl_device_extern bool rs_texture3d(ccl_private ShaderGlobals *sg,
  switch (type) {
    case OSL_TEXTURE_HANDLE_TYPE_SVM: {
-      const float4 rgba = kernel_tex_image_interp_3d(nullptr, slot, *P, INTERPOLATION_NONE, -1.0f);
+      const float4 rgba = kernel_tex_image_interp_3d(
          nullptr, sg->sd, slot, *P, INTERPOLATION_NONE, false);
      if (nchannels > 0) {
        result[0] = rgba.x;
      }
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -26,6 +26,15 @@ template<class T> ccl_device float lcg_step_float(T rng)
  return (float)*rng * (1.0f / (float)0xFFFFFFFF);
 }
 /* Step the LCG state behind `rng` three times via lcg_step_float() and pack the
  * results into a float3: the first draw becomes x, the second y, the third z. */
 template<class T> ccl_device float3 lcg_step_float3(T rng)
 {
  /* Fill the components in a loop so the three generator steps are explicitly
   * sequenced; calling lcg_step_float() directly inside the make_float3()
   * argument list would leave their relative order up to the compiler. */
  float components[3];
  for (int i = 0; i < 3; i++) {
    components[i] = lcg_step_float(rng);
  }
  return make_float3(components[0], components[1], components[2]);
 }
 ccl_device uint lcg_init(const uint seed)
 {
  uint rng = seed;
--- a/intern/cycles/kernel/util/texture_3d.h
+++ b/intern/cycles/kernel/util/texture_3d.h
@@ -5,6 +5,8 @@
 #pragma once
 #include "kernel/globals.h"
 #include "kernel/sample/lcg.h"
 #include "util/texture.h"
 #if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
@@ -22,163 +24,61 @@ namespace {
 #endif
 #ifdef WITH_NANOVDB
-/* Stochastically turn a tricubic filter into a trilinear filter. */
+/* -------------------------------------------------------------------- */
-ccl_device_inline float3 interp_tricubic_to_trilinear_stochastic(const float3 P, float randu)
+/** Return the sample position for stochastic one-tap sampling.
 * From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810
 * \{ */
 ccl_device_inline float3 interp_tricubic_stochastic(const float3 P, ccl_private float3 &rand)
 {
  /* Some optimizations possible:
   * - Could use select() for SIMD if we split the random number into 10
   *   bits each and use those for each dimension.
   * - For GPU would be better not to compute P0 and P1 for all dimensions
   *   in advance?
   * - 1/g0 and 1/(1 - g0) are computed twice.
   */
  const float3 p = floor(P);
  const float3 t = P - p;
-  /* Cubic weights. */
+  /* Cubic interpolation weights. */
-  const float3 w0 = (1.0f / 6.0f) * (t * (t * (-t + 3.0f) - 3.0f) + 1.0f);
+  const float3 w[4] = {(((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f),
-  const float3 w1 = (1.0f / 6.0f) * (t * t * (3.0f * t - 6.0f) + 4.0f);
+                       ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f),
-  //    float3 w2 = (1.0f / 6.0f) * (t * (t * (-3.0f * t + 3.0f) + 3.0f) + 1.0f);
+                       ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f),
-  const float3 w3 = (1.0f / 6.0f) * (t * t * t);
+                       (1.0f / 6.0f) * t * t * t};
-  const float3 g0 = w0 + w1;
+  /* For reservoir sampling, always accept the first in the stream. */
-  const float3 P0 = p + (w1 / g0) - 1.0f;
+  float3 total_weight = w[0];
-  const float3 P1 = p + (w3 / (make_float3(1.0f) - g0)) + 1.0f;
+  float3 offset = make_float3(-1.0f);
-  float3 Pnew = P0;
+  for (int j = 1; j < 4; j++) {
-
+    total_weight += w[j];
-  if (randu < g0.x) {
+    const float3 thresh = w[j] / total_weight;
-    randu /= g0.x;
+    const auto mask = rand < thresh;
-  }
+    offset = select(mask, make_float3(float(j) - 1.0f), offset);
-  else {
+    rand = select(mask, safe_divide(rand, thresh), safe_divide(rand - thresh, 1.0f - thresh));
    Pnew.x = P1.x;
    randu = (randu - g0.x) / (1 - g0.x);
  }
-  if (randu < g0.y) {
+  return p + offset;
    randu /= g0.y;
  }
  else {
    Pnew.y = P1.y;
    randu = (randu - g0.y) / (1 - g0.y);
  }
  if (randu < g0.z) {
  }
  else {
    Pnew.z = P1.z;
  }
  return Pnew;
 }
-/* From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810
+ccl_device_inline float3 interp_trilinear_stochastic(const float3 P, const float3 rand)
 *
 * Could be used in specific situations where we are certain a single
 * tap is enough. Maybe better to try optimizing bilinear lookups in
 * NanoVDB (detect when fully inside a single leaf) than deal with this. */
 #  if 0
 ccl_device int3 interp_tricubic_stochastic(const float3 P, float randu)
 {
-  const float ix = floorf(P.x);
+  const float3 p = floor(P);
-  const float iy = floorf(P.y);
+  const float3 t = P - p;
-  const float iz = floorf(P.z);
+  return select(rand < t, p + 1.0f, p);
  const float deltas[3] = {P.x - ix, P.y - iy, P.z - iz};
  int idx[3] = {(int)ix - 1, (int)iy - 1, (int)iz - 1};
  for (int i = 0; i < 3; i++) {
    const float t = deltas[i];
    const float t2 = t * t;
    /* Weighted reservoir sampling, first tap always accepted */
    const float w0 = (1.0f / 6.0f) * (-t * t2 + 3 * t2 - 3 * t + 1);
    float sumWt = w0;
    int index = 0;
    /* TODO: reduce number of divisions? */
    /* Sample the other 3 filter taps. */
    {
      const float w1 = (1.0f / 6.0f) * (3 * t * t2 - 6 * t2 + 4);
      sumWt += w1;
      const float p = w1 / sumWt;
      if (randu < p) {
        index = 1;
        randu /= p;
      }
      else {
        randu = (randu - p) / (1 - p);
      }
    }
    {
      const float w2 = (1.0f / 6.0f) * (-3 * t * t2 + 3 * t2 + 3 * t + 1);
      sumWt += w2;
      const float p = w2 / sumWt;
      if (randu < p) {
        index = 2;
        randu /= p;
      }
      else {
        randu = (randu - p) / (1 - p);
      }
    }
    {
      const float w3 = (1.0f / 6.0f) * t * t2;
      sumWt += w3;
      const float p = w3 / sumWt;
      if (randu < p) {
        index = 3;
        randu /= p;
      }
      else {
        randu = (randu - p) / (1 - p);
      }
    }
    idx[i] += index;
  }
  return make_int3(idx[0], idx[1], idx[2]);
 }
-ccl_device int3 interp_trilinear_stochastic(const float3 P, float randu)
+ccl_device_inline float3 interp_stochastic(const float3 P,
                                           ccl_private InterpolationType &interpolation,
                                           ccl_private float3 &rand)
 {
-  const float ix = floorf(P.x);
+  float3 P_new = P;
-  const float iy = floorf(P.y);
+  if (interpolation == INTERPOLATION_CUBIC) {
-  const float iz = floorf(P.z);
+    P_new = interp_tricubic_stochastic(P, rand);
-  int idx[3] = {(int)ix, (int)iy, (int)iz};
+  }
-
+  else if (interpolation == INTERPOLATION_LINEAR) {
-  const float tx = P.x - ix;
+    P_new = interp_trilinear_stochastic(P, rand);
  const float ty = P.y - iy;
  const float tz = P.z - iz;
  if (randu < tx) {
    idx[0]++;
    randu /= tx;
  }
  else {
-    randu = (randu - tx) / (1 - tx);
+    kernel_assert(interpolation == INTERPOLATION_CLOSEST);
  }
-
+  interpolation = INTERPOLATION_CLOSEST;
-  if (randu < ty) {
+  return P_new;
    idx[1]++;
    randu /= ty;
  }
  else {
    randu = (randu - ty) / (1 - ty);
  }
  if (randu < tz) {
    idx[2]++;
  }
  return make_int3(idx[0], idx[1], idx[2]);
 }
-#  endif
+/** \} */
 template<typename OutT, typename Acc>
 ccl_device OutT kernel_tex_image_interp_trilinear_nanovdb(ccl_private Acc &acc, const float3 P)
@@ -278,8 +178,12 @@ OutT kernel_tex_image_interp_nanovdb(const ccl_global TextureInfo &info,
 }
 #endif /* WITH_NANOVDB */
-ccl_device float4 kernel_tex_image_interp_3d(
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
-    KernelGlobals kg, const int id, float3 P, InterpolationType interp, const float randu)
+                                             ccl_private ShaderData *sd,
                                             const int id,
                                             float3 P,
                                             InterpolationType interp,
                                             const bool stochastic)
 {
 #ifdef WITH_NANOVDB
  const ccl_global TextureInfo &info = kernel_data_fetch(texture_info, id);
@@ -294,9 +198,10 @@ ccl_device float4 kernel_tex_image_interp_3d(
  /* A -0.5 offset is used to center the cubic samples around the sample point. */
  P = P - make_float3(0.5f);
-  if (interpolation == INTERPOLATION_CUBIC && randu >= 0.0f) {
+
-    P = interp_tricubic_to_trilinear_stochastic(P, randu);
+  if (stochastic) {
-    interpolation = INTERPOLATION_LINEAR;
+    float3 rand = lcg_step_float3(&sd->lcg_state);
    P = interp_stochastic(P, interpolation, rand);
  }
  const ImageDataType data_type = (ImageDataType)info.data_type;
@@ -325,10 +230,11 @@ ccl_device float4 kernel_tex_image_interp_3d(
  }
 #else
  (void)kg;
  (void)sd;
  (void)id;
  (void)P;
  (void)interp;
-  (void)randu;
+  (void)stochastic;
 #endif
  return make_float4(
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -236,6 +236,15 @@ ccl_device_inline int3 operator>=(const float3 a, const float3 b)
 #  endif
 }
 /* Component-wise `a < b` for float3, returned as an int3 with one entry per
  * component.
  *
  * NOTE(review): the two code paths encode "true" differently. The scalar path
  * stores the result of the C++ comparison (0 or 1), while the SSE path
  * reinterprets the `_mm_cmplt_ps` result, where a true lane has all bits set.
  * Callers should presumably only test the entries for zero / non-zero (e.g. as
  * a select() mask) -- confirm against the call sites. */
 ccl_device_inline int3 operator<(const float3 a, const float3 b)
 {
 #  ifndef __KERNEL_SSE__
  /* Scalar fallback: compare each component on its own. */
  const int x_less = a.x < b.x;
  const int y_less = a.y < b.y;
  const int z_less = a.z < b.z;
  return make_int3(x_less, y_less, z_less);
 #  else
  /* SSE: one packed compare, then reinterpret the float lanes as integers. */
  return int3(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
 #  endif
 }
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
--- a/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
+++ b/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
--- a/tests/files/render/openvdb/cycles_renders/principled_blackbody.png
+++ b/tests/files/render/openvdb/cycles_renders/principled_blackbody.png
--- a/tests/files/render/openvdb/cycles_renders/smoke_color.png
+++ b/tests/files/render/openvdb/cycles_renders/smoke_color.png
--- a/tests/files/render/openvdb/cycles_renders/smoke_fire.png
+++ b/tests/files/render/openvdb/cycles_renders/smoke_fire.png
--- a/tests/files/render/volume/cycles_renders/overlapping_octrees.png
+++ b/tests/files/render/volume/cycles_renders/overlapping_octrees.png