From df496eb894d700e7b30ad177db931168bd7df82e Mon Sep 17 00:00:00 2001 From: Weizhen Huang Date: Mon, 11 Aug 2025 16:31:34 +0200 Subject: [PATCH] Cycles: use one-tap stochastic interpolation for volume It has ~1.2x speed-up on CPU and ~1.5x speed-up on GPU (tested on Metal M2 Ultra). Individual samples are noisier, but equal time renders are mostly better. Note that volume emission renders differently than before. Pull Request: https://projects.blender.org/blender/blender/pulls/144451 --- intern/cycles/kernel/geom/volume.h | 3 +- intern/cycles/kernel/osl/services.cpp | 2 +- intern/cycles/kernel/osl/services_gpu.h | 3 +- intern/cycles/kernel/sample/lcg.h | 9 + intern/cycles/kernel/util/texture_3d.h | 196 +++++------------- intern/cycles/util/math_float3.h | 9 + .../cycles_renders/openvdb_overlap.png | 4 +- .../cycles_renders/principled_blackbody.png | 4 +- .../openvdb/cycles_renders/smoke_color.png | 4 +- .../openvdb/cycles_renders/smoke_fire.png | 4 +- .../cycles_renders/overlapping_octrees.png | 4 +- 11 files changed, 83 insertions(+), 159 deletions(-) diff --git a/intern/cycles/kernel/geom/volume.h b/intern/cycles/kernel/geom/volume.h index 1b3a3f1f625..90c3a835ca8 100644 --- a/intern/cycles/kernel/geom/volume.h +++ b/intern/cycles/kernel/geom/volume.h @@ -94,8 +94,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg, object_inverse_position_transform(kg, sd, &P); const InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC : INTERPOLATION_NONE; - return kernel_tex_image_interp_3d( - kg, desc.offset, P, interp, (stochastic) ? 
lcg_step_float(&sd->lcg_state) : -1.0f); + return kernel_tex_image_interp_3d(kg, sd, desc.offset, P, interp, stochastic); } return zero_float4(); } diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp index 4a4c4b51313..ad1cd5339b9 100644 --- a/intern/cycles/kernel/osl/services.cpp +++ b/intern/cycles/kernel/osl/services.cpp @@ -1286,7 +1286,7 @@ bool OSLRenderServices::texture3d(OSLUStringHash filename, const int slot = handle->svm_slots[0].y; const float3 P_float3 = make_float3(P.x, P.y, P.z); float4 rgba = kernel_tex_image_interp_3d( - kernel_globals, slot, P_float3, INTERPOLATION_NONE, -1.0f); + kernel_globals, globals->sd, slot, P_float3, INTERPOLATION_NONE, false); result[0] = rgba[0]; if (nchannels > 1) { diff --git a/intern/cycles/kernel/osl/services_gpu.h b/intern/cycles/kernel/osl/services_gpu.h index 21ae8ddc82a..8ac3cc41c81 100644 --- a/intern/cycles/kernel/osl/services_gpu.h +++ b/intern/cycles/kernel/osl/services_gpu.h @@ -1072,7 +1072,8 @@ ccl_device_extern bool rs_texture3d(ccl_private ShaderGlobals *sg, switch (type) { case OSL_TEXTURE_HANDLE_TYPE_SVM: { - const float4 rgba = kernel_tex_image_interp_3d(nullptr, slot, *P, INTERPOLATION_NONE, -1.0f); + const float4 rgba = kernel_tex_image_interp_3d( + nullptr, sg->sd, slot, *P, INTERPOLATION_NONE, false); if (nchannels > 0) { result[0] = rgba.x; } diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h index 39fde424835..298dfdfe609 100644 --- a/intern/cycles/kernel/sample/lcg.h +++ b/intern/cycles/kernel/sample/lcg.h @@ -26,6 +26,15 @@ template ccl_device float lcg_step_float(T rng) return (float)*rng * (1.0f / (float)0xFFFFFFFF); } +template ccl_device float3 lcg_step_float3(T rng) +{ + /* Make sure the random numbers are evaluated in order. 
*/ + const float rand_x = lcg_step_float(rng); + const float rand_y = lcg_step_float(rng); + const float rand_z = lcg_step_float(rng); + return make_float3(rand_x, rand_y, rand_z); +} + ccl_device uint lcg_init(const uint seed) { uint rng = seed; diff --git a/intern/cycles/kernel/util/texture_3d.h b/intern/cycles/kernel/util/texture_3d.h index 865db289024..5fa149b49bb 100644 --- a/intern/cycles/kernel/util/texture_3d.h +++ b/intern/cycles/kernel/util/texture_3d.h @@ -5,6 +5,8 @@ #pragma once #include "kernel/globals.h" +#include "kernel/sample/lcg.h" + #include "util/texture.h" #if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__) @@ -22,163 +24,61 @@ namespace { #endif #ifdef WITH_NANOVDB -/* Stochastically turn a tricubic filter into a trilinear filter. */ -ccl_device_inline float3 interp_tricubic_to_trilinear_stochastic(const float3 P, float randu) +/* -------------------------------------------------------------------- */ +/** Return the sample position for stochastic one-tap sampling. + * From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810 + * \{ */ +ccl_device_inline float3 interp_tricubic_stochastic(const float3 P, ccl_private float3 &rand) { - /* Some optimizations possible: - * - Could use select() for SIMD if we split the random number into 10 - * bits each and use that for each dimensions. - * - For GPU would be better not to compute P0 and P1 for all dimensions - * in advance? - * - 1/g0 and 1/(1 - g0) are computed twice. - */ - const float3 p = floor(P); const float3 t = P - p; - /* Cubic weights. */ - const float3 w0 = (1.0f / 6.0f) * (t * (t * (-t + 3.0f) - 3.0f) + 1.0f); - const float3 w1 = (1.0f / 6.0f) * (t * t * (3.0f * t - 6.0f) + 4.0f); - // float3 w2 = (1.0f / 6.0f) * (t * (t * (-3.0f * t + 3.0f) + 3.0f) + 1.0f); - const float3 w3 = (1.0f / 6.0f) * (t * t * t); + /* Cubic interpolation weights. 
*/ + const float3 w[4] = {(((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f), + ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f), + ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f), + (1.0f / 6.0f) * t * t * t}; - const float3 g0 = w0 + w1; - const float3 P0 = p + (w1 / g0) - 1.0f; - const float3 P1 = p + (w3 / (make_float3(1.0f) - g0)) + 1.0f; + /* For reservoir sampling, always accept the first in the stream. */ + float3 total_weight = w[0]; + float3 offset = make_float3(-1.0f); - float3 Pnew = P0; - - if (randu < g0.x) { - randu /= g0.x; - } - else { - Pnew.x = P1.x; - randu = (randu - g0.x) / (1 - g0.x); + for (int j = 1; j < 4; j++) { + total_weight += w[j]; + const float3 thresh = w[j] / total_weight; + const auto mask = rand < thresh; + offset = select(mask, make_float3(float(j) - 1.0f), offset); + rand = select(mask, safe_divide(rand, thresh), safe_divide(rand - thresh, 1.0f - thresh)); } - if (randu < g0.y) { - randu /= g0.y; - } - else { - Pnew.y = P1.y; - randu = (randu - g0.y) / (1 - g0.y); - } - - if (randu < g0.z) { - } - else { - Pnew.z = P1.z; - } - - return Pnew; + return p + offset; } -/* From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810 - * - * Could be used in specific situations where we are certain a single - * tap is enough. Maybe better to try optimizing bilinear lookups in - * NanoVDB (detect when fully inside a single leaf) than deal with this. 
*/ - -# if 0 -ccl_device int3 interp_tricubic_stochastic(const float3 P, float randu) +ccl_device_inline float3 interp_trilinear_stochastic(const float3 P, const float3 rand) { - const float ix = floorf(P.x); - const float iy = floorf(P.y); - const float iz = floorf(P.z); - const float deltas[3] = {P.x - ix, P.y - iy, P.z - iz}; - int idx[3] = {(int)ix - 1, (int)iy - 1, (int)iz - 1}; - - for (int i = 0; i < 3; i++) { - const float t = deltas[i]; - const float t2 = t * t; - - /* Weighted reservoir sampling, first tap always accepted */ - const float w0 = (1.0f / 6.0f) * (-t * t2 + 3 * t2 - 3 * t + 1); - float sumWt = w0; - int index = 0; - - /* TODO: reduce number of divisions? */ - - /* Sample the other 3 filter taps. */ - { - const float w1 = (1.0f / 6.0f) * (3 * t * t2 - 6 * t2 + 4); - sumWt += w1; - const float p = w1 / sumWt; - if (randu < p) { - index = 1; - randu /= p; - } - else { - randu = (randu - p) / (1 - p); - } - } - - { - const float w2 = (1.0f / 6.0f) * (-3 * t * t2 + 3 * t2 + 3 * t + 1); - sumWt += w2; - const float p = w2 / sumWt; - if (randu < p) { - index = 2; - randu /= p; - } - else { - randu = (randu - p) / (1 - p); - } - } - - { - const float w3 = (1.0f / 6.0f) * t * t2; - sumWt += w3; - const float p = w3 / sumWt; - if (randu < p) { - index = 3; - randu /= p; - } - else { - randu = (randu - p) / (1 - p); - } - } - - idx[i] += index; - } - - return make_int3(idx[0], idx[1], idx[2]); + const float3 p = floor(P); + const float3 t = P - p; + return select(rand < t, p + 1.0f, p); } -ccl_device int3 interp_trilinear_stochastic(const float3 P, float randu) +ccl_device_inline float3 interp_stochastic(const float3 P, + ccl_private InterpolationType &interpolation, + ccl_private float3 &rand) { - const float ix = floorf(P.x); - const float iy = floorf(P.y); - const float iz = floorf(P.z); - int idx[3] = {(int)ix, (int)iy, (int)iz}; - - const float tx = P.x - ix; - const float ty = P.y - iy; - const float tz = P.z - iz; - - if (randu < tx) { - 
idx[0]++; - randu /= tx; + float3 P_new = P; + if (interpolation == INTERPOLATION_CUBIC) { + P_new = interp_tricubic_stochastic(P, rand); + } + else if (interpolation == INTERPOLATION_LINEAR) { + P_new = interp_trilinear_stochastic(P, rand); } else { - randu = (randu - tx) / (1 - tx); + kernel_assert(interpolation == INTERPOLATION_CLOSEST); } - - if (randu < ty) { - idx[1]++; - randu /= ty; - } - else { - randu = (randu - ty) / (1 - ty); - } - - if (randu < tz) { - idx[2]++; - } - - return make_int3(idx[0], idx[1], idx[2]); + interpolation = INTERPOLATION_CLOSEST; + return P_new; } -# endif +/** \} */ template ccl_device OutT kernel_tex_image_interp_trilinear_nanovdb(ccl_private Acc &acc, const float3 P) @@ -278,8 +178,12 @@ OutT kernel_tex_image_interp_nanovdb(const ccl_global TextureInfo &info, } #endif /* WITH_NANOVDB */ -ccl_device float4 kernel_tex_image_interp_3d( - KernelGlobals kg, const int id, float3 P, InterpolationType interp, const float randu) +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, + ccl_private ShaderData *sd, + const int id, + float3 P, + InterpolationType interp, + const bool stochastic) { #ifdef WITH_NANOVDB const ccl_global TextureInfo &info = kernel_data_fetch(texture_info, id); @@ -294,9 +198,10 @@ ccl_device float4 kernel_tex_image_interp_3d( /* A -0.5 offset is used to center the cubic samples around the sample point. 
*/ P = P - make_float3(0.5f); - if (interpolation == INTERPOLATION_CUBIC && randu >= 0.0f) { - P = interp_tricubic_to_trilinear_stochastic(P, randu); - interpolation = INTERPOLATION_LINEAR; + + if (stochastic) { + float3 rand = lcg_step_float3(&sd->lcg_state); + P = interp_stochastic(P, interpolation, rand); } const ImageDataType data_type = (ImageDataType)info.data_type; @@ -325,10 +230,11 @@ ccl_device float4 kernel_tex_image_interp_3d( } #else (void)kg; + (void)sd; (void)id; (void)P; (void)interp; - (void)randu; + (void)stochastic; #endif return make_float4( diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index 40a4029d07c..780281d0c40 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -236,6 +236,15 @@ ccl_device_inline int3 operator>=(const float3 a, const float3 b) # endif } +ccl_device_inline int3 operator<(const float3 a, const float3 b) +{ +# ifdef __KERNEL_SSE__ + return int3(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); +# else + return make_int3(a.x < b.x, a.y < b.y, a.z < b.z); +# endif +} + ccl_device_inline float dot(const float3 a, const float3 b) { # if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__) diff --git a/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png b/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png index 885913eb6a7..2f166b6283f 100644 --- a/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png +++ b/tests/files/render/openvdb/cycles_renders/openvdb_overlap.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08f142a43bceeff0f7bce685b94b9e8941a63324b5133be08e521f25c7721dab -size 26173 +oid sha256:97155e9c640a07366af930273f1766cf0b8687eedb11c974a00a2aced3456cc9 +size 26382 diff --git a/tests/files/render/openvdb/cycles_renders/principled_blackbody.png b/tests/files/render/openvdb/cycles_renders/principled_blackbody.png index bcb2aa768a1..394601a3ca9 100644 --- 
a/tests/files/render/openvdb/cycles_renders/principled_blackbody.png +++ b/tests/files/render/openvdb/cycles_renders/principled_blackbody.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b993264dab9cbcd26e21d6f145d4aa0e3a5773172d5b686e303c50b5cd42ca83 -size 38458 +oid sha256:f06fbabaf4959472a6c7862aab505aacbf2a0b093372bf88c8f86c1b159bbfb0 +size 35796 diff --git a/tests/files/render/openvdb/cycles_renders/smoke_color.png b/tests/files/render/openvdb/cycles_renders/smoke_color.png index 71599564f60..ca6a92217b1 100644 --- a/tests/files/render/openvdb/cycles_renders/smoke_color.png +++ b/tests/files/render/openvdb/cycles_renders/smoke_color.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f4ac8dc3388072549929d870c646a615ca01ac6c0f120385847fdb96fafd1eb -size 30556 +oid sha256:ec2faf2e398b066a6ff3b4913e0710527e150dc5233c35e2d9b802799a130049 +size 30329 diff --git a/tests/files/render/openvdb/cycles_renders/smoke_fire.png b/tests/files/render/openvdb/cycles_renders/smoke_fire.png index 5c005f2e1f1..f5cf387e5ca 100644 --- a/tests/files/render/openvdb/cycles_renders/smoke_fire.png +++ b/tests/files/render/openvdb/cycles_renders/smoke_fire.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7f26f3e9e623841cd2e72957014ff4940c36067010cacdfeb4ee2ffda2925b9 -size 40947 +oid sha256:24ac21680f4a64e5b81263e13ac5d82391fa74acfc14068beb0298a033167c66 +size 41059 diff --git a/tests/files/render/volume/cycles_renders/overlapping_octrees.png b/tests/files/render/volume/cycles_renders/overlapping_octrees.png index ec190c36d4c..d12a416f278 100644 --- a/tests/files/render/volume/cycles_renders/overlapping_octrees.png +++ b/tests/files/render/volume/cycles_renders/overlapping_octrees.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab15d155b1c86e4664d7c12fe5dcea3020c6ba8669b95a40ebcfaeb5b9b4c0c2 -size 28176 +oid sha256:2470c3e7e5d52702c6962489ad585539c11474e1dc84fd90c1ca006406ec0f65 +size 
28287