Cycles: use one-tap stochastic interpolation for volume
This gives a ~1.2x speed-up on CPU and a ~1.5x speed-up on GPU (tested with Metal on an M2 Ultra). Individual samples are noisier, but equal-time renders are mostly better. Note that volume emission renders differently than before. Pull Request: https://projects.blender.org/blender/blender/pulls/144451
This commit is contained in:
@@ -94,8 +94,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
|
|||||||
object_inverse_position_transform(kg, sd, &P);
|
object_inverse_position_transform(kg, sd, &P);
|
||||||
const InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
|
const InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
|
||||||
INTERPOLATION_NONE;
|
INTERPOLATION_NONE;
|
||||||
return kernel_tex_image_interp_3d(
|
return kernel_tex_image_interp_3d(kg, sd, desc.offset, P, interp, stochastic);
|
||||||
kg, desc.offset, P, interp, (stochastic) ? lcg_step_float(&sd->lcg_state) : -1.0f);
|
|
||||||
}
|
}
|
||||||
return zero_float4();
|
return zero_float4();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1286,7 +1286,7 @@ bool OSLRenderServices::texture3d(OSLUStringHash filename,
|
|||||||
const int slot = handle->svm_slots[0].y;
|
const int slot = handle->svm_slots[0].y;
|
||||||
const float3 P_float3 = make_float3(P.x, P.y, P.z);
|
const float3 P_float3 = make_float3(P.x, P.y, P.z);
|
||||||
float4 rgba = kernel_tex_image_interp_3d(
|
float4 rgba = kernel_tex_image_interp_3d(
|
||||||
kernel_globals, slot, P_float3, INTERPOLATION_NONE, -1.0f);
|
kernel_globals, globals->sd, slot, P_float3, INTERPOLATION_NONE, false);
|
||||||
|
|
||||||
result[0] = rgba[0];
|
result[0] = rgba[0];
|
||||||
if (nchannels > 1) {
|
if (nchannels > 1) {
|
||||||
|
|||||||
@@ -1072,7 +1072,8 @@ ccl_device_extern bool rs_texture3d(ccl_private ShaderGlobals *sg,
|
|||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case OSL_TEXTURE_HANDLE_TYPE_SVM: {
|
case OSL_TEXTURE_HANDLE_TYPE_SVM: {
|
||||||
const float4 rgba = kernel_tex_image_interp_3d(nullptr, slot, *P, INTERPOLATION_NONE, -1.0f);
|
const float4 rgba = kernel_tex_image_interp_3d(
|
||||||
|
nullptr, sg->sd, slot, *P, INTERPOLATION_NONE, false);
|
||||||
if (nchannels > 0) {
|
if (nchannels > 0) {
|
||||||
result[0] = rgba.x;
|
result[0] = rgba.x;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,15 @@ template<class T> ccl_device float lcg_step_float(T rng)
|
|||||||
return (float)*rng * (1.0f / (float)0xFFFFFFFF);
|
return (float)*rng * (1.0f / (float)0xFFFFFFFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class T> ccl_device float3 lcg_step_float3(T rng)
{
  /* Draw three values from the generator referenced by `rng` and pack them into a float3.
   *
   * Each draw lives in its own statement on purpose: C++ leaves the evaluation order of
   * function arguments unspecified, so nesting the calls directly inside make_float3()
   * would not guarantee which draw ends up in which component. */
  const float first = lcg_step_float(rng);
  const float second = lcg_step_float(rng);
  const float third = lcg_step_float(rng);

  return make_float3(first, second, third);
}
|
||||||
|
|
||||||
ccl_device uint lcg_init(const uint seed)
|
ccl_device uint lcg_init(const uint seed)
|
||||||
{
|
{
|
||||||
uint rng = seed;
|
uint rng = seed;
|
||||||
|
|||||||
@@ -5,6 +5,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "kernel/globals.h"
|
#include "kernel/globals.h"
|
||||||
|
#include "kernel/sample/lcg.h"
|
||||||
|
|
||||||
#include "util/texture.h"
|
#include "util/texture.h"
|
||||||
|
|
||||||
#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
|
#if !defined(__KERNEL_METAL__) && !defined(__KERNEL_ONEAPI__)
|
||||||
@@ -22,163 +24,61 @@ namespace {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef WITH_NANOVDB
|
#ifdef WITH_NANOVDB
|
||||||
/* Stochastically turn a tricubic filter into a trilinear filter. */
|
/* -------------------------------------------------------------------- */
|
||||||
ccl_device_inline float3 interp_tricubic_to_trilinear_stochastic(const float3 P, float randu)
|
/** Return the sample position for stochastic one-tap sampling.
|
||||||
|
* From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810
|
||||||
|
* \{ */
|
||||||
|
ccl_device_inline float3 interp_tricubic_stochastic(const float3 P, ccl_private float3 &rand)
|
||||||
{
|
{
|
||||||
/* Some optimizations possible:
|
|
||||||
* - Could use select() for SIMD if we split the random number into 10
|
|
||||||
* bits each and use that for each dimensions.
|
|
||||||
* - For GPU would be better not to compute P0 and P1 for all dimensions
|
|
||||||
* in advance?
|
|
||||||
* - 1/g0 and 1/(1 - g0) are computed twice.
|
|
||||||
*/
|
|
||||||
|
|
||||||
const float3 p = floor(P);
|
const float3 p = floor(P);
|
||||||
const float3 t = P - p;
|
const float3 t = P - p;
|
||||||
|
|
||||||
/* Cubic weights. */
|
/* Cubic interpolation weights. */
|
||||||
const float3 w0 = (1.0f / 6.0f) * (t * (t * (-t + 3.0f) - 3.0f) + 1.0f);
|
const float3 w[4] = {(((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f),
|
||||||
const float3 w1 = (1.0f / 6.0f) * (t * t * (3.0f * t - 6.0f) + 4.0f);
|
((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f),
|
||||||
// float3 w2 = (1.0f / 6.0f) * (t * (t * (-3.0f * t + 3.0f) + 3.0f) + 1.0f);
|
((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f),
|
||||||
const float3 w3 = (1.0f / 6.0f) * (t * t * t);
|
(1.0f / 6.0f) * t * t * t};
|
||||||
|
|
||||||
const float3 g0 = w0 + w1;
|
/* For reservoir sampling, always accept the first in the stream. */
|
||||||
const float3 P0 = p + (w1 / g0) - 1.0f;
|
float3 total_weight = w[0];
|
||||||
const float3 P1 = p + (w3 / (make_float3(1.0f) - g0)) + 1.0f;
|
float3 offset = make_float3(-1.0f);
|
||||||
|
|
||||||
float3 Pnew = P0;
|
for (int j = 1; j < 4; j++) {
|
||||||
|
total_weight += w[j];
|
||||||
if (randu < g0.x) {
|
const float3 thresh = w[j] / total_weight;
|
||||||
randu /= g0.x;
|
const auto mask = rand < thresh;
|
||||||
}
|
offset = select(mask, make_float3(float(j) - 1.0f), offset);
|
||||||
else {
|
rand = select(mask, safe_divide(rand, thresh), safe_divide(rand - thresh, 1.0f - thresh));
|
||||||
Pnew.x = P1.x;
|
|
||||||
randu = (randu - g0.x) / (1 - g0.x);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (randu < g0.y) {
|
return p + offset;
|
||||||
randu /= g0.y;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Pnew.y = P1.y;
|
|
||||||
randu = (randu - g0.y) / (1 - g0.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (randu < g0.z) {
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Pnew.z = P1.z;
|
|
||||||
}
|
|
||||||
|
|
||||||
return Pnew;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* From "Stochastic Texture Filtering": https://arxiv.org/abs/2305.05810
|
ccl_device_inline float3 interp_trilinear_stochastic(const float3 P, const float3 rand)
|
||||||
*
|
|
||||||
* Could be used in specific situations where we are certain a single
|
|
||||||
* tap is enough. Maybe better to try optimizing bilinear lookups in
|
|
||||||
* NanoVDB (detect when fully inside a single leaf) than deal with this. */
|
|
||||||
|
|
||||||
# if 0
|
|
||||||
ccl_device int3 interp_tricubic_stochastic(const float3 P, float randu)
|
|
||||||
{
|
{
|
||||||
const float ix = floorf(P.x);
|
const float3 p = floor(P);
|
||||||
const float iy = floorf(P.y);
|
const float3 t = P - p;
|
||||||
const float iz = floorf(P.z);
|
return select(rand < t, p + 1.0f, p);
|
||||||
const float deltas[3] = {P.x - ix, P.y - iy, P.z - iz};
|
|
||||||
int idx[3] = {(int)ix - 1, (int)iy - 1, (int)iz - 1};
|
|
||||||
|
|
||||||
for (int i = 0; i < 3; i++) {
|
|
||||||
const float t = deltas[i];
|
|
||||||
const float t2 = t * t;
|
|
||||||
|
|
||||||
/* Weighted reservoir sampling, first tap always accepted */
|
|
||||||
const float w0 = (1.0f / 6.0f) * (-t * t2 + 3 * t2 - 3 * t + 1);
|
|
||||||
float sumWt = w0;
|
|
||||||
int index = 0;
|
|
||||||
|
|
||||||
/* TODO: reduce number of divisions? */
|
|
||||||
|
|
||||||
/* Sample the other 3 filter taps. */
|
|
||||||
{
|
|
||||||
const float w1 = (1.0f / 6.0f) * (3 * t * t2 - 6 * t2 + 4);
|
|
||||||
sumWt += w1;
|
|
||||||
const float p = w1 / sumWt;
|
|
||||||
if (randu < p) {
|
|
||||||
index = 1;
|
|
||||||
randu /= p;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
randu = (randu - p) / (1 - p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float w2 = (1.0f / 6.0f) * (-3 * t * t2 + 3 * t2 + 3 * t + 1);
|
|
||||||
sumWt += w2;
|
|
||||||
const float p = w2 / sumWt;
|
|
||||||
if (randu < p) {
|
|
||||||
index = 2;
|
|
||||||
randu /= p;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
randu = (randu - p) / (1 - p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float w3 = (1.0f / 6.0f) * t * t2;
|
|
||||||
sumWt += w3;
|
|
||||||
const float p = w3 / sumWt;
|
|
||||||
if (randu < p) {
|
|
||||||
index = 3;
|
|
||||||
randu /= p;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
randu = (randu - p) / (1 - p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
idx[i] += index;
|
|
||||||
}
|
|
||||||
|
|
||||||
return make_int3(idx[0], idx[1], idx[2]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device int3 interp_trilinear_stochastic(const float3 P, float randu)
|
ccl_device_inline float3 interp_stochastic(const float3 P,
|
||||||
|
ccl_private InterpolationType &interpolation,
|
||||||
|
ccl_private float3 &rand)
|
||||||
{
|
{
|
||||||
const float ix = floorf(P.x);
|
float3 P_new = P;
|
||||||
const float iy = floorf(P.y);
|
if (interpolation == INTERPOLATION_CUBIC) {
|
||||||
const float iz = floorf(P.z);
|
P_new = interp_tricubic_stochastic(P, rand);
|
||||||
int idx[3] = {(int)ix, (int)iy, (int)iz};
|
}
|
||||||
|
else if (interpolation == INTERPOLATION_LINEAR) {
|
||||||
const float tx = P.x - ix;
|
P_new = interp_trilinear_stochastic(P, rand);
|
||||||
const float ty = P.y - iy;
|
|
||||||
const float tz = P.z - iz;
|
|
||||||
|
|
||||||
if (randu < tx) {
|
|
||||||
idx[0]++;
|
|
||||||
randu /= tx;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
randu = (randu - tx) / (1 - tx);
|
kernel_assert(interpolation == INTERPOLATION_CLOSEST);
|
||||||
}
|
}
|
||||||
|
interpolation = INTERPOLATION_CLOSEST;
|
||||||
if (randu < ty) {
|
return P_new;
|
||||||
idx[1]++;
|
|
||||||
randu /= ty;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
randu = (randu - ty) / (1 - ty);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (randu < tz) {
|
|
||||||
idx[2]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return make_int3(idx[0], idx[1], idx[2]);
|
|
||||||
}
|
}
|
||||||
# endif
|
/** \} */
|
||||||
|
|
||||||
template<typename OutT, typename Acc>
|
template<typename OutT, typename Acc>
|
||||||
ccl_device OutT kernel_tex_image_interp_trilinear_nanovdb(ccl_private Acc &acc, const float3 P)
|
ccl_device OutT kernel_tex_image_interp_trilinear_nanovdb(ccl_private Acc &acc, const float3 P)
|
||||||
@@ -278,8 +178,12 @@ OutT kernel_tex_image_interp_nanovdb(const ccl_global TextureInfo &info,
|
|||||||
}
|
}
|
||||||
#endif /* WITH_NANOVDB */
|
#endif /* WITH_NANOVDB */
|
||||||
|
|
||||||
ccl_device float4 kernel_tex_image_interp_3d(
|
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
|
||||||
KernelGlobals kg, const int id, float3 P, InterpolationType interp, const float randu)
|
ccl_private ShaderData *sd,
|
||||||
|
const int id,
|
||||||
|
float3 P,
|
||||||
|
InterpolationType interp,
|
||||||
|
const bool stochastic)
|
||||||
{
|
{
|
||||||
#ifdef WITH_NANOVDB
|
#ifdef WITH_NANOVDB
|
||||||
const ccl_global TextureInfo &info = kernel_data_fetch(texture_info, id);
|
const ccl_global TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||||
@@ -294,9 +198,10 @@ ccl_device float4 kernel_tex_image_interp_3d(
|
|||||||
|
|
||||||
/* A -0.5 offset is used to center the cubic samples around the sample point. */
|
/* A -0.5 offset is used to center the cubic samples around the sample point. */
|
||||||
P = P - make_float3(0.5f);
|
P = P - make_float3(0.5f);
|
||||||
if (interpolation == INTERPOLATION_CUBIC && randu >= 0.0f) {
|
|
||||||
P = interp_tricubic_to_trilinear_stochastic(P, randu);
|
if (stochastic) {
|
||||||
interpolation = INTERPOLATION_LINEAR;
|
float3 rand = lcg_step_float3(&sd->lcg_state);
|
||||||
|
P = interp_stochastic(P, interpolation, rand);
|
||||||
}
|
}
|
||||||
|
|
||||||
const ImageDataType data_type = (ImageDataType)info.data_type;
|
const ImageDataType data_type = (ImageDataType)info.data_type;
|
||||||
@@ -325,10 +230,11 @@ ccl_device float4 kernel_tex_image_interp_3d(
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
(void)kg;
|
(void)kg;
|
||||||
|
(void)sd;
|
||||||
(void)id;
|
(void)id;
|
||||||
(void)P;
|
(void)P;
|
||||||
(void)interp;
|
(void)interp;
|
||||||
(void)randu;
|
(void)stochastic;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return make_float4(
|
return make_float4(
|
||||||
|
|||||||
@@ -236,6 +236,15 @@ ccl_device_inline int3 operator>=(const float3 a, const float3 b)
|
|||||||
# endif
|
# endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Component-wise `a < b` comparison of two float3 values, returned as an int3.
 *
 * NOTE(review): the two paths encode "true" differently. The SSE compare yields a per-lane
 * bit mask (all bits set when the comparison holds), while the scalar fallback stores the
 * bool result converted to int (1). Callers should only rely on zero vs. non-zero. */
ccl_device_inline int3 operator<(const float3 a, const float3 b)
{
#  ifdef __KERNEL_SSE__
  /* Single packed compare, then reinterpret the float lanes as integers without conversion. */
  const __m128 lane_mask = _mm_cmplt_ps(a.m128, b.m128);
  return int3(_mm_castps_si128(lane_mask));
#  else
  const int x_less = a.x < b.x;
  const int y_less = a.y < b.y;
  const int z_less = a.z < b.z;
  return make_int3(x_less, y_less, z_less);
#  endif
}
|
||||||
|
|
||||||
ccl_device_inline float dot(const float3 a, const float3 b)
|
ccl_device_inline float dot(const float3 a, const float3 b)
|
||||||
{
|
{
|
||||||
# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
|
# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
|
||||||
|
|||||||
BIN
tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
(Stored with Git LFS)
BIN
tests/files/render/openvdb/cycles_renders/openvdb_overlap.png
(Stored with Git LFS)
Binary file not shown.
BIN
tests/files/render/openvdb/cycles_renders/principled_blackbody.png
(Stored with Git LFS)
BIN
tests/files/render/openvdb/cycles_renders/principled_blackbody.png
(Stored with Git LFS)
Binary file not shown.
BIN
tests/files/render/openvdb/cycles_renders/smoke_color.png
(Stored with Git LFS)
BIN
tests/files/render/openvdb/cycles_renders/smoke_color.png
(Stored with Git LFS)
Binary file not shown.
BIN
tests/files/render/openvdb/cycles_renders/smoke_fire.png
(Stored with Git LFS)
BIN
tests/files/render/openvdb/cycles_renders/smoke_fire.png
(Stored with Git LFS)
Binary file not shown.
BIN
tests/files/render/volume/cycles_renders/overlapping_octrees.png
(Stored with Git LFS)
BIN
tests/files/render/volume/cycles_renders/overlapping_octrees.png
(Stored with Git LFS)
Binary file not shown.
Reference in New Issue
Block a user