Cycles: Use RGBE for denoised guiding buffers to reduce memory usage

Co-authored-by: Brecht Van Lommel <brecht@blender.org>
This commit is contained in:
Weizhen Huang
2025-05-21 12:23:13 +02:00
committed by Weizhen Huang
parent 5cb6014efd
commit a4f8e0bfa2
37 changed files with 284 additions and 41 deletions

View File

@@ -41,6 +41,7 @@ CPUKernels::CPUKernels()
REGISTER_KERNEL_FILM_CONVERT(sample_count),
REGISTER_KERNEL_FILM_CONVERT(float),
REGISTER_KERNEL_FILM_CONVERT(light_path),
REGISTER_KERNEL_FILM_CONVERT(rgbe),
REGISTER_KERNEL_FILM_CONVERT(float3),
REGISTER_KERNEL_FILM_CONVERT(motion),
REGISTER_KERNEL_FILM_CONVERT(cryptomatte),

View File

@@ -119,6 +119,7 @@ class CPUKernels {
KERNEL_FILM_CONVERT_FUNCTION(float)
KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(rgbe)
KERNEL_FILM_CONVERT_FUNCTION(float3)
KERNEL_FILM_CONVERT_FUNCTION(motion)

View File

@@ -126,6 +126,7 @@ const char *device_kernel_as_string(DeviceKernel kernel)
FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
FILM_CONVERT_KERNEL_AS_STRING(RGBE, rgbe)
FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)

View File

@@ -316,7 +316,7 @@ void DenoiserGPU::denoise_color_read(const DenoiseContext &context, const Denois
const PassAccessorGPU pass_accessor(
denoiser_queue_.get(), pass_access_info, 1.0f, context.num_samples);
PassAccessor::Destination destination(pass_access_info.type);
PassAccessor::Destination destination(pass_access_info.type, pass_access_info.mode);
destination.d_pixels = context.render_buffers->buffer.device_pointer;
destination.num_components = 3;
destination.pixel_offset = pass.denoised_offset;

View File

@@ -53,7 +53,7 @@ class OIDNPass {
offset = buffer_params.get_pass_offset(type, mode);
need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);
const PassInfo pass_info = Pass::get_info(type);
const PassInfo pass_info = Pass::get_info(type, mode);
num_components = pass_info.num_components;
use_compositing = pass_info.use_compositing;
use_denoising_albedo = pass_info.use_denoising_albedo;

View File

@@ -33,9 +33,9 @@ PassAccessor::Destination::Destination(float *pixels, const int num_components)
{
}
PassAccessor::Destination::Destination(const PassType pass_type)
PassAccessor::Destination::Destination(const PassType pass_type, const PassMode pass_mode)
{
const PassInfo pass_info = Pass::get_info(pass_type);
const PassInfo pass_info = Pass::get_info(pass_type, pass_mode);
num_components = pass_info.num_components;
}
@@ -126,12 +126,16 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
const PassType type = pass_access_info_.type;
const PassMode mode = pass_access_info_.mode;
const PassInfo pass_info = Pass::get_info(
type, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
type, mode, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
int num_written_components = pass_info.num_components;
if (pass_info.num_components == 1) {
if (is_volume_guiding_pass(type)) {
get_pass_rgbe(render_buffers, buffer_params, destination);
num_written_components = 3;
}
/* Single channel passes. */
if (mode == PassMode::DENOISED) {
else if (mode == PassMode::DENOISED) {
/* Denoised passes store their final pixels, no need in special calculation. */
get_pass_float(render_buffers, buffer_params, destination);
}
@@ -226,8 +230,10 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
const Destination &destination) const
{
const PassMode mode = pass_access_info_.mode;
const PassInfo &pass_info = Pass::get_info(
pass_access_info_.type, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
mode,
pass_access_info_.include_albedo,
pass_access_info_.is_lightgroup);
kfilm_convert->pass_offset = pass_access_info_.offset;
kfilm_convert->pass_stride = buffer_params.pass_stride;
@@ -290,8 +296,10 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
return false;
}
const PassInfo pass_info = Pass::get_info(
pass_access_info_.type, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
pass_access_info_.mode,
pass_access_info_.include_albedo,
pass_access_info_.is_lightgroup);
const BufferParams &buffer_params = render_buffers->params;

View File

@@ -48,7 +48,7 @@ class PassAccessor {
/* Destination will be initialized with the number of components which is native for the given
* pass type and mode. */
explicit Destination(const PassType pass_type);
explicit Destination(const PassType pass_type, const PassMode pass_mode);
/* CPU-side pointers. only usable by the `PassAccessorCPU`. */
float *pixels = nullptr;
@@ -138,6 +138,7 @@ class PassAccessor {
/* Float3 passes. */
DECLARE_PASS_ACCESSOR(light_path)
DECLARE_PASS_ACCESSOR(shadow_catcher)
DECLARE_PASS_ACCESSOR(rgbe)
DECLARE_PASS_ACCESSOR(float3)
/* Float4 passes. */

View File

@@ -112,6 +112,7 @@ DEFINE_PASS_ACCESSOR(float)
/* Float3 passes. */
DEFINE_PASS_ACCESSOR(light_path)
DEFINE_PASS_ACCESSOR(shadow_catcher)
DEFINE_PASS_ACCESSOR(rgbe)
DEFINE_PASS_ACCESSOR(float3)
/* Float4 passes. */

View File

@@ -47,6 +47,7 @@ class PassAccessorCPU : public PassAccessor {
/* Float3 passes. */
DECLARE_PASS_ACCESSOR(light_path)
DECLARE_PASS_ACCESSOR(shadow_catcher)
DECLARE_PASS_ACCESSOR(rgbe)
DECLARE_PASS_ACCESSOR(float3)
/* Float4 passes. */

View File

@@ -96,6 +96,7 @@ DEFINE_PASS_ACCESSOR(float, FLOAT);
/* Float3 passes. */
DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
DEFINE_PASS_ACCESSOR(rgbe, RGBE);
DEFINE_PASS_ACCESSOR(float3, FLOAT3);
/* Float4 passes. */

View File

@@ -40,6 +40,7 @@ class PassAccessorGPU : public PassAccessor {
/* Float3 passes. */
DECLARE_PASS_ACCESSOR(light_path);
DECLARE_PASS_ACCESSOR(rgbe);
DECLARE_PASS_ACCESSOR(float3);
/* Float4 passes. */

View File

@@ -186,9 +186,9 @@ PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMod
}
PassAccessor::Destination PathTraceWork::get_display_destination_template(
const PathTraceDisplay *display) const
const PathTraceDisplay *display, const PassMode mode) const
{
PassAccessor::Destination destination(film_->get_display_pass());
PassAccessor::Destination destination(film_->get_display_pass(), mode);
const int2 display_texture_size = display->get_texture_size();
const int texture_x = effective_buffer_params_.full_x - effective_big_tile_params_.full_x +

View File

@@ -165,8 +165,8 @@ class PathTraceWork {
/* Get destination whose offset and stride are configured so that writing to it will write to a
* proper location of GPU display texture, taking current tile and device slice into account. */
PassAccessor::Destination get_display_destination_template(
const PathTraceDisplay *display) const;
PassAccessor::Destination get_display_destination_template(const PathTraceDisplay *display,
const PassMode mode) const;
/* Device which will be used for path tracing.
* Note that it is an actual render device (and never is a multi-device). */

View File

@@ -197,7 +197,7 @@ void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,
const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
PassAccessor::Destination destination = get_display_destination_template(display);
PassAccessor::Destination destination = get_display_destination_template(display, pass_mode);
destination.pixels_half_rgba = rgba_half;
tbb::task_arena local_arena = local_tbb_arena_create(device_);

View File

@@ -1018,7 +1018,7 @@ void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
queue_->zero_to_device(display_rgba_half_);
}
PassAccessor::Destination destination(film_->get_display_pass());
PassAccessor::Destination destination(film_->get_display_pass(), pass_mode);
destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;
get_render_tile_film_pixels(destination, pass_mode, num_samples);
@@ -1045,7 +1045,7 @@ bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,
return false;
}
PassAccessor::Destination destination = get_display_destination_template(display);
PassAccessor::Destination destination = get_display_destination_template(display, pass_mode);
destination.d_pixels_half_rgba = d_rgba_half;
get_render_tile_film_pixels(destination, pass_mode, num_samples);

View File

@@ -420,6 +420,7 @@ set(SRC_UTIL_HEADERS
../util/types_int4.h
../util/types_int8.h
../util/types_spectrum.h
../util/types_rgbe.h
../util/types_uchar2.h
../util/types_uchar3.h
../util/types_uchar4.h

View File

@@ -54,6 +54,7 @@ KERNEL_FILM_CONVERT_FUNCTION(volume_majorant)
KERNEL_FILM_CONVERT_FUNCTION(float)
KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(rgbe)
KERNEL_FILM_CONVERT_FUNCTION(float3)
KERNEL_FILM_CONVERT_FUNCTION(motion)

View File

@@ -374,6 +374,7 @@ KERNEL_FILM_CONVERT_FUNCTION(volume_majorant, true)
KERNEL_FILM_CONVERT_FUNCTION(float, true)
KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
KERNEL_FILM_CONVERT_FUNCTION(rgbe, false)
KERNEL_FILM_CONVERT_FUNCTION(float3, false)
KERNEL_FILM_CONVERT_FUNCTION(motion, false)

View File

@@ -892,6 +892,7 @@ KERNEL_FILM_CONVERT_VARIANT(float, 1)
/* 3 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
KERNEL_FILM_CONVERT_VARIANT(rgbe, 3)
KERNEL_FILM_CONVERT_VARIANT(float3, 3)
/* 4 channel inputs */

View File

@@ -277,6 +277,7 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
#define coshf(x) cosh(float(x))
#define tanhf(x) tanh(float(x))
#define saturatef(x) saturate(float(x))
#define ldexpf(x, y) ldexp(float(x), int(y))
/* Use native functions with possibly lower precision for performance,
* no issues found so far. */

View File

@@ -224,6 +224,7 @@ ccl_device_forceinline int __float_as_int(const float x)
#define fminf(x, y) sycl::fmin((x), (y))
#define fmodf(x, y) sycl::fmod((x), (y))
#define lgammaf(x) sycl::lgamma((x))
#define ldexpf(x, y) sycl::ldexp((x), (y))
#define cosf(x) sycl::native::cos(((float)(x)))
#define sinf(x) sycl::native::sin(((float)(x)))

View File

@@ -635,6 +635,7 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
DEVICE_KERNEL_FILM_CONVERT(rgbe, RGBE);
DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);

View File

@@ -171,6 +171,22 @@ ccl_device_inline void film_get_pass_pixel_volume_majorant(
pixel[0] = (*count != 0.0f) ? expf(-(f * scale_exposure) / *count) : 0.0f;
}
/* Read a pass that is stored as one RGBE-packed float and expand it to RGB.
 *
 * A single float is fetched at `pass_offset` inside `buffer`, unpacked with `rgbe_to_rgb()`, and
 * the three resulting channels are written to `pixel[0]`, `pixel[1]` and `pixel[2]`.
 *
 * NOTE(review): three values are always written even though only `num_components >= 1` is
 * asserted; presumably every caller provides room for at least three floats -- confirm. */
ccl_device_inline void film_get_pass_pixel_rgbe(const ccl_global KernelFilmConvert *ccl_restrict
                                                    kfilm_convert,
                                                const ccl_global float *ccl_restrict buffer,
                                                ccl_private float *ccl_restrict pixel)
{
  kernel_assert(kfilm_convert->num_components >= 1);
  kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);

  /* The whole RGB triple lives in the first (and only) float of the pass. */
  const ccl_global float *packed = buffer + kfilm_convert->pass_offset;
  const float3 rgb = rgbe_to_rgb(RGBE(packed[0]));

  pixel[0] = rgb.x;
  pixel[1] = rgb.y;
  pixel[2] = rgb.z;
}
ccl_device_inline void film_get_pass_pixel_float(const ccl_global KernelFilmConvert *ccl_restrict
kfilm_convert,
const ccl_global float *ccl_restrict buffer,

View File

@@ -60,15 +60,13 @@ ccl_device void volume_guiding_filter_x(KernelGlobals kg,
const float weight = gaussian_params[dx] /
__float_as_uint(buffer[kernel_data.film.pass_sample_count]);
scatter += fabs(kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter)) *
weight;
transmit += fabs(kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit)) *
weight;
scatter += kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter) * weight;
transmit += kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit) * weight;
}
/* Write to the buffer. */
film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
}
ccl_device void volume_guiding_filter_y(KernelGlobals kg,
@@ -109,9 +107,9 @@ ccl_device void volume_guiding_filter_y(KernelGlobals kg,
else {
ccl_global float *buffer = film_pass_pixel_render_buffer(
kg, x, y, offset, stride, render_buffer);
scatter_neighbors[i] = kernel_read_pass_float3(
buffer + kernel_data.film.pass_volume_scatter_denoised);
transmit_neighbors[i] = kernel_read_pass_float3(
scatter_neighbors[i] = kernel_read_pass_rgbe(buffer +
kernel_data.film.pass_volume_scatter_denoised);
transmit_neighbors[i] = kernel_read_pass_rgbe(
buffer + kernel_data.film.pass_volume_transmit_denoised);
}
}
@@ -128,9 +126,9 @@ ccl_device void volume_guiding_filter_y(KernelGlobals kg,
else {
ccl_global float *buffer = film_pass_pixel_render_buffer(
kg, x, next_y, offset, stride, render_buffer);
scatter_neighbors[index] = kernel_read_pass_float3(
scatter_neighbors[index] = kernel_read_pass_rgbe(
buffer + kernel_data.film.pass_volume_scatter_denoised);
transmit_neighbors[index] = kernel_read_pass_float3(
transmit_neighbors[index] = kernel_read_pass_rgbe(
buffer + kernel_data.film.pass_volume_transmit_denoised);
}
@@ -147,8 +145,10 @@ ccl_device void volume_guiding_filter_y(KernelGlobals kg,
/* Write to the buffers. */
ccl_global float *buffer = film_pass_pixel_render_buffer(
kg, x, y, offset, stride, render_buffer);
film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised, scatter);
film_overwrite_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised, transmit);
film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_scatter_denoised,
fabs(scatter));
film_overwrite_pass_rgbe(buffer + kernel_data.film.pass_volume_transmit_denoised,
fabs(transmit));
}
}

View File

@@ -10,6 +10,8 @@
#include "kernel/util/colorspace.h"
#include "util/types_rgbe.h"
#ifdef __KERNEL_GPU__
# include "util/atomic.h"
# define __ATOMIC_PASS_WRITE__
@@ -108,6 +110,12 @@ ccl_device_inline void film_write_pass_float4(ccl_global float *ccl_restrict buf
#endif
}
/* Store `value` into a single-float pass slot using the packed RGBE encoding.
 *
 * The previous content of `*buffer` is replaced, not accumulated. */
ccl_device_inline void film_overwrite_pass_rgbe(ccl_global float *ccl_restrict buffer,
                                                const float3 value)
{
  const RGBE packed = rgb_to_rgbe(value);
  buffer[0] = packed.f;
}
/* Overwrite for passes that only write on sample 0. This assumes only a single thread will write
* to this pixel and no atomics are needed. */
@@ -142,4 +150,9 @@ ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict
return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
}
/* Load one RGBE-packed float from `buffer` and return the RGB value it encodes. */
ccl_device_inline float3 kernel_read_pass_rgbe(const ccl_global float *ccl_restrict buffer)
{
  const RGBE packed(buffer[0]);
  return rgbe_to_rgb(packed);
}
CCL_NAMESPACE_END

View File

@@ -1012,9 +1012,9 @@ ccl_device_inline void volume_scatter_probability_get(KernelGlobals kg,
/* Contribution based criterion, see Eq. (15). */
const float L_scattered = reduce_add(
kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_scatter_denoised));
kernel_read_pass_rgbe(buffer + kernel_data.film.pass_volume_scatter_denoised));
const float L_transmitted = reduce_add(
kernel_read_pass_float3(buffer + kernel_data.film.pass_volume_transmit_denoised));
kernel_read_pass_rgbe(buffer + kernel_data.film.pass_volume_transmit_denoised));
const float L_volume = L_transmitted + L_scattered;
/* Compute guided scattering probability. */

View File

@@ -1885,6 +1885,7 @@ enum DeviceKernel : int {
DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
DECLARE_FILM_CONVERT_KERNEL(FLOAT),
DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
DECLARE_FILM_CONVERT_KERNEL(RGBE),
DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
DECLARE_FILM_CONVERT_KERNEL(MOTION),
DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),

View File

@@ -143,7 +143,7 @@ Pass::Pass() : Node(get_node_type()), is_auto_(false) {}
PassInfo Pass::get_info() const
{
return get_info(type, include_albedo, !lightgroup.empty());
return get_info(type, mode, include_albedo, !lightgroup.empty());
}
bool Pass::is_written() const
@@ -151,7 +151,10 @@ bool Pass::is_written() const
return get_info().is_written;
}
PassInfo Pass::get_info(const PassType type, const bool include_albedo, const bool is_lightgroup)
PassInfo Pass::get_info(const PassType type,
const PassMode mode,
const bool include_albedo,
const bool is_lightgroup)
{
PassInfo pass_info;
@@ -280,10 +283,9 @@ PassInfo Pass::get_info(const PassType type, const bool include_albedo, const bo
break;
case PASS_VOLUME_SCATTER:
case PASS_VOLUME_TRANSMIT:
/* TODO(weizhen): Gaussian filter only needs 1 component, but we can have negative pixel
* values in some channels, preventing us from simply add them together; besides, using RGB
* channels is better for visualization. We can optimize the memory by using RGBE format. */
pass_info.num_components = 3;
/* Noisy buffer needs higher precision for accumulating the contribution, denoised buffer is
* used directly and thus can have lower precision. */
pass_info.num_components = (mode == PassMode::NOISY) ? 3 : 1;
pass_info.use_exposure = true;
pass_info.use_filter = false;
pass_info.support_denoise = true;

View File

@@ -75,6 +75,7 @@ class Pass : public Node {
static const NodeEnum *get_mode_enum();
static PassInfo get_info(PassType type,
const PassMode mode = PassMode::DENOISED,
const bool include_albedo = false,
const bool is_lightgroup = false);

View File

@@ -68,7 +68,7 @@ BufferPass::BufferPass(const Pass *scene_pass)
PassInfo BufferPass::get_info() const
{
return Pass::get_info(type, include_albedo, !lightgroup.empty());
return Pass::get_info(type, mode, include_albedo, !lightgroup.empty());
}
/* --------------------------------------------------------------------

View File

@@ -39,6 +39,7 @@ set(SRC
util_math_fast_test.cpp
util_math_float3_test.cpp
util_math_float4_test.cpp
util_rgbe_test.cpp
util_md5_test.cpp
util_path_test.cpp
util_string_test.cpp

View File

@@ -0,0 +1,65 @@
/* SPDX-FileCopyrightText: 2025 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#include <gtest/gtest.h>
#include "util/log.h"
#include "util/types_rgbe.h"
CCL_NAMESPACE_BEGIN
/* Checks the exact quantized value produced by encoding to RGBE and decoding back. */
TEST(RGBE, round_trip)
{
  /* Encode and immediately decode; every expectation below is the quantized result. */
  const auto round_trip = [](const float3 rgb) { return rgbe_to_rgb(rgb_to_rgbe(rgb)); };

  /* Channels of similar magnitude, largest above one. */
  EXPECT_EQ(round_trip(make_float3(7.334898f, 5.811583f, 2.414717f)),
            make_float3(7.34375f, 5.8125f, 2.40625f));

  /* Channels of similar magnitude, all below one. */
  EXPECT_EQ(round_trip(make_float3(0.08750992f, 0.05150064f, 0.24991725f)),
            make_float3(0.087890625f, 0.05078125f, 0.25f));

  /* A dominant channel sets the shared exponent, so much smaller channels round to zero. */
  EXPECT_EQ(round_trip(make_float3(4e-6f, 30257.0f, 1.0f)), make_float3(0.0f, 30208.0f, 0.0f));

  /* Exact zero stays zero. */
  EXPECT_EQ(round_trip(zero_float3()), zero_float3());

  /* Just below half of the smallest representable value: rounds down to zero. */
  EXPECT_EQ(round_trip(make_float3(5.9e-8f, 0.0f, 0.0f)), zero_float3());

  /* Just above half of the smallest representable value: rounds up to it. */
  EXPECT_EQ(round_trip(make_float3(6.0e-8f, 0.0f, 0.0f)),
            make_float3(1.1920928955078125e-7f, 0.0f, 0.0f));

  /* Signs are kept per channel. */
  EXPECT_EQ(round_trip(make_float3(-0.863880f, 0.558654f, -0.223357f)),
            make_float3(-0.86328125f, 0.55859375f, -0.22265625f));

  /* Values beyond the representable range are clamped to the maximum magnitude. */
  EXPECT_EQ(round_trip(make_float3(-FLT_MAX, FLT_MAX, 0.0f)),
            make_float3(-65280.0f, 65280.0f, 0.0f));

  /* Infinity is clamped as well; the other channels are quantized at the largest exponent. */
  const float inf = __uint_as_float(0x7f800000);
  EXPECT_EQ(round_trip(make_float3(inf, 127.0f, 129.0f)), make_float3(65280.0f, 0.0f, 256.0f));

  /* No test for NaN, undefined behaviour. */
}
CCL_NAMESPACE_END

View File

@@ -115,6 +115,7 @@ set(SRC_HEADERS
types_int4.h
types_int8.h
types_spectrum.h
types_rgbe.h
types_uchar2.h
types_uchar3.h
types_uchar4.h

View File

@@ -668,6 +668,11 @@ ccl_device_inline float xor_mask(const float x, const uint y)
return __uint_as_float(__float_as_uint(x) ^ y);
}
/* Bitwise OR of the raw bit pattern of `x` with `y`, reinterpreted back as a float. */
ccl_device_inline float or_mask(const float x, const uint y)
{
  const uint bits = __float_as_uint(x);
  return __uint_as_float(bits | y);
}
ccl_device float bits_to_01(const uint bits)
{
return bits * (1.0f / (float)0xFFFFFFFF);

View File

@@ -341,6 +341,11 @@ ccl_device_inline float vector_angle(const float3 a, const float3 b)
return 2.0f * fast_atan2f(len(a - b), len(a + b));
}
/* Integer floor of log2(x), taken directly from the exponent field of the float.
 *
 * The sign bit is not masked out, so the result is only meaningful for positive inputs. Zero and
 * subnormal inputs give -127 because their exponent field is zero. */
ccl_device_inline int floor_log2f(const float x)
{
  const uint bits = __float_as_uint(x);
  const int biased_exponent = (int)(bits >> 23);
  return biased_exponent - 127;
}
/* Based on:
*
* https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
@@ -351,7 +356,7 @@ ccl_device float fast_log2f(float x)
* negative values/NAN's. */
x = clamp(x, FLT_MIN, FLT_MAX);
const unsigned bits = __float_as_uint(x);
const int exponent = (int)(bits >> 23) - 127;
const int exponent = floor_log2f(x);
const float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
/* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
* 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.

View File

@@ -26,6 +26,7 @@
#include "util/types_float4.h" // IWYU pragma: export
#include "util/types_float8.h" // IWYU pragma: export
#include "util/types_rgbe.h" // IWYU pragma: export
#include "util/types_spectrum.h" // IWYU pragma: export
#include "util/types_dual.h" // IWYU pragma: export

View File

@@ -0,0 +1,109 @@
/* SPDX-FileCopyrightText: 2025 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#include "util/math_fast.h"
#include "util/math_float3.h"
#include "util/types_base.h"
CCL_NAMESPACE_BEGIN
/* Four-byte packed RGB value with a shared exponent.
*
* The same storage can be viewed either as four individual bytes or as one float, so that a
* packed value can be written to and read from a float buffer slot unchanged. The float view
* only carries the bit pattern; it is not a meaningful number on its own. */
struct RGBE {
union {
struct {
/* `r`, `g`, `b` hold the per-channel mantissas; `e` is filled by `rgb_to_rgbe()` with the
* three sign bits in its upper bits and the biased exponent in its lower five bits. */
uint8_t r, g, b, e;
};
/* Float view over the same four bytes. */
float f;
};
/* Leaves the contents uninitialized. */
RGBE() = default;
/* Wrap an already packed value that was stored as a float. */
ccl_device_inline_method RGBE(const float f_) : f(f_) {}
};
/* The packed value must fit exactly in one float slot. */
static_assert(sizeof(RGBE) == 4, "RGBE expected to be exactly 4 bytes");
/**
* RGBE format represents an RGB value with 4 bytes.
*
* The original implementation by Greg Ward uses 8 bits for RGB each, plus 8 bits shared exponent.
* It has the same relative precision as the 0 to 255 range of standard 24-bit image formats, but
* offers extended intensity range:
* https://www.graphics.cornell.edu/~bjw/rgbe.html
* GL_EXT_texture_shared_exponent uses 9 bits for RGB and 5 bits for exponent instead, with smaller
* range but higher precision:
* https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
*
* Our implementation is mostly based on GL_EXT_texture_shared_exponent, but uses 8 bits for RGB
* each, and adds 3 sign bits to represent negative values. The memory layout is as follows:
*
* xxxxxxxx xxxxxxxx xxxxxxxx xxx xxxxx
* m(R) m(G) m(B) sgn exp
*
* Each float component is interpreted as
*   f = (-1)^sgn * 0.m * 2^(exp - bias)
*
* We choose a bias of 15, so that the largest representable value is
* RGBE_MAX = 0.11111111 * 2^(31 - 15) = 65280,
* and the smallest positive representable value is
* RGBE_MIN = 0.00000001 * 2^(0 - 15) = 1.1920929e-7
*/
#define RGBE_EXP_BIAS 15
#define RGBE_MANTISSA_BITS 8
#define RGBE_EXPONENT_BITS 5
#define RGBE_MAX 65280.0f
/* Pack an RGB triple into the 4-byte shared-exponent RGBE representation.
 *
 * The channel with the largest magnitude selects the exponent, all three magnitudes are then
 * quantized to 8-bit mantissas relative to it, and the signs are stored separately. Magnitudes
 * above RGBE_MAX are clamped; a triple whose largest magnitude is below half of the smallest
 * representable value is encoded as zero. */
ccl_device RGBE rgb_to_rgbe(float3 rgb)
{
  const float3 magnitude = fabs(rgb);

  /* The largest channel decides the shared exponent. */
  const float largest = min(reduce_max(magnitude), RGBE_MAX);

  /* Everything would round to a zero mantissa: store a plain zero. */
  const float zero_threshold = ldexpf(0.5f, -RGBE_EXP_BIAS - RGBE_MANTISSA_BITS);
  if (largest < zero_threshold) {
    return RGBE(0.0f);
  }

  /* Exponent such that the largest mantissa is below one, limited to the smallest storable
   * exponent, and the matching factor that maps a value to its 8-bit mantissa. */
  int shared_exponent = max(-RGBE_EXP_BIAS - 1, floor_log2f(largest)) + 1;
  float to_mantissa = ldexpf(1.0f, RGBE_MANTISSA_BITS - shared_exponent);

  /* The original implementation by Greg Ward uses `floor`, causing systematic bias when
   * accumulated in a buffer. `round` is used here instead, which can push the largest mantissa
   * up to 2^8; in that case move on to the next exponent. */
  const bool mantissa_overflows = int(roundf(largest * to_mantissa)) ==
                                  power_of_2(RGBE_MANTISSA_BITS);
  if (mantissa_overflows) {
    shared_exponent += 1;
    to_mantissa *= 0.5f;
  }

  /* One sign bit per channel, placed in the three upper bits of the exponent byte. */
  const uint sign_r = __float_as_uint(rgb.x) >> 31;
  const uint sign_g = __float_as_uint(rgb.y) >> 31;
  const uint sign_b = __float_as_uint(rgb.z) >> 31;
  const uint sign_bits = (sign_r << 7) | (sign_g << 6) | (sign_b << 5);

  /* Quantize the magnitudes; the clamp keeps out-of-range channels inside one byte. */
  const float3 mantissa = min(round(magnitude * to_mantissa), make_float3(255.0f));

  RGBE rgbe;
  rgbe.r = uint8_t(mantissa.x);
  rgbe.g = uint8_t(mantissa.y);
  rgbe.b = uint8_t(mantissa.z);
  rgbe.e = uint8_t(((shared_exponent + RGBE_EXP_BIAS) & 0x1Fu) | sign_bits);
  return rgbe;
}
/* Unpack a 4-byte RGBE value into an RGB triple.
 *
 * Each channel is its 8-bit mantissa scaled by the shared exponent, with the sign taken from the
 * three upper bits of the exponent byte. A payload whose bits are all zero (ignoring the top
 * bit) decodes to exactly zero. */
ccl_device_inline float3 rgbe_to_rgb(const RGBE rgbe)
{
  /* Test the raw bits instead of comparing the float view against 0.0f. The float view is just
   * a container for the packed bytes, and payloads with a small exponent code look like
   * subnormal floats. A floating-point comparison would treat those as zero whenever
   * subnormals are flushed to zero (e.g. fast-math / FTZ / DAZ modes), silently dropping valid
   * small values. Masking the top bit keeps the `-0.0f` pattern decoding to zero as before. */
  if ((__float_as_uint(rgbe.f) & 0x7fffffffu) == 0u) {
    return zero_float3();
  }

  /* Lower five bits of the exponent byte hold the biased exponent. */
  const int e = rgbe.e & 0x1Fu;
  const float f = ldexpf(1.0f, e - (int)(RGBE_EXP_BIAS + RGBE_MANTISSA_BITS));
  float3 result = make_float3(rgbe.r, rgbe.g, rgbe.b) * f;

  /* Set sign bits: move each stored sign into bit 31 of the matching channel. */
  result.x = or_mask(result.x, (uint(rgbe.e) & 0x80u) << 24);
  result.y = or_mask(result.y, (uint(rgbe.e) & 0x40u) << 25);
  result.z = or_mask(result.z, (uint(rgbe.e) & 0x20u) << 26);

  return result;
}
CCL_NAMESPACE_END