Files
test/source/blender/compositor/operations/COM_VectorBlurOperation.cc
2024-04-08 21:57:41 +10:00

503 lines
19 KiB
C++

/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include <cmath>
#include <cstring>
#include <memory>
#include "BLI_array.hh"
#include "BLI_index_range.hh"
#include "BLI_math_base.hh"
#include "BLI_math_vector.h"
#include "BLI_math_vector.hh"
#include "BLI_task.hh"
#include "COM_VectorBlurOperation.h"
/* This is identical to the compositor implementation in compositor_motion_blur_info.hh and its
* related files with the necessary adjustments to make it work for the CPU. */
/* Side length, in pixels, of the square tiles over which velocities are reduced to a single
 * maximum velocity. Tile coordinates are obtained by dividing pixel coordinates by this value. */
#define MOTION_BLUR_TILE_SIZE 32
/* Factor applied to the depth difference between a sample and the center pixel in depth_compare
 * when classifying the sample as foreground or background. */
#define DEPTH_SCALE 100.0f
namespace blender::compositor {
/* Declares the sockets of the operation: a color, a value and a color input, followed by a single
 * color output. The settings start out null, and since update_memory_buffer dereferences them,
 * they have to be assigned before the operation is executed. */
VectorBlurOperation::VectorBlurOperation()
{
  settings_ = nullptr;

  /* Inputs. NOTE(review): presumably image, depth and velocity in that order, matching the
   * *_INPUT_INDEX constants read in update_memory_buffer -- confirm against the header. */
  add_input_socket(DataType::Color);
  add_input_socket(DataType::Value);
  add_input_socket(DataType::Color);

  /* Output. */
  add_output_socket(DataType::Color);
}
/* Picks whichever of the two velocities is longer, comparing squared lengths. When the first is
 * not strictly longer than the second, the second one is returned. */
static float2 max_velocity(const float2 &a, const float2 &b)
{
  const float a_length_squared = math::length_squared(a);
  const float b_length_squared = math::length_squared(b);
  if (a_length_squared > b_length_squared) {
    return a;
  }
  return b;
}
/* Packs a velocity and a texel into a single integer key that can be ordered with a plain integer
 * comparison, identical to motion_blur_tile_indirection_pack_payload. The layout, from the most
 * to the least significant bits, is:
 *
 * - The length of the velocity rounded up to an integer and clamped to 0x3FFF.
 * - The lowest 9 bits of the x coordinate of the texel.
 * - The lowest 9 bits of the y coordinate of the texel. */
static uint32_t velocity_atomic_max_value(const float2 &value, const int2 &texel)
{
  const float length = math::length(value);
  const uint32_t length_bits = math::min(uint32_t(math::ceil(length)), 0x3FFFu);
  const uint32_t x_bits = (texel.x & 0x1FFu) << 9u;
  const uint32_t y_bits = texel.y & 0x1FFu;
  return (length_bits << 18u) | x_bits | y_bits;
}
/* Picks one of the two velocities by comparing the keys computed by velocity_atomic_max_value.
 * Consequently, the velocity with the larger integer length wins, ties are broken by the larger
 * x texel coordinate, then by the larger y texel coordinate. If the keys are equal, the second
 * velocity is returned.
 *
 * The comparison is intentionally approximate for compatibility with the GPU code, which relies
 * on atomic integer operations and thus has limited precision. */
static float2 max_velocity_approximate(const float2 &a,
                                       const float2 &b,
                                       const int2 &a_texel,
                                       const int2 &b_texel)
{
  const uint32_t a_key = velocity_atomic_max_value(a, a_texel);
  const uint32_t b_key = velocity_atomic_max_value(b, b_texel);
  return (b_key < a_key) ? a : b;
}
/* Reduces every tile of MOTION_BLUR_TILE_SIZE x MOTION_BLUR_TILE_SIZE velocity pixels into the
 * single velocity of largest magnitude found in the tile. The previous velocity (xy) and the next
 * velocity (zw) are reduced independently of each other. The returned buffer has one pixel per
 * tile. Tiles that extend past the velocity buffer read clamped pixels. */
static MemoryBuffer compute_max_tile_velocity(MemoryBuffer *velocity_buffer)
{
  const int2 tile_size = int2(MOTION_BLUR_TILE_SIZE);
  const int2 velocity_size = int2(velocity_buffer->get_width(), velocity_buffer->get_height());
  const int2 tiles_count = math::divide_ceil(velocity_size, tile_size);
  MemoryBuffer output(DataType::Color, tiles_count.x, tiles_count.y);

  /* Every tile writes its own output pixel only, so rows of tiles are processed in parallel. */
  threading::parallel_for(IndexRange(tiles_count.y), 1, [&](const IndexRange tile_rows) {
    for (const int64_t tile_y : tile_rows) {
      for (const int64_t tile_x : IndexRange(tiles_count.x)) {
        const int2 tile = int2(tile_x, tile_y);
        float2 largest_previous = float2(0.0f);
        float2 largest_next = float2(0.0f);

        for (int j = 0; j < tile_size.y; j++) {
          for (int i = 0; i < tile_size.x; i++) {
            const int2 sub_texel = tile * tile_size + int2(i, j);
            const float4 velocity = velocity_buffer->get_elem_clamped(sub_texel.x, sub_texel.y);
            largest_previous = max_velocity(velocity.xy(), largest_previous);
            largest_next = max_velocity(velocity.zw(), largest_next);
          }
        }

        const float4 tile_velocity = float4(largest_previous, largest_next);
        copy_v4_v4(output.get_elem(tile.x, tile.y), tile_velocity);
      }
    }
  });

  return output;
}
/* Axis aligned rectangle of tiles, as returned by compute_motion_rect. */
struct MotionRect {
/* Tile of the rectangle with the smallest x and y coordinates. */
int2 bottom_left;
/* Number of tiles that the rectangle covers along each axis. */
int2 extent;
};
/* Computes the rectangle of tiles spanned by a motion vector that starts at the given tile.
 *
 * - tile: The tile that the motion starts from.
 * - motion: The motion vector in pixels.
 * - size: The number of tiles along each axis, the rectangle is clamped to it. */
static MotionRect compute_motion_rect(int2 tile, float2 motion, int2 size)
{
  /* Round up to the number of tiles touched along each axis. */
  const float2 tiles_touched = math::ceil(math::abs(motion) / float(MOTION_BLUR_TILE_SIZE));
  const int2 end_tile = tile + int2(math::sign(motion) * tiles_touched);

  /* Order the corners, then clamp them to the bounds of the tiles grid. */
  const int2 upper = math::min(math::max(end_tile, tile), size - 1);
  const int2 lower = math::max(math::min(end_tile, tile), int2(0));

  MotionRect rect;
  rect.bottom_left = lower;
  rect.extent = 1 + upper - lower;
  return rect;
}
/* Line along a motion direction, as returned by compute_motion_line. It is tested against tiles
 * by is_inside_motion_line. */
struct MotionLine {
/** Origin of the line. Set to the coordinates of the tile that the motion starts from. */
float2 origin;
/** Normal to the line direction. Set to the normalized motion rotated 90 degrees
 * counter-clockwise, or to the unmodified rotated motion if the motion has zero length. */
float2 normal;
};
/* Builds the line that passes through the given tile along the direction of the given motion. A
 * motion of zero length is not normalized and thus yields a zero normal. */
static MotionLine compute_motion_line(int2 tile, float2 motion)
{
  float2 direction = motion;
  const float magnitude = math::length(motion);
  if (magnitude != 0.0f) {
    direction = motion / magnitude;
  }

  MotionLine line;
  line.origin = float2(tile);
  /* The direction rotated 90 degrees counter-clockwise. */
  line.normal = float2(-direction.y, direction.x);
  return line;
}
/* Returns true if the given tile is close enough to the motion line to be considered covered by
 * it. All quantities are expressed in units of tiles. */
static bool is_inside_motion_line(int2 tile, MotionLine motion_line)
{
  const float2 tile_to_origin = motion_line.origin - float2(tile);
  const float signed_distance = math::dot(motion_line.normal, tile_to_origin);

  /* To stay conservative and simple, bounding circles are used. Both the tile and the line are
   * given a bounding radius of M_SQRT1_2, so the threshold is their sum. */
  const float combined_radius = math::numbers::sqrt2_v<float>;
  return math::abs(signed_distance) < combined_radius;
}
/* The max tile velocity image holds the maximum within each tile, while a velocity can in fact
 * reach well beyond its own tile. So the maximum velocities are dilated by spreading the velocity
 * of every tile to the tiles that lie along its previous and next motion.
 *
 * The velocities are multiplied by the shutter speed, and the next velocity is additionally
 * inverted, before being spread. The output stores those scaled velocities directly, one pixel
 * per tile, with tiles that no motion reaches left at zero. Competing velocities are resolved
 * using max_velocity_approximate.
 *
 * Tiles are processed sequentially since each tile may write to any number of other tiles. */
static MemoryBuffer dilate_max_velocity(MemoryBuffer &max_tile_velocity, float shutter_speed)
{
  const int2 size = int2(max_tile_velocity.get_width(), max_tile_velocity.get_height());
  MemoryBuffer output(DataType::Color, size.x, size.y);
  const float4 zero_value = float4(0.0f);
  output.fill(output.get_rect(), zero_value);

  const float4 velocity_scale = float4(float2(shutter_speed), float2(-shutter_speed));

  for (const int64_t y : IndexRange(size.y)) {
    for (const int64_t x : IndexRange(size.x)) {
      const int2 src_tile = int2(x, y);
      const float4 max_motion = float4(max_tile_velocity.get_elem(x, y)) * velocity_scale;

      /* Spreads both velocities of the source tile to all the tiles that lie along the given
       * direction. The previous and next halves of a pixel are updated independently. */
      auto spread_along = [&](const float2 direction) {
        /* Rectangular area (in tiles) where the motion vector spreads. */
        const MotionRect motion_rect = compute_motion_rect(src_tile, direction, size);
        const MotionLine motion_line = compute_motion_line(src_tile, direction);

        /* Do a conservative rasterization of the line of the motion vector. */
        for (int j = 0; j < motion_rect.extent.y; j++) {
          for (int i = 0; i < motion_rect.extent.x; i++) {
            const int2 tile = motion_rect.bottom_left + int2(i, j);
            if (!is_inside_motion_line(tile, motion_line)) {
              continue;
            }

            float *pixel = output.get_elem(tile.x, tile.y);
            copy_v2_v2(pixel, max_velocity_approximate(pixel, max_motion.xy(), tile, src_tile));
            copy_v2_v2(pixel + 2,
                       max_velocity_approximate(pixel + 2, max_motion.zw(), tile, src_tile));
          }
        }
      };

      /* Along the previous motion, then along the next motion. */
      spread_along(max_motion.xy());
      spread_along(max_motion.zw());
    }
  }

  return output;
}
/* Interleaved gradient noise by Jorge Jimenez, see:
 * http://www.iryoku.com/next-generation-post-processing-in-call-of-duty-advanced-warfare.
 *
 * Returns a deterministic noise value computed from the given pixel coordinates alone. */
static float interleaved_gradient_noise(int2 p)
{
  const float gradient = math::fract(0.06711056f * p.x + 0.00583715f * p.y);
  return math::fract(52.9829189f * gradient);
}
/* Computes by how much the motion of the center pixel (x) and the motion of the sample (y) reach
 * the distance that separates them. Each component is the motion length minus the offset length
 * plus one, clamped to the [0, 1] range. */
static float2 spread_compare(float center_motion_length,
                             float sample_motion_length,
                             float offset_length)
{
  const float2 motion_lengths = float2(center_motion_length, sample_motion_length);
  const float2 reach = motion_lengths - offset_length + 1.0f;
  return math::clamp(reach, 0.0f, 1.0f);
}
/* Classifies a sample against the center pixel based on their depths. The x component grows
 * towards one as the sample depth exceeds the center depth, while the y component grows towards
 * one as the sample depth falls below it. Both are 0.5 for equal depths and are clamped to the
 * [0, 1] range. sample_weights uses x as the background weight and y as the foreground weight. */
static float2 depth_compare(float center_depth, float sample_depth)
{
  const float depth_difference = sample_depth - center_depth;
  const float2 signed_scale = float2(DEPTH_SCALE, -DEPTH_SCALE);
  const float2 classification = 0.5f + signed_scale * depth_difference;
  return math::clamp(classification, 0.0f, 1.0f);
}
/* Returns zero to kill the contribution of samples that do not move in the direction of the
 * offset, and one otherwise. Samples whose motion is shorter than half a pixel are always kept. */
static float dir_compare(float2 offset, float2 sample_motion, float sample_motion_length)
{
  const bool keep_sample = sample_motion_length < 0.5f ||
                           math::dot(offset, sample_motion) > 0.0f;
  if (keep_sample) {
    return 1.0f;
  }
  return 0.0f;
}
/* Returns the background (x) and foreground (y) weights of a sample. The weights are the product
 * of the following two terms:
 *
 * - The foreground/background classification computed by depth_compare.
 * - The spread computed by spread_compare, which weights the sample by whether it overlaps or is
 *   under the center pixel. */
static float2 sample_weights(float center_depth,
                             float sample_depth,
                             float center_motion_length,
                             float sample_motion_length,
                             float offset_length)
{
  return depth_compare(center_depth, sample_depth) *
         spread_compare(center_motion_length, sample_motion_length, offset_length);
}
/* Running sums of the gather. gather_sample adds one sample to it at a time, and motion_blur
 * resolves it into the final color. */
struct Accumulator {
/* Sum of the sample colors multiplied by their foreground weight. */
float4 fg;
/* Sum of the sample colors multiplied by their background weight. */
float4 bg;
/** x: Background, y: Foreground, z: dir. */
float3 weight;
};
/* Takes a single sample at the given offset from the center pixel and adds it to the accumulator.
 *
 * - image_buffer, depth_buffer, velocity_buffer: Buffers that are bilinearly sampled at the
 *   sample location.
 * - size: Size of the image in pixels, used to convert the offset to normalized coordinates.
 * - screen_uv: Normalized coordinates of the center pixel.
 * - center_depth, center_motion_len: Depth and motion length of the center pixel.
 * - offset, offset_len: Offset of the sample from the center pixel, in pixels, and its length.
 * - next: Use the next velocity (zw) of the sample if true, the previous velocity (xy) otherwise.
 * - shutter_speed: Scale of the velocities, negated for the next velocity.
 * - accum: Accumulator that receives the weighted sample. */
static void gather_sample(MemoryBuffer *image_buffer,
                          MemoryBuffer *depth_buffer,
                          MemoryBuffer *velocity_buffer,
                          int2 size,
                          float2 screen_uv,
                          float center_depth,
                          float center_motion_len,
                          float2 offset,
                          float offset_len,
                          const bool next,
                          float shutter_speed,
                          Accumulator &accum)
{
  const float2 sample_uv = screen_uv - offset / float2(size);

  /* Motion of the sample in the requested time direction. */
  const float4 velocity_scale = float4(float2(shutter_speed), float2(-shutter_speed));
  const float4 sample_vectors = velocity_buffer->texture_bilinear_extend(sample_uv) *
                                velocity_scale;
  const float2 sample_motion = next ? sample_vectors.zw() : sample_vectors.xy();
  const float sample_motion_len = math::length(sample_motion);

  const float sample_depth = depth_buffer->texture_bilinear_extend(sample_uv).x;
  const float4 sample_color = image_buffer->texture_bilinear_extend(sample_uv);

  /* The direction weight is accumulated as is, and also masks the other two weights. */
  const float direction_weight = dir_compare(offset, sample_motion, sample_motion_len);
  const float2 direct_weights = sample_weights(
      center_depth, sample_depth, center_motion_len, sample_motion_len, offset_len);
  const float3 weights = float3(
      direct_weights.x * direction_weight, direct_weights.y * direction_weight, direction_weight);

  accum.bg += sample_color * weights.x;
  accum.fg += sample_color * weights.y;
  accum.weight += weights;
}
/* Gathers samples for one time direction into the accumulator. Samples are first taken along the
 * maximum motion of the tile, then along the motion of the center pixel itself.
 *
 * - center_motion, center_depth: Motion and depth of the center pixel.
 * - max_motion: Maximum motion of the tile that the center pixel was assigned to.
 * - ofs: Offset of the first sample, expressed as a fraction of the step between samples.
 * - next: Gather along the next velocities if true, along the previous velocities otherwise.
 * - samples_count: Number of samples taken along each of the two directions.
 *
 * The remaining parameters are forwarded to gather_sample unchanged. */
static void gather_blur(MemoryBuffer *image_buffer,
                        MemoryBuffer *depth_buffer,
                        MemoryBuffer *velocity_buffer,
                        int2 size,
                        float2 screen_uv,
                        float2 center_motion,
                        float center_depth,
                        float2 max_motion,
                        float ofs,
                        const bool next,
                        int samples_count,
                        float shutter_speed,
                        Accumulator &accum)
{
  const float center_motion_len = math::length(center_motion);
  float max_motion_len = math::length(max_motion);

  /* Tile boundaries randomization can fetch a tile where there is less motion than this pixel.
   * Fix this by overriding the max_motion. */
  if (max_motion_len < center_motion_len) {
    max_motion = center_motion;
    max_motion_len = center_motion_len;
  }

  /* Less than half a pixel of motion, nothing to gather. */
  if (max_motion_len < 0.5f) {
    return;
  }

  const float inc = 1.0f / float(samples_count);

  /* Takes samples_count samples along the given motion, the first at a fraction (ofs * inc) of
   * the motion and every following one a further fraction (inc) along it. */
  auto gather_along = [&](const float2 motion, const float motion_len) {
    float t = ofs * inc;
    for (int i = 0; i < samples_count; i++) {
      gather_sample(image_buffer,
                    depth_buffer,
                    velocity_buffer,
                    size,
                    screen_uv,
                    center_depth,
                    center_motion_len,
                    motion * t,
                    motion_len * t,
                    next,
                    shutter_speed,
                    accum);
      t += inc;
    }
  };

  gather_along(max_motion, max_motion_len);

  if (center_motion_len < 0.5f) {
    return;
  }

  /* Also sample in center motion direction. Allow recovering motion where there is conflicting
   * motion between foreground and background. */
  gather_along(center_motion, center_motion_len);
}
/* Computes the motion blurred image by gathering, for every pixel, samples of the image along
 * its previous and next motion, weighted by depth and motion overlap.
 *
 * - image_buffer, depth_buffer, velocity_buffer: Inputs, read at every pixel of the image. The
 *   velocity stores the previous motion in xy and the next motion in zw.
 * - max_velocity_buffer: Per tile maximum velocities as computed by dilate_max_velocity, which
 *   are already multiplied by the shutter speed with the next velocity inverted.
 * - output: Receives one color for every pixel of the image.
 * - samples_count: Number of samples taken along each gather direction.
 * - shutter_speed: Scale of the velocities, negated for the next velocity. */
static void motion_blur(MemoryBuffer *image_buffer,
                        MemoryBuffer *depth_buffer,
                        MemoryBuffer *velocity_buffer,
                        MemoryBuffer *max_velocity_buffer,
                        MemoryBuffer *output,
                        int samples_count,
                        float shutter_speed)
{
  const int2 size = int2(image_buffer->get_width(), image_buffer->get_height());
  /* Number of tiles, used to keep the randomized tile lookups inside the max velocity buffer. */
  const int2 tiles_count = int2(max_velocity_buffer->get_width(),
                                max_velocity_buffer->get_height());

  threading::parallel_for(IndexRange(size.y), 1, [&](const IndexRange sub_y_range) {
    for (const int64_t y : sub_y_range) {
      for (const int64_t x : IndexRange(size.x)) {
        const int2 texel = int2(x, y);
        float2 uv = (float2(texel) + 0.5f) / float2(size);

        /* Data of the center pixel of the gather (target). */
        float center_depth = *depth_buffer->get_elem(x, y);
        float4 center_motion = float4(velocity_buffer->get_elem(x, y)) *
                               float4(float2(shutter_speed), float2(-shutter_speed));
        float4 center_color = image_buffer->get_elem(x, y);

        /* Randomize tile boundary to avoid ugly discontinuities. Randomize 1/4th of the tile.
         * Note this randomize only in one direction but in practice it's enough.
         *
         * The noise is remapped to the [-1, 1] range before being scaled to a quarter of a tile.
         * Without the parentheses, the expression evaluates to (2 * rand - 8) due to operator
         * precedence, which is a constant shift by roughly a quarter of a tile and not a
         * randomization of the boundary.
         *
         * NOTE(review): if the GPU shader carries the same unparenthesized expression, it needs
         * the same change for both implementations to keep matching. */
        float rand = interleaved_gradient_noise(texel);
        const float tile_jitter = (rand * 2.0f - 1.0f) * float(MOTION_BLUR_TILE_SIZE) * 0.25f;
        int2 tile = (texel + int2(tile_jitter)) / MOTION_BLUR_TILE_SIZE;
        /* The jitter can push pixels near the upper borders past the last tile, so clamp. */
        tile = math::max(math::min(tile, tiles_count - 1), int2(0));

        /* No need to multiply by the shutter speed and invert the next velocities since this was
         * already done in dilate_max_velocity. */
        float4 max_motion = max_velocity_buffer->get_elem(tile.x, tile.y);

        /* The direction weight starts at one, so it can be safely divided by below. */
        Accumulator accum;
        accum.weight = float3(0.0f, 0.0f, 1.0f);
        accum.bg = float4(0.0f);
        accum.fg = float4(0.0f);

        /* First linear gather. time = [T - delta, T] */
        gather_blur(image_buffer,
                    depth_buffer,
                    velocity_buffer,
                    size,
                    uv,
                    center_motion.xy(),
                    center_depth,
                    max_motion.xy(),
                    rand,
                    false,
                    samples_count,
                    shutter_speed,
                    accum);

        /* Second linear gather. time = [T, T + delta] */
        gather_blur(image_buffer,
                    depth_buffer,
                    velocity_buffer,
                    size,
                    uv,
                    center_motion.zw(),
                    center_depth,
                    max_motion.zw(),
                    rand,
                    true,
                    samples_count,
                    shutter_speed,
                    accum);

#if 1 /* Own addition. Not present in reference implementation. */
        /* Avoid division by 0.0. */
        float w = 1.0f / (50.0f * float(samples_count) * 4.0f);
        accum.bg += center_color * w;
        accum.weight.x += w;
        /* NOTE: In Jimenez's presentation, they used center sample.
         * We use background color as it contains more information for foreground
         * elements that have not enough weights.
         * Yield better blur in complex motion. */
        center_color = accum.bg / accum.weight.x;
#endif

        /* Merge background. */
        accum.fg += accum.bg;
        accum.weight.y += accum.weight.x;

        /* Balance accumulation for failed samples.
         * We replace the missing foreground by the background. */
        float blend_fac = math::clamp(1.0f - accum.weight.y / accum.weight.z, 0.0f, 1.0f);
        float4 out_color = (accum.fg / accum.weight.z) + center_color * blend_fac;

        copy_v4_v4(output->get_elem(x, y), out_color);
      }
    }
  });
}
/* Computes the vector blur of the image input into the output buffer.
 *
 * - output: Buffer that receives the blurred image.
 * - area: Unused, the whole image is always computed.
 * - inputs: Input buffers, indexed by IMAGE_INPUT_INDEX, DEPTH_INPUT_INDEX and
 *   VELOCITY_INPUT_INDEX.
 *
 * The settings are dereferenced, so they must have been assigned beforehand. */
void VectorBlurOperation::update_memory_buffer(MemoryBuffer *output,
                                               const rcti & /*area*/,
                                               Span<MemoryBuffer *> inputs)
{
  /* Owners of the inflated copies of the inputs, if any. They free the copies once the function
   * is left, including when it is left through an exception. */
  std::unique_ptr<MemoryBuffer> inflated_image;
  std::unique_ptr<MemoryBuffer> inflated_depth;
  std::unique_ptr<MemoryBuffer> inflated_velocity;

  /* Returns the input itself, unless it is a single element buffer, in which case an inflated
   * copy is created, stored in the given owner and returned instead. */
  const auto ensure_full_buffer = [](MemoryBuffer *input,
                                     std::unique_ptr<MemoryBuffer> &r_inflated) -> MemoryBuffer * {
    if (!input->is_a_single_elem()) {
      return input;
    }
    r_inflated.reset(input->inflate());
    return r_inflated.get();
  };

  MemoryBuffer *image_buffer = ensure_full_buffer(inputs[IMAGE_INPUT_INDEX], inflated_image);
  MemoryBuffer *depth_buffer = ensure_full_buffer(inputs[DEPTH_INPUT_INDEX], inflated_depth);
  MemoryBuffer *velocity_buffer = ensure_full_buffer(inputs[VELOCITY_INPUT_INDEX],
                                                     inflated_velocity);

  /* Reduce the velocities into per tile maximums, then spread those along their motion. */
  MemoryBuffer max_tile_velocity = compute_max_tile_velocity(velocity_buffer);
  MemoryBuffer max_velocity = dilate_max_velocity(max_tile_velocity, settings_->fac);

  motion_blur(image_buffer,
              depth_buffer,
              velocity_buffer,
              &max_velocity,
              output,
              settings_->samples,
              settings_->fac);
}
void VectorBlurOperation::get_area_of_interest(const int /*input_idx*/,
const rcti & /*output_area*/,
rcti &r_input_area)
{
r_input_area = this->get_canvas();
}
} // namespace blender::compositor