Realtime Compositor: Implement Vector Blur node

This patch implements the Vector Blur node for the Realtime Compositor. The implementation is a direct and mostly identical port of the EEVEE motion blur implementation with the necessary adjustments to make it work with the compositor. The parameters exposed in the node do not match those exposed in EEVEE, so only the parameters shared between both are currently implemented. In the future, we should make a decision to either unify both, or just consider them independent implementations, with the possibility of sharing all or part of the code. Further, it would also make sense to port the implementation to the CPU compositor, since the new implementation is higher in quality while also being faster. The default value of the node shutter setting was changed to 0.25 to approximately match the default settings of EEVEE and Cycles, since in their default settings, they evaluate the previous and next frames at plus and minus 0.25. Pull Request: https://projects.blender.org/blender/blender/pulls/116977
2024-01-12 12:12:01 +01:00
parent c964b79edf
commit e84dc990b1
7 changed files with 551 additions and 8 deletions
--- a/source/blender/compositor/realtime_compositor/CMakeLists.txt
+++ b/source/blender/compositor/realtime_compositor/CMakeLists.txt
@@ -183,6 +183,8 @@ set(GLSL_SRC
  shaders/compositor_morphological_distance_feather.glsl
  shaders/compositor_morphological_distance_threshold.glsl
  shaders/compositor_morphological_step.glsl
+  shaders/compositor_motion_blur.glsl
+  shaders/compositor_motion_blur_max_velocity_dilate.glsl
  shaders/compositor_movie_distortion.glsl
  shaders/compositor_normalize.glsl
  shaders/compositor_parallel_reduction.glsl
@@ -235,6 +237,7 @@ set(GLSL_SRC
  shaders/library/gpu_shader_compositor_luminance_matte.glsl
  shaders/library/gpu_shader_compositor_main.glsl
  shaders/library/gpu_shader_compositor_map_value.glsl
+  shaders/library/gpu_shader_compositor_motion_blur_lib.glsl
  shaders/library/gpu_shader_compositor_normal.glsl
  shaders/library/gpu_shader_compositor_ocio_processor.glsl
  shaders/library/gpu_shader_compositor_posterize.glsl
@@ -304,6 +307,7 @@ set(SRC_SHADER_CREATE_INFOS
  shaders/infos/compositor_morphological_distance_info.hh
  shaders/infos/compositor_morphological_distance_threshold_info.hh
  shaders/infos/compositor_morphological_step_info.hh
+  shaders/infos/compositor_motion_blur_info.hh
  shaders/infos/compositor_movie_distortion_info.hh
  shaders/infos/compositor_normalize_info.hh
  shaders/infos/compositor_parallel_reduction_info.hh
--- a/source/blender/compositor/realtime_compositor/shaders/compositor_motion_blur.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/compositor_motion_blur.glsl
@@ -0,0 +1,203 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* This is identical to the EEVEE implementation in eevee_motion_blur_gather_comp.glsl with the
+ * necessary adjustments to make it work for the compositor:
+ *
+ *   - depth_compare() uses an inverted sign since the depth texture stores linear depth.
+ *   - The next velocities are inverted since the velocity texture stores the previous and next
+ *     velocities in the same direction.
+ *   - The samples count is a variable uniform and not fixed to 8 samples.
+ *   - The depth scale is constant and set to 100.
+ *   - The motion scale is defined by the shutter_speed. */
+
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_motion_blur_lib.glsl)
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
+
+const float g_depth_scale = 100.0;
+
+/* Per-pixel pseudo-random value in the [0, 1) range, computed using the interleaved gradient
+ * noise function by Jorge Jimenez:
+ * http://www.iryoku.com/next-generation-post-processing-in-call-of-duty-advanced-warfare. */
+float interleaved_gradient_noise(ivec2 p)
+{
+  float gradient = fract(0.06711056 * p.x + 0.00583715 * p.y);
+  return fract(52.9829189 * gradient);
+}
+
+/* Return the spread weights of the center pixel (x) and the sample (y). Each weight is the
+ * amount by which the respective motion length exceeds the offset length, plus one, saturated
+ * to the [0, 1] range. */
+vec2 spread_compare(float center_motion_length, float sample_motion_length, float offset_length)
+{
+  vec2 motion_lengths = vec2(center_motion_length, sample_motion_length);
+  return clamp(motion_lengths - offset_length + 1.0, 0.0, 1.0);
+}
+
+/* Return a pair of complementary depth weights in the [0, 1] range. The x weight grows as the
+ * sample depth exceeds the center depth, while the y weight grows as it falls below it. Both
+ * are 0.5 when the depths are equal. */
+vec2 depth_compare(float center_depth, float sample_depth)
+{
+  float depth_difference = sample_depth - center_depth;
+  vec2 signed_scale = vec2(g_depth_scale, -g_depth_scale);
+  return clamp(0.5 + signed_scale * depth_difference, 0.0, 1.0);
+}
+
+/* Return 0 to kill the contribution of samples that are not moving in the same direction as the
+ * gather offset, and 1 otherwise. */
+float dir_compare(vec2 offset, vec2 sample_motion, float sample_motion_length)
+{
+  /* Samples whose motion length is below 0.5 always contribute, whatever their direction. */
+  bool is_static_sample = sample_motion_length < 0.5;
+  bool is_same_direction = dot(offset, sample_motion) > 0.0;
+  return (is_static_sample || is_same_direction) ? 1.0 : 0.0;
+}
+
+/* Return background (x) and foreground (y) weights, which are the product of the depth
+ * classification weights and the motion spread weights. */
+vec2 sample_weights(float center_depth,
+                    float sample_depth,
+                    float center_motion_length,
+                    float sample_motion_length,
+                    float offset_length)
+{
+  /* Weight if sample is overlapping or under the center pixel. */
+  vec2 spread = spread_compare(center_motion_length, sample_motion_length, offset_length);
+  /* Classify foreground/background. */
+  vec2 depth = depth_compare(center_depth, sample_depth);
+  return depth * spread;
+}
+
+/* Running sums of the gather. Each gathered sample adds its weighted color to both colors and
+ * its weights to the weight vector. */
+struct Accumulator {
+  /** Sum of the sample colors multiplied by their foreground weights. */
+  vec4 fg;
+  /** Sum of the sample colors multiplied by their background weights. */
+  vec4 bg;
+  /** x: Background, y: Foreground, z: dir. */
+  vec3 weight;
+};
+
+/* Fetch the sample at the given pixel offset from the center pixel and add its weighted color
+ * and weights to the accumulator. If next is true, the next velocity of the sample is used,
+ * otherwise the previous one. */
+void gather_sample(vec2 screen_uv,
+                   float center_depth,
+                   float center_motion_len,
+                   vec2 offset,
+                   float offset_len,
+                   const bool next,
+                   inout Accumulator accum)
+{
+  /* The offset is given in pixels, so normalize it before subtracting it from the UV. */
+  vec2 uv = screen_uv - offset / vec2(texture_size(input_tx));
+
+  /* Scale the velocities by the shutter speed, inverting the next velocities (zw). */
+  vec4 velocities = texture(velocity_tx, uv) * vec4(vec2(shutter_speed), vec2(-shutter_speed));
+  vec2 motion = (next) ? velocities.zw : velocities.xy;
+  float motion_len = length(motion);
+
+  float depth = texture(depth_tx, uv).r;
+  vec4 color = texture(input_tx, uv);
+
+  /* The direction weight is stored in z and also masks the background/foreground weights. */
+  float direction_weight = dir_compare(offset, motion, motion_len);
+  vec2 layer_weights = sample_weights(
+      center_depth, depth, center_motion_len, motion_len, offset_len);
+  vec3 weights = vec3(layer_weights * direction_weight, direction_weight);
+
+  accum.bg += color * weights.x;
+  accum.fg += color * weights.y;
+  accum.weight += weights;
+}
+
+/* Gather samples_count samples along the max motion of the tile and, if the center pixel itself
+ * is moving, samples_count more samples along the center motion, adding all of them to the
+ * accumulator. The ofs parameter offsets the starting position of the samples by a fraction of
+ * the distance between two consecutive samples. If next is true, the next velocities of the
+ * samples are used, otherwise the previous ones. */
+void gather_blur(vec2 screen_uv,
+                 vec2 center_motion,
+                 float center_depth,
+                 vec2 max_motion,
+                 float ofs,
+                 const bool next,
+                 inout Accumulator accum)
+{
+  float center_motion_len = length(center_motion);
+  float max_motion_len = length(max_motion);
+
+  /* Tile boundaries randomization can fetch a tile where there is less motion than this pixel.
+   * Fix this by overriding the max_motion. */
+  if (max_motion_len < center_motion_len) {
+    max_motion_len = center_motion_len;
+    max_motion = center_motion;
+  }
+
+  /* Nothing to gather if the largest motion is shorter than 0.5. */
+  if (max_motion_len < 0.5) {
+    return;
+  }
+
+  /* The t parameter walks along the motion vector in steps of 1 / samples_count, starting at the
+   * given fractional offset. */
+  int i;
+  float t, inc = 1.0 / float(samples_count);
+  for (i = 0, t = ofs * inc; i < samples_count; i++, t += inc) {
+    gather_sample(screen_uv,
+                  center_depth,
+                  center_motion_len,
+                  max_motion * t,
+                  max_motion_len * t,
+                  next,
+                  accum);
+  }
+
+  /* Skip the second gather if the center pixel is not moving. */
+  if (center_motion_len < 0.5) {
+    return;
+  }
+
+  for (i = 0, t = ofs * inc; i < samples_count; i++, t += inc) {
+    /* Also sample in center motion direction.
+     * Allow recovering motion where there is conflicting
+     * motion between foreground and background. */
+    gather_sample(screen_uv,
+                  center_depth,
+                  center_motion_len,
+                  center_motion * t,
+                  center_motion_len * t,
+                  next,
+                  accum);
+  }
+}
+
+/* Computes the motion blurred color of each pixel by gathering samples along the previous and
+ * next motion directions, then blending the accumulated foreground and background colors. */
+void main()
+{
+  ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
+  vec2 uv = (vec2(texel) + 0.5) / vec2(texture_size(input_tx));
+
+  /* Data of the center pixel of the gather (target). */
+  float center_depth = texture_load(depth_tx, texel).x;
+  vec4 center_motion = texture(velocity_tx, uv) * vec4(vec2(shutter_speed), vec2(-shutter_speed));
+  vec4 center_color = textureLod(input_tx, uv, 0.0);
+
+  /* Randomize tile boundary to avoid ugly discontinuities. Randomize 1/4th of the tile.
+   * Note this randomize only in one direction but in practice it's enough.
+   * The noise is remapped from [0, 1) to [-1, 1) before it is scaled, so the remapping has to be
+   * parenthesized, otherwise only the constant 1.0 gets scaled by the tile size. */
+  float rand = interleaved_gradient_noise(texel);
+  float tile_jitter = (rand * 2.0 - 1.0) * float(MOTION_BLUR_TILE_SIZE) * 0.25;
+  /* The jitter can push the texel outside of the image, so keep the texel non-negative and the
+   * tile inside the max velocity texture that the indirection table points into. */
+  ivec2 jittered_texel = max(texel + ivec2(tile_jitter), ivec2(0));
+  ivec2 tile = min(jittered_texel / MOTION_BLUR_TILE_SIZE, texture_size(max_velocity_tx) - 1);
+
+  vec4 max_motion;
+  /* Load dilation result from the indirection table. */
+  ivec2 tile_prev;
+  motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_PREV, uvec2(tile), tile_prev);
+  max_motion.xy = texture_load(max_velocity_tx, tile_prev).xy;
+  ivec2 tile_next;
+  motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_NEXT, uvec2(tile), tile_next);
+  max_motion.zw = texture_load(max_velocity_tx, tile_next).zw;
+
+  /* Same scaling as the center motion: shutter speed, with the next velocities inverted. */
+  max_motion *= vec4(vec2(shutter_speed), vec2(-shutter_speed));
+
+  Accumulator accum;
+  accum.weight = vec3(0.0, 0.0, 1.0);
+  accum.bg = vec4(0.0);
+  accum.fg = vec4(0.0);
+  /* First linear gather. time = [T - delta, T] */
+  gather_blur(uv, center_motion.xy, center_depth, max_motion.xy, rand, false, accum);
+  /* Second linear gather. time = [T, T + delta] */
+  gather_blur(uv, center_motion.zw, center_depth, max_motion.zw, rand, true, accum);
+
+#if 1 /* Own addition. Not present in reference implementation. */
+  /* Avoid division by 0.0. */
+  float w = 1.0 / (50.0 * float(samples_count) * 4.0);
+  accum.bg += center_color * w;
+  accum.weight.x += w;
+  /* NOTE: In Jimenez's presentation, they used center sample.
+   * We use background color as it contains more information for foreground
+   * elements that have not enough weights.
+   * Yield better blur in complex motion. */
+  center_color = accum.bg / accum.weight.x;
+#endif
+  /* Merge background. */
+  accum.fg += accum.bg;
+  accum.weight.y += accum.weight.x;
+  /* Balance accumulation for failed samples.
+   * We replace the missing foreground by the background. */
+  float blend_fac = clamp(1.0 - accum.weight.y / accum.weight.z, 0.0, 1.0);
+  vec4 out_color = (accum.fg / accum.weight.z) + center_color * blend_fac;
+
+  imageStore(output_img, texel, out_color);
+}
--- a/source/blender/compositor/realtime_compositor/shaders/compositor_motion_blur_max_velocity_dilate.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/compositor_motion_blur_max_velocity_dilate.glsl
@@ -0,0 +1,118 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* Identical to eevee_motion_blur_dilate_comp.glsl but with minor adjustments to work with the
+ * compositor. */
+
+#pragma BLENDER_REQUIRE(gpu_shader_math_base_lib.glsl)
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_motion_blur_lib.glsl)
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
+
+/* A rectangle of tiles, clamped to the bounds of the input texture. */
+struct MotionRect {
+  /** Tile with the smallest coordinates inside the rectangle. */
+  ivec2 bottom_left;
+  /** Number of tiles covered along each axis, at least one. */
+  ivec2 extent;
+};
+
+/* Compute the rectangle of tiles spanned by the given tile and the tile that the given motion
+ * vector ends in, clamped to the bounds of the input texture. */
+MotionRect compute_motion_rect(ivec2 tile, vec2 motion)
+{
+  /* `ceil()` to number of tile touched. */
+  vec2 touched_tiles = ceil(abs(motion) / float(MOTION_BLUR_TILE_SIZE));
+  ivec2 end_tile = tile + ivec2(sign(motion) * touched_tiles);
+
+  /* Bounding box of the start and end tiles, clamped to bounds. */
+  ivec2 upper = min(max(end_tile, tile), texture_size(input_tx) - 1);
+  ivec2 lower = max(min(end_tile, tile), ivec2(0));
+
+  MotionRect rect;
+  rect.bottom_left = lower;
+  rect.extent = 1 + upper - lower;
+  return rect;
+}
+
+/* The line that a motion vector follows, expressed in tile units. */
+struct MotionLine {
+  /** Origin of the line. */
+  vec2 origin;
+  /** Normal to the line direction. */
+  vec2 normal;
+};
+
+/* Compute the line that starts at the given tile and follows the direction of the given motion
+ * vector. A zero motion vector results in a zero normal. */
+MotionLine compute_motion_line(ivec2 tile, vec2 motion)
+{
+  /* Normalize the motion, guarding against a division by zero for static tiles. */
+  vec2 direction = motion;
+  float motion_length = length(motion);
+  if (motion_length != 0.0) {
+    direction = motion / motion_length;
+  }
+
+  MotionLine line;
+  /* Rotate 90 degrees counter-clockwise. */
+  line.normal = vec2(-direction.y, direction.x);
+  line.origin = vec2(tile);
+  return line;
+}
+
+/* Return true if the given tile is close enough to the motion line to be considered covered by
+ * it. */
+bool is_inside_motion_line(ivec2 tile, MotionLine motion_line)
+{
+  /* NOTE: Everything is in tile units. */
+  vec2 tile_to_origin = motion_line.origin - vec2(tile);
+  float signed_distance = dot(motion_line.normal, tile_to_origin);
+  /* To be conservative and keep things simple, the bounding circles of the tiles are used. Both
+   * the tile and the line are considered to have a bounding radius of M_SQRT1_2, hence the
+   * comparison against their sum. */
+  return abs(signed_distance) < M_SQRT2;
+}
+
+/* Each invocation handles a single tile of the max velocity texture. It rasterizes the lines of
+ * the previous and next max motion vectors of the tile into the indirection table, such that
+ * every tile touched by a line stores the payload with the largest velocity that reaches it. */
+void main()
+{
+  ivec2 src_tile = ivec2(gl_GlobalInvocationID.xy);
+  if (any(greaterThanEqual(src_tile, texture_size(input_tx)))) {
+    return;
+  }
+
+  /* Scale the velocities by the shutter speed, inverting the next velocities (zw). */
+  vec4 max_motion = texture_load(input_tx, src_tile) *
+                    vec4(vec2(shutter_speed), vec2(-shutter_speed));
+
+  /* Both payloads point back to the source tile and only differ in the packed velocity. */
+  MotionPayload payload_prv = motion_blur_tile_indirection_pack_payload(max_motion.xy,
+                                                                        uvec2(src_tile));
+  MotionPayload payload_nxt = motion_blur_tile_indirection_pack_payload(max_motion.zw,
+                                                                        uvec2(src_tile));
+  if (true) {
+    /* Rectangular area (in tiles) where the motion vector spreads. */
+    MotionRect motion_rect = compute_motion_rect(src_tile, max_motion.xy);
+    MotionLine motion_line = compute_motion_line(src_tile, max_motion.xy);
+    /* Do a conservative rasterization of the line of the motion vector line. */
+    for (int x = 0; x < motion_rect.extent.x; x++) {
+      for (int y = 0; y < motion_rect.extent.y; y++) {
+        ivec2 tile = motion_rect.bottom_left + ivec2(x, y);
+        if (is_inside_motion_line(tile, motion_line)) {
+          motion_blur_tile_indirection_store(
+              tile_indirection_buf, MOTION_PREV, uvec2(tile), payload_prv);
+          /* FIXME: This is a bit weird, but for some reason, we need the store the same vector in
+           * the motion next so that weighting in gather pass is better. */
+          motion_blur_tile_indirection_store(
+              tile_indirection_buf, MOTION_NEXT, uvec2(tile), payload_nxt);
+        }
+      }
+    }
+  }
+
+  if (true) {
+    /* Rectangular area (in tiles) where the motion vector spreads. */
+    MotionRect motion_rect = compute_motion_rect(src_tile, max_motion.zw);
+    MotionLine motion_line = compute_motion_line(src_tile, max_motion.zw);
+    /* Do a conservative rasterization of the line of the motion vector line. */
+    for (int x = 0; x < motion_rect.extent.x; x++) {
+      for (int y = 0; y < motion_rect.extent.y; y++) {
+        ivec2 tile = motion_rect.bottom_left + ivec2(x, y);
+        if (is_inside_motion_line(tile, motion_line)) {
+          motion_blur_tile_indirection_store(
+              tile_indirection_buf, MOTION_NEXT, uvec2(tile), payload_nxt);
+          /* FIXME: This is a bit weird, but for some reason, we need the store the same vector in
+           * the motion previous so that weighting in gather pass is better. */
+          motion_blur_tile_indirection_store(
+              tile_indirection_buf, MOTION_PREV, uvec2(tile), payload_prv);
+        }
+      }
+    }
+  }
+}
--- a/source/blender/compositor/realtime_compositor/shaders/infos/compositor_motion_blur_info.hh
+++ b/source/blender/compositor/realtime_compositor/shaders/infos/compositor_motion_blur_info.hh
@@ -0,0 +1,26 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "gpu_shader_create_info.hh"
+
+/* Dilates the max tile velocities read from input_tx along their motion lines. The result is
+ * written to tile_indirection_buf, which is declared as read-write because the shader updates it
+ * through atomic max operations. */
+GPU_SHADER_CREATE_INFO(compositor_motion_blur_max_velocity_dilate)
+    .local_group_size(16, 16)
+    .push_constant(Type::FLOAT, "shutter_speed")
+    .sampler(0, ImageType::FLOAT_2D, "input_tx")
+    .storage_buf(0, Qualifier::READ_WRITE, "uint", "tile_indirection_buf[]")
+    .compute_source("compositor_motion_blur_max_velocity_dilate.glsl")
+    .do_static_compilation(true);
+
+/* Gathers the motion blurred color of input_tx into output_img. The shader samples the depth and
+ * velocity textures, and looks up the max velocity of each tile in max_velocity_tx through the
+ * read-only tile_indirection_buf. */
+GPU_SHADER_CREATE_INFO(compositor_motion_blur)
+    .local_group_size(16, 16)
+    .push_constant(Type::INT, "samples_count")
+    .push_constant(Type::FLOAT, "shutter_speed")
+    .sampler(0, ImageType::FLOAT_2D, "input_tx")
+    .sampler(1, ImageType::FLOAT_2D, "depth_tx")
+    .sampler(2, ImageType::FLOAT_2D, "velocity_tx")
+    .sampler(3, ImageType::FLOAT_2D, "max_velocity_tx")
+    .storage_buf(0, Qualifier::READ, "uint", "tile_indirection_buf[]")
+    .image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
+    .compute_source("compositor_motion_blur.glsl")
+    .do_static_compilation(true);
--- a/source/blender/compositor/realtime_compositor/shaders/infos/compositor_parallel_reduction_info.hh
+++ b/source/blender/compositor/realtime_compositor/shaders/infos/compositor_parallel_reduction_info.hh
@@ -149,3 +149,22 @@ GPU_SHADER_CREATE_INFO(compositor_minimum_float_in_range)
    .define("LOAD(value)", "value.x")
    .define("REDUCE(lhs, rhs)", "((rhs < lhs) && (rhs >= lower_bound)) ? rhs : lhs")
    .do_static_compilation(true);
+
+/* --------------------------------------------------------------------
+ * Velocity Reductions.
+ */
+
+/* Reduces vec4 values that hold two 2D velocities, one in xy and one in zw. Each of the two
+ * velocities is reduced independently by keeping the one with the larger squared length, the
+ * right hand side being kept in case of equal lengths. */
+GPU_SHADER_CREATE_INFO(compositor_max_velocity)
+    .local_group_size(32, 32)
+    .push_constant(Type::BOOL, "is_initial_reduction")
+    .sampler(0, ImageType::FLOAT_2D, "input_tx")
+    .image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
+    .define("TYPE", "vec4")
+    .define("IDENTITY", "vec4(0.0)")
+    .define("INITIALIZE(value)", "value")
+    .define("LOAD(value)", "value")
+    .define("REDUCE(lhs, rhs)",
+            "vec4(dot(lhs.xy, lhs.xy) > dot(rhs.xy, rhs.xy) ? lhs.xy : rhs.xy,"
+            "     dot(lhs.zw, lhs.zw) > dot(rhs.zw, rhs.zw) ? lhs.zw : rhs.zw)")
+    .compute_source("compositor_parallel_reduction.glsl")
+    .do_static_compilation(true);
--- a/source/blender/compositor/realtime_compositor/shaders/library/gpu_shader_compositor_motion_blur_lib.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/library/gpu_shader_compositor_motion_blur_lib.glsl
@@ -0,0 +1,48 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* Identical copy to eevee_motion_blur_lib.glsl, but with the needed macros defined inline. */
+
+#define MOTION_BLUR_TILE_SIZE 32
+#define MOTION_BLUR_MAX_TILE 512 /* 16384 / MOTION_BLUR_TILE_SIZE */
+#define MotionPayload uint
+
+/* Pack the length of the given motion vector and the given tile position into a single payload.
+ * The velocity magnitude is stored in the MSB to be able to use it with atomicMax operations. */
+MotionPayload motion_blur_tile_indirection_pack_payload(vec2 motion, uvec2 payload)
+{
+  /* NOTE: Clamp to 16383 pixel velocity. After that, it is tile position that determine the tile
+   * to dilate over. */
+  uint velocity_bits = min(uint(ceil(length(motion))), 0x3FFFu) << 18u;
+  /* Designed for 512x512 tiles max, so each tile coordinate takes 9 bits. */
+  uint tile_x_bits = (payload.x & 0x1FFu) << 9u;
+  uint tile_y_bits = payload.y & 0x1FFu;
+  return velocity_bits | tile_x_bits | tile_y_bits;
+}
+
+/* Return thread index. Despite its name, this overload unpacks the payload: it extracts the two
+ * 9-bit tile coordinates and discards the velocity bits. */
+ivec2 motion_blur_tile_indirection_pack_payload(uint data)
+{
+  uint tile_x = (data >> 9u) & 0x1FFu;
+  uint tile_y = data & 0x1FFu;
+  return ivec2(tile_x, tile_y);
+}
+
+/* Return the index of the given tile inside the indirection table. The table stores one
+ * MOTION_BLUR_MAX_TILE x MOTION_BLUR_MAX_TILE row-major grid per motion step, one after the
+ * other. */
+uint motion_blur_tile_indirection_index(uint motion_step, uvec2 tile)
+{
+  uint step_offset = motion_step * MOTION_BLUR_MAX_TILE * MOTION_BLUR_MAX_TILE;
+  uint row_offset = tile.y * MOTION_BLUR_MAX_TILE;
+  return tile.x + row_offset + step_offset;
+}
+
+#define MOTION_PREV 0u
+#define MOTION_NEXT 1u
+
+/* Merge the given payload into the table entry of the given tile and motion step using an
+ * atomic max, so the entry ends up holding the largest payload stored into it. */
+#define motion_blur_tile_indirection_store(table_, step_, tile, payload_) \
+  if (true) { \
+    uint index = motion_blur_tile_indirection_index(step_, tile); \
+    atomicMax(table_[index], payload_); \
+  }
+
+/* Read the table entry of the given tile and motion step and write the tile position unpacked
+ * from it to result_. */
+#define motion_blur_tile_indirection_load(table_, step_, tile_, result_) \
+  if (true) { \
+    uint index = motion_blur_tile_indirection_index(step_, tile_); \
+    result_ = motion_blur_tile_indirection_pack_payload(table_[index]); \
+  }
--- a/source/blender/nodes/composite/nodes/node_composite_vec_blur.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_vec_blur.cc
@@ -6,10 +6,21 @@
 * \ingroup cmpnodes
 */

+#include <cstdint>
+
+#include "BLI_math_vector.hh"
+
 #include "UI_interface.hh"
 #include "UI_resources.hh"

+#include "GPU_compute.h"
+#include "GPU_shader.h"
+#include "GPU_storage_buffer.h"
+#include "GPU_vertex_buffer.h"
+
 #include "COM_node_operation.hh"
+#include "COM_result.hh"
+#include "COM_utilities.hh"

 #include "node_composite_util.hh"

@@ -17,15 +28,21 @@

 namespace blender::nodes::node_composite_vec_blur_cc {

+NODE_STORAGE_FUNCS(NodeBlurData)
+
 static void cmp_node_vec_blur_declare(NodeDeclarationBuilder &b)
 {
-  b.add_input<decl::Color>("Image").default_value({1.0f, 1.0f, 1.0f, 1.0f});
-  b.add_input<decl::Float>("Z").default_value(0.0f).min(0.0f).max(1.0f);
+  /* The inputs are assigned compositor domain priorities of 0 for Image, 1 for Speed and 2 for
+   * Z. */
+  b.add_input<decl::Color>("Image")
+      .default_value({1.0f, 1.0f, 1.0f, 1.0f})
+      .compositor_domain_priority(0);
+  b.add_input<decl::Float>("Z").default_value(0.0f).min(0.0f).max(1.0f).compositor_domain_priority(
+      2);
  b.add_input<decl::Vector>("Speed")
      .default_value({0.0f, 0.0f, 0.0f})
      .min(0.0f)
      .max(1.0f)
-      .subtype(PROP_VELOCITY);
+      .subtype(PROP_VELOCITY)
+      .compositor_domain_priority(1);
  b.add_output<decl::Color>("Image");
 }

@@ -35,7 +52,7 @@ static void node_composit_init_vecblur(bNodeTree * /*ntree*/, bNode *node)
  NodeBlurData *nbd = MEM_cnew<NodeBlurData>(__func__);
  node->storage = nbd;
  nbd->samples = 32;
-  nbd->fac = 1.0f;
+  nbd->fac = 0.25f;
 }

 static void node_composit_buts_vecblur(uiLayout *layout, bContext * /*C*/, PointerRNA *ptr)
@@ -62,8 +79,118 @@ class VectorBlurOperation : public NodeOperation {

  void execute() override
  {
-    get_input("Image").pass_through(get_result("Image"));
-    context().set_info_message("Viewport compositor setup not fully supported");
+    /* A single value image has nothing to blur, so pass it through unchanged. */
+    Result &input = get_input("Image");
+    Result &output = get_result("Image");
+    if (input.is_single_value()) {
+      input.pass_through(output);
+      return;
+    }
+
+    /* The blur runs in three passes: reduce the velocities into per-tile maximums, dilate those
+     * maximums into an indirection buffer, then gather the blurred image. The intermediate tile
+     * image and indirection buffer are released once the gather is done. */
+    Result max_tile_velocity = compute_max_tile_velocity();
+    GPUStorageBuf *tile_indirection_buffer = dilate_max_velocity(max_tile_velocity);
+    compute_motion_blur(max_tile_velocity, tile_indirection_buffer);
+    max_tile_velocity.release();
+    GPU_storagebuf_free(tile_indirection_buffer);
+  }
+
+  /* Reduces each 32x32 block of velocity pixels into a single velocity, the one whose magnitude
+   * is largest. The previous and next velocities are reduced independently. The returned result
+   * has one pixel per block and is owned by the caller. */
+  Result compute_max_tile_velocity()
+  {
+    Result &velocity = get_input("Speed");
+    /* One tile per 32x32 block, rounding up to cover partially filled blocks. */
+    const int2 tiles_count = math::divide_ceil(velocity.domain().size, int2(32));
+
+    GPUShader *shader = context().get_shader("compositor_max_velocity");
+    GPU_shader_bind(shader);
+    GPU_shader_uniform_1b(shader, "is_initial_reduction", true);
+    velocity.bind_as_texture(shader, "input_tx");
+
+    Result max_tile_velocity = context().create_temporary_result(ResultType::Color);
+    max_tile_velocity.allocate_texture(Domain(tiles_count));
+    max_tile_velocity.bind_as_image(shader, "output_img");
+
+    /* Dispatch a single work group per tile. */
+    GPU_compute_dispatch(shader, tiles_count.x, tiles_count.y, 1);
+
+    GPU_shader_unbind();
+    velocity.unbind_as_texture();
+    max_tile_velocity.unbind_as_image();
+
+    return max_tile_velocity;
+  }
+
+  /* The max tile velocity image computes the maximum within 32x32 blocks, while the velocity can
+   * in fact extend beyond such a small block. So we dilate the max blocks by taking the maximum
+   * along the path of each of the max velocity tiles. Since the shader uses custom max atomics,
+   * the output will be an indirection buffer that points to a particular tile in the original max
+   * tile velocity image. This is done as a form of performance optimization, see the shader for
+   * more information.
+   *
+   * The returned storage buffer is owned by the caller, who is responsible for freeing it. */
+  GPUStorageBuf *dilate_max_velocity(Result &max_tile_velocity)
+  {
+    GPUShader *shader = context().get_shader("compositor_motion_blur_max_velocity_dilate");
+    GPU_shader_bind(shader);
+
+    GPU_shader_uniform_1f(shader, "shutter_speed", node_storage(bnode()).fac);
+
+    max_tile_velocity.bind_as_texture(shader, "input_tx");
+
+    /* The shader assumes a maximum input size of 16k, and since the max tile velocity image is
+     * composed of blocks of 32, we get 16k / 32 = 512. So the table is 512x512, but we store two
+     * tables for the previous and next velocities, so we double that. */
+    const int size = sizeof(uint32_t) * 512 * 512 * 2;
+    GPUStorageBuf *tile_indirection_buffer = GPU_storagebuf_create_ex(
+        size, nullptr, GPU_USAGE_DEVICE_ONLY, __func__);
+    /* The buffer is created without initial data and the shader only ever updates it through
+     * atomic max operations, so it has to start from zero for the maximums to be meaningful. */
+    GPU_storagebuf_clear_to_zero(tile_indirection_buffer);
+    const int slot = GPU_shader_get_ssbo_binding(shader, "tile_indirection_buf");
+    GPU_storagebuf_bind(tile_indirection_buffer, slot);
+
+    /* The indirection buffer is the only output of the shader, so no output image is bound. */
+    compute_dispatch_threads_at_least(shader, max_tile_velocity.domain().size);
+
+    GPU_shader_unbind();
+    max_tile_velocity.unbind_as_texture();
+    GPU_storagebuf_unbind(tile_indirection_buffer);
+
+    return tile_indirection_buffer;
+  }
+
+  /* Computes the motion blurred image into the Image result, by gathering the input image along
+   * the velocities with the help of the given max tile velocity image and the tile indirection
+   * buffer that points into it. Neither of the two is released or freed here. */
+  void compute_motion_blur(Result &max_tile_velocity, GPUStorageBuf *tile_indirection_buffer)
+  {
+    GPUShader *shader = context().get_shader("compositor_motion_blur");
+    GPU_shader_bind(shader);
+
+    GPU_shader_uniform_1i(shader, "samples_count", node_storage(bnode()).samples);
+    GPU_shader_uniform_1f(shader, "shutter_speed", node_storage(bnode()).fac);
+
+    Result &input = get_input("Image");
+    input.bind_as_texture(shader, "input_tx");
+
+    Result &depth = get_input("Z");
+    depth.bind_as_texture(shader, "depth_tx");
+
+    Result &velocity = get_input("Speed");
+    velocity.bind_as_texture(shader, "velocity_tx");
+
+    max_tile_velocity.bind_as_texture(shader, "max_velocity_tx");
+
+    /* NOTE(review): The buffer was written by a previous compute dispatch. Confirm that a shader
+     * storage memory barrier is issued somewhere before it is read here. */
+    const int slot = GPU_shader_get_ssbo_binding(shader, "tile_indirection_buf");
+    GPU_storagebuf_bind(tile_indirection_buffer, slot);
+
+    Result &output = get_result("Image");
+    const Domain domain = compute_domain();
+    output.allocate_texture(domain);
+    output.bind_as_image(shader, "output_img");
+
+    compute_dispatch_threads_at_least(shader, output.domain().size);
+
+    GPU_shader_unbind();
+    input.unbind_as_texture();
+    depth.unbind_as_texture();
+    velocity.unbind_as_texture();
+    max_tile_velocity.unbind_as_texture();
+    /* Unbind the buffer as well, since the caller frees it right after this method returns. */
+    GPU_storagebuf_unbind(tile_indirection_buffer);
+    output.unbind_as_image();
+  }
 };

@@ -87,8 +214,6 @@ void register_node_type_cmp_vecblur()
  node_type_storage(
      &ntype, "NodeBlurData", node_free_standard_storage, node_copy_standard_storage);
  ntype.get_compositor_operation = file_ns::get_compositor_operation;
-  ntype.realtime_compositor_unsupported_message = N_(
-      "Node not supported in the Viewport compositor");

  nodeRegisterType(&ntype);
 }