Compositor: Implement Dilate node for new CPU compositor

Reference #125968.
2024-11-27 19:25:50 +02:00
parent b43c36e801
commit 0efb0ce48e
1 changed files with 218 additions and 26 deletions
--- a/source/blender/nodes/composite/nodes/node_composite_dilate.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_dilate.cc
@@ -6,6 +6,8 @@
 * \ingroup cmpnodes
 */

+#include <limits>
+
 #include "BLI_assert.h"
 #include "BLI_math_base.hh"
 #include "BLI_math_vector_types.hh"
@@ -68,17 +70,6 @@ class DilateErodeOperation : public NodeOperation {

  void execute() override
  {
-    /* Not yet supported on CPU. */
-    if (!context().use_gpu()) {
-      for (const bNodeSocket *output : this->node()->output_sockets()) {
-        Result &output_result = get_result(output->identifier);
-        if (output_result.should_compute()) {
-          output_result.allocate_invalid();
-        }
-      }
-      return;
-    }
-
    if (is_identity()) {
      get_input("Mask").pass_through(get_result("Mask"));
      return;
@@ -115,6 +106,14 @@ class DilateErodeOperation : public NodeOperation {
  }

  Result execute_step_horizontal_pass()
+  {
+    if (this->context().use_gpu()) {
+      return this->execute_step_horizontal_pass_gpu();
+    }
+    return this->execute_step_horizontal_pass_cpu();
+  }
+
+  Result execute_step_horizontal_pass_gpu()
  {
    GPUShader *shader = context().get_shader(get_morphological_step_shader_name());
    GPU_shader_bind(shader);
@@ -149,7 +148,40 @@ class DilateErodeOperation : public NodeOperation {
    return horizontal_pass_result;
  }

+  Result execute_step_horizontal_pass_cpu()
+  {
+    const Result &input = get_input("Mask");
+
+    /* We allocate an output image of a transposed size, that is, with a height equivalent to the
+     * width of the input and vice versa. This is done as a performance optimization. The pass
+     * will process the image horizontally and write it to the intermediate output transposed. Then
+     * the vertical pass will execute the same horizontal pass code, but since its input is
+     * transposed, it will effectively do a vertical pass and write to the output transposed,
+     * effectively undoing the transposition in the horizontal pass. This is done to improve
+     * spatial cache locality in the pass and to avoid having two separate implementations for
+     * each of the passes. */
+    const Domain domain = compute_domain();
+    const int2 transposed_domain = int2(domain.size.y, domain.size.x);
+
+    Result horizontal_pass_result = context().create_result(ResultType::Color);
+    horizontal_pass_result.allocate_texture(transposed_domain);
+
+    this->execute_step_pass_cpu(input, horizontal_pass_result);
+
+    return horizontal_pass_result;
+  }
+
  void execute_step_vertical_pass(Result &horizontal_pass_result)
+  {
+    if (this->context().use_gpu()) {
+      this->execute_step_vertical_pass_gpu(horizontal_pass_result);
+    }
+    else {
+      this->execute_step_vertical_pass_cpu(horizontal_pass_result);
+    }
+  }
+
+  void execute_step_vertical_pass_gpu(Result &horizontal_pass_result)
  {
    GPUShader *shader = context().get_shader(get_morphological_step_shader_name());
    GPU_shader_bind(shader);
@@ -173,6 +205,55 @@ class DilateErodeOperation : public NodeOperation {
    output_mask.unbind_as_image();
  }

+  void execute_step_vertical_pass_cpu(Result &horizontal_pass_result)
+  {
+    const Domain domain = compute_domain();
+    Result &output_mask = get_result("Mask");
+    output_mask.allocate_texture(domain);
+
+    this->execute_step_pass_cpu(horizontal_pass_result, output_mask);
+  }
+
+  void execute_step_pass_cpu(const Result &input, Result &output)
+  {
+    /* We have specialized code for each sign, so use the absolute value. */
+    const int radius = math::abs(this->get_distance());
+
+    /* Notice that the size is transposed, see the note on the horizontal pass method for more
+     * information on the reasoning behind this. */
+    const int2 size = int2(output.domain().size.y, output.domain().size.x);
+    if (this->get_distance() > 0) {
+      parallel_for(size, [&](const int2 texel) {
+        /* Find the maximum value in the window of the given radius around the pixel. This
+         * is essentially a morphological dilate operator with a square structuring element. */
+        const float limit = std::numeric_limits<float>::lowest();
+        float value = limit;
+        for (int i = -radius; i <= radius; i++) {
+          value = math::max(value, input.load_pixel_fallback(texel + int2(i, 0), float4(limit)).x);
+        }
+
+        /* Write the value using the transposed texel. See the horizontal pass method
+         * for more information on the rationale behind this. */
+        output.store_pixel(int2(texel.y, texel.x), float4(value));
+      });
+    }
+    else {
+      parallel_for(size, [&](const int2 texel) {
+        /* Find the minimum value in the window of the given radius around the pixel. This
+         * is essentially a morphological erode operator with a square structuring element. */
+        const float limit = std::numeric_limits<float>::max();
+        float value = limit;
+        for (int i = -radius; i <= radius; i++) {
+          value = math::min(value, input.load_pixel_fallback(texel + int2(i, 0), float4(limit)).x);
+        }
+
+        /* Write the value using the transposed texel. See the horizontal pass method
+         * for more information on the rationale behind this. */
+        output.store_pixel(int2(texel.y, texel.x), float4(value));
+      });
+    }
+  }
+
  const char *get_morphological_step_shader_name()
  {
    if (get_distance() > 0) {
@@ -195,6 +276,29 @@ class DilateErodeOperation : public NodeOperation {
   * ------------------------------------------ */

  void execute_distance_threshold()
+  {
+    Result output_mask = context().create_result(ResultType::Float);
+
+    if (this->context().use_gpu()) {
+      this->execute_distance_threshold_gpu(output_mask);
+    }
+    else {
+      this->execute_distance_threshold_cpu(output_mask);
+    }
+
+    /* For configurations where there is little user-specified inset, anti-alias the result for
+     * smoother edges. */
+    Result &output = this->get_result("Mask");
+    if (this->get_inset() < 2.0f) {
+      smaa(this->context(), output_mask, output);
+      output_mask.release();
+    }
+    else {
+      output.steal_data(output_mask);
+    }
+  }
+
+  void execute_distance_threshold_gpu(Result &output)
  {
    GPUShader *shader = context().get_shader("compositor_morphological_distance_threshold");
    GPU_shader_bind(shader);
@@ -207,26 +311,114 @@ class DilateErodeOperation : public NodeOperation {
    input_mask.bind_as_texture(shader, "input_tx");

    const Domain domain = compute_domain();
-    Result output_mask = context().create_result(ResultType::Float);
-    output_mask.allocate_texture(domain);
-    output_mask.bind_as_image(shader, "output_img");
+    output.allocate_texture(domain);
+    output.bind_as_image(shader, "output_img");

    compute_dispatch_threads_at_least(shader, domain.size);

    GPU_shader_unbind();
-    output_mask.unbind_as_image();
+    output.unbind_as_image();
    input_mask.unbind_as_texture();
+  }

-    /* For configurations where there is little user-specified inset, anti-alias the result for
-     * smoother edges. */
-    Result &output = get_result("Mask");
-    if (get_inset() < 2.0f) {
-      smaa(context(), output_mask, output);
-      output_mask.release();
-    }
-    else {
-      output.steal_data(output_mask);
-    }
+  void execute_distance_threshold_cpu(Result &output)
+  {
+    const Result &input = get_input("Mask");
+
+    const Domain domain = compute_domain();
+    output.allocate_texture(domain);
+
+    const float inset = math::max(this->get_inset(), 10e-6f);
+    const int radius = this->get_morphological_distance_threshold_radius();
+    const int distance = this->get_distance();
+
+    /* The Morphological Distance Threshold operation is effectively three consecutive operations
+     * implemented as a single operation. The three operations are as follows:
+     *
+     * .-----------.   .--------------.   .----------------.
+     * | Threshold |-->| Dilate/Erode |-->| Distance Inset |
+     * '-----------'   '--------------'   '----------------'
+     *
+     * The threshold operation just converts the input into a binary image, where the pixel is 1 if
+     * it is larger than 0.5 and 0 otherwise. Pixels that are 1 in the output of the threshold
+     * operation are said to be masked. The dilate/erode operation is a dilate or erode
+     * morphological operation with a circular structuring element depending on the sign of the
+     * distance, where it is a dilate operation if the distance is positive and an erode operation
+     * otherwise. This is equivalent to the Morphological Distance operation, see its
+     * implementation for more information. Finally, the distance inset is an operation that
+     * converts the binary image into a narrow band distance field. That is, pixels that are
+     * unmasked will remain 0, while pixels that are masked will start from zero at the boundary of
+     * the masked region and linearly increase until reaching 1 in the span of a number of pixels
+     * given by the inset value.
+     *
+     * As a performance optimization, the dilate/erode operation is omitted and its effective
+     * result is achieved by slightly adjusting the distance inset operation. The base distance
+     * inset operation works by computing the signed distance from the current center pixel to the
+     * nearest pixel with a different value. Since our image is a binary image, that means that if
+     * the pixel is masked, we compute the signed distance to the nearest unmasked pixel, and if
+     * it is unmasked, we compute the signed distance to the nearest masked pixel. The distance
+     * is positive if the pixel is masked and negative otherwise. The distance is then normalized
+     * by dividing by the given inset value and clamped to the [0, 1] range. Since distances larger
+     * than the inset value are eventually clamped, the distance search window is limited to a
+     * radius equivalent to the inset value.
+     *
+     * To achieve the effective result of the omitted dilate/erode operation, we adjust the
+     * distance inset operation as follows. First, we increase the radius of the distance search
+     * window by the radius of the dilate/erode operation. Then we adjust the resulting narrow band
+     * signed distance field as follows.
+     *
+     * For the erode case, we merely subtract the erode distance, which makes the outermost erode
+     * distance number of pixels zero due to clamping, consequently achieving the result of the
+     * erode, while retaining the needed inset because we increased the distance search window by
+     * the same amount we subtracted.
+     *
+     * Similarly, for the dilate case, we add the dilate distance, which makes the dilate distance
+     * number of pixels just outside of the masked region positive and part of the narrow band
+     * distance field, consequently achieving the result of the dilate, while at the same time, the
+     * innermost dilate distance number of pixels become 1 due to clamping, retaining the needed
+     * inset because we increased the distance search window by the same amount we added.
+     *
+     * Since the erode/dilate distance is already signed appropriately as described before, we just
+     * add it in both cases. */
+    parallel_for(domain.size, [&](const int2 texel) {
+      /* Apply a threshold operation on the center pixel, where the threshold is currently
+       * hard-coded at 0.5. The pixels with values larger than the threshold are said to be masked.
+       */
+      bool is_center_masked = input.load_pixel(texel).x > 0.5f;
+
+      /* Since the distance search window will access pixels outside of the bounds of the image, we
+       * use a texture loader with a fallback value. And since we don't want those values to affect
+       * the result, the fallback value is chosen such that the inner condition fails, which is
+       * when the sampled pixel and the center pixel are the same, so choose a fallback that will
+       * be considered masked if the center pixel is masked and unmasked otherwise. */
+      float4 fallback = float4(is_center_masked ? 1.0f : 0.0f);
+
+      /* Since the distance search window is limited to the given radius, the maximum possible
+       * squared distance to the center is double the squared radius. */
+      int minimum_squared_distance = radius * radius * 2;
+
+      /* Find the squared distance to the nearest different pixel in the search window of the given
+       * radius. */
+      for (int y = -radius; y <= radius; y++) {
+        for (int x = -radius; x <= radius; x++) {
+          bool is_sample_masked = input.load_pixel_fallback(texel + int2(x, y), fallback).x > 0.5f;
+          if (is_center_masked != is_sample_masked) {
+            minimum_squared_distance = math::min(minimum_squared_distance, x * x + y * y);
+          }
+        }
+      }
+
+      /* Compute the actual distance from the squared distance and assign it an appropriate sign
+       * depending on whether it lies in a masked region or not. */
+      float signed_minimum_distance = math::sqrt(float(minimum_squared_distance)) *
+                                      (is_center_masked ? 1.0f : -1.0f);
+
+      /* Add the erode/dilate distance and divide by the inset amount as described in the
+       * discussion, then clamp to the [0, 1] range. */
+      float value = math::clamp((signed_minimum_distance + distance) / inset, 0.0f, 1.0f);
+
+      output.store_pixel(texel, float4(value));
+    });
  }

  /* See the discussion in the implementation for more information. */
@@ -282,7 +474,7 @@ class DilateErodeOperation : public NodeOperation {

  CMPNodeDilateErodeMethod get_method()
  {
-    return (CMPNodeDilateErodeMethod)bnode().custom1;
+    return static_cast<CMPNodeDilateErodeMethod>(bnode().custom1);
  }
 };