From 0efb0ce48e9123118d0eb951103427e04ee5757b Mon Sep 17 00:00:00 2001 From: Omar Emara Date: Wed, 27 Nov 2024 19:25:50 +0200 Subject: [PATCH] Compositor: Implement Dilate node for new CPU compositor Reference #125968. --- .../composite/nodes/node_composite_dilate.cc | 244 ++++++++++++++++-- 1 file changed, 218 insertions(+), 26 deletions(-) diff --git a/source/blender/nodes/composite/nodes/node_composite_dilate.cc b/source/blender/nodes/composite/nodes/node_composite_dilate.cc index 2b65420f2a2..27a9753b5d0 100644 --- a/source/blender/nodes/composite/nodes/node_composite_dilate.cc +++ b/source/blender/nodes/composite/nodes/node_composite_dilate.cc @@ -6,6 +6,8 @@ * \ingroup cmpnodes */ +#include + #include "BLI_assert.h" #include "BLI_math_base.hh" #include "BLI_math_vector_types.hh" @@ -68,17 +70,6 @@ class DilateErodeOperation : public NodeOperation { void execute() override { - /* Not yet supported on CPU. */ - if (!context().use_gpu()) { - for (const bNodeSocket *output : this->node()->output_sockets()) { - Result &output_result = get_result(output->identifier); - if (output_result.should_compute()) { - output_result.allocate_invalid(); - } - } - return; - } - if (is_identity()) { get_input("Mask").pass_through(get_result("Mask")); return; @@ -115,6 +106,14 @@ class DilateErodeOperation : public NodeOperation { } Result execute_step_horizontal_pass() + { + if (this->context().use_gpu()) { + return this->execute_step_horizontal_pass_gpu(); + } + return this->execute_step_horizontal_pass_cpu(); + } + + Result execute_step_horizontal_pass_gpu() { GPUShader *shader = context().get_shader(get_morphological_step_shader_name()); GPU_shader_bind(shader); @@ -149,7 +148,40 @@ class DilateErodeOperation : public NodeOperation { return horizontal_pass_result; } + Result execute_step_horizontal_pass_cpu() + { + const Result &input = get_input("Mask"); + + /* We allocate an output image of a transposed size, that is, with a height equivalent to the + * width of the input 
and vice versa. This is done as a performance optimization. The shader + * will process the image horizontally and write it to the intermediate output transposed. Then + * the vertical pass will execute the same horizontal pass shader, but since its input is + * transposed, it will effectively do a vertical pass and write to the output transposed, + * effectively undoing the transposition in the horizontal pass. This is done to improve + * spatial cache locality in the shader and to avoid having two separate shaders for each of + * the passes. */ + const Domain domain = compute_domain(); + const int2 transposed_domain = int2(domain.size.y, domain.size.x); + + Result horizontal_pass_result = context().create_result(ResultType::Color); + horizontal_pass_result.allocate_texture(transposed_domain); + + this->execute_step_pass_cpu(input, horizontal_pass_result); + + return horizontal_pass_result; + } + void execute_step_vertical_pass(Result &horizontal_pass_result) + { + if (this->context().use_gpu()) { + this->execute_step_vertical_pass_gpu(horizontal_pass_result); + } + else { + this->execute_step_vertical_pass_cpu(horizontal_pass_result); + } + } + + void execute_step_vertical_pass_gpu(Result &horizontal_pass_result) { GPUShader *shader = context().get_shader(get_morphological_step_shader_name()); GPU_shader_bind(shader); @@ -173,6 +205,55 @@ class DilateErodeOperation : public NodeOperation { output_mask.unbind_as_image(); } + void execute_step_vertical_pass_cpu(Result &horizontal_pass_result) + { + const Domain domain = compute_domain(); + Result &output_mask = get_result("Mask"); + output_mask.allocate_texture(domain); + + this->execute_step_pass_cpu(horizontal_pass_result, output_mask); + } + + void execute_step_pass_cpu(const Result &input, Result &output) + { + /* We have specialized code for each sign, so use the absolute value. 
*/ + const int radius = math::abs(this->get_distance()); + + /* Notice that the size is transposed, see the note on the horizontal pass method for more + * information on the reasoning behind this. */ + const int2 size = int2(output.domain().size.y, output.domain().size.x); + if (this->get_distance() > 0) { + parallel_for(size, [&](const int2 texel) { + /* Find the maximum value in the window of the given radius around the pixel. This + * is essentially a morphological dilate operator with a square structuring element. */ + const float limit = std::numeric_limits::lowest(); + float value = limit; + for (int i = -radius; i <= radius; i++) { + value = math::max(value, input.load_pixel_fallback(texel + int2(i, 0), float4(limit)).x); + } + + /* Write the value using the transposed texel. See the horizontal pass method + * for more information on the rationale behind this. */ + output.store_pixel(int2(texel.y, texel.x), float4(value)); + }); + } + else { + parallel_for(size, [&](const int2 texel) { + /* Find the minimum value in the window of the given radius around the pixel. This + * is essentially a morphological erode operator with a square structuring element. */ + const float limit = std::numeric_limits::max(); + float value = limit; + for (int i = -radius; i <= radius; i++) { + value = math::min(value, input.load_pixel_fallback(texel + int2(i, 0), float4(limit)).x); + } + + /* Write the value using the transposed texel. See the horizontal pass method + * for more information on the rationale behind this. 
*/ + output.store_pixel(int2(texel.y, texel.x), float4(value)); + }); + } + } + const char *get_morphological_step_shader_name() { if (get_distance() > 0) { @@ -195,6 +276,29 @@ class DilateErodeOperation : public NodeOperation { * ------------------------------------------ */ void execute_distance_threshold() + { + Result output_mask = context().create_result(ResultType::Float); + + if (this->context().use_gpu()) { + this->execute_distance_threshold_gpu(output_mask); + } + else { + this->execute_distance_threshold_cpu(output_mask); + } + + /* For configurations where there is little user-specified inset, anti-alias the result for + * smoother edges. */ + Result &output = this->get_result("Mask"); + if (this->get_inset() < 2.0f) { + smaa(this->context(), output_mask, output); + output_mask.release(); + } + else { + output.steal_data(output_mask); + } + } + + void execute_distance_threshold_gpu(Result &output) { GPUShader *shader = context().get_shader("compositor_morphological_distance_threshold"); GPU_shader_bind(shader); @@ -207,26 +311,114 @@ class DilateErodeOperation : public NodeOperation { input_mask.bind_as_texture(shader, "input_tx"); const Domain domain = compute_domain(); - Result output_mask = context().create_result(ResultType::Float); - output_mask.allocate_texture(domain); - output_mask.bind_as_image(shader, "output_img"); + output.allocate_texture(domain); + output.bind_as_image(shader, "output_img"); compute_dispatch_threads_at_least(shader, domain.size); GPU_shader_unbind(); - output_mask.unbind_as_image(); + output.unbind_as_image(); input_mask.unbind_as_texture(); + } - /* For configurations where there is little user-specified inset, anti-alias the result for - * smoother edges. 
*/ - Result &output = get_result("Mask"); - if (get_inset() < 2.0f) { - smaa(context(), output_mask, output); - output_mask.release(); - } - else { - output.steal_data(output_mask); - } + void execute_distance_threshold_cpu(Result &output) + { + const Result &input = get_input("Mask"); + + const Domain domain = compute_domain(); + output.allocate_texture(domain); + + const float inset = math::max(this->get_inset(), 10e-6f); + const int radius = this->get_morphological_distance_threshold_radius(); + const int distance = this->get_distance(); + + /* The Morphological Distance Threshold operation is effectively three consecutive operations + * implemented as a single operation. The three operations are as follows: + * + * .-----------. .--------------. .----------------. + * | Threshold |-->| Dilate/Erode |-->| Distance Inset | + * '-----------' '--------------' '----------------' + * + * The threshold operation just converts the input into a binary image, where the pixel is 1 if + * it is larger than 0.5 and 0 otherwise. Pixels that are 1 in the output of the threshold + * operation are said to be masked. The dilate/erode operation is a dilate or erode + * morphological operation with a circular structuring element depending on the sign of the + * distance, where it is a dilate operation if the distance is positive and an erode operation + * otherwise. This is equivalent to the Morphological Distance operation, see its + * implementation for more information. Finally, the distance inset is an operation that + * converts the binary image into a narrow band distance field. That is, pixels that are + * unmasked will remain 0, while pixels that are masked will start from zero at the boundary of + * the masked region and linearly increase until reaching 1 in the span of a number of pixels + * given by the inset value. 
+ * + * As a performance optimization, the dilate/erode operation is omitted and its effective + * result is achieved by slightly adjusting the distance inset operation. The base distance + * inset operation works by computing the signed distance from the current center pixel to the + * nearest pixel with a different value. Since our image is a binary image, that means that if + * the pixel is masked, we compute the signed distance to the nearest unmasked pixel, and if + * the pixel is unmasked, we compute the signed distance to the nearest masked pixel. The distance + * is positive if the pixel is masked and negative otherwise. The distance is then normalized + * by dividing by the given inset value and clamped to the [0, 1] range. Since distances larger + * than the inset value are eventually clamped, the distance search window is limited to a + * radius equivalent to the inset value. + * + * To achieve the effective result of the omitted dilate/erode operation, we adjust the + * distance inset operation as follows. First, we increase the radius of the distance search + * window by the radius of the dilate/erode operation. Then we adjust the resulting narrow band + * signed distance field as follows. + * + * For the erode case, we merely subtract the erode distance, which makes the outermost erode + * distance number of pixels zero due to clamping, consequently achieving the result of the + * erode, while retaining the needed inset because we increased the distance search window by + * the same amount we subtracted. 
+ * + * Similarly, for the dilate case, we add the dilate distance, which makes the dilate distance + * number of pixels just outside of the masked region positive and part of the narrow band + * distance field, consequently achieving the result of the dilate, while at the same time, the + * innermost dilate distance number of pixels become 1 due to clamping, retaining the needed + * inset because we increased the distance search window by the same amount we added. + * + * Since the erode/dilate distance is already signed appropriately as described before, we just + * add it in both cases. */ + parallel_for(domain.size, [&](const int2 texel) { + /* Apply a threshold operation on the center pixel, where the threshold is currently + * hard-coded at 0.5. The pixels with values larger than the threshold are said to be masked. + */ + bool is_center_masked = input.load_pixel(texel).x > 0.5f; + + /* Since the distance search window will access pixels outside of the bounds of the image, we + * use a texture loader with a fallback value. And since we don't want those values to affect + * the result, the fallback value is chosen such that the inner condition fails, which is + * when the sampled pixel and the center pixel are the same, so choose a fallback that will + * be considered masked if the center pixel is masked and unmasked otherwise. */ + float4 fallback = float4(is_center_masked ? 1.0f : 0.0f); + + /* Since the distance search window is limited to the given radius, the maximum possible + * squared distance to the center is double the squared radius. */ + int minimum_squared_distance = radius * radius * 2; + + /* Find the squared distance to the nearest different pixel in the search window of the given + * radius. 
*/ + for (int y = -radius; y <= radius; y++) { + for (int x = -radius; x <= radius; x++) { + bool is_sample_masked = input.load_pixel_fallback(texel + int2(x, y), fallback).x > 0.5f; + if (is_center_masked != is_sample_masked) { + minimum_squared_distance = math::min(minimum_squared_distance, x * x + y * y); + } + } + } + + /* Compute the actual distance from the squared distance and assign it an appropriate sign + * depending on whether it lies in a masked region or not. */ + float signed_minimum_distance = math::sqrt(float(minimum_squared_distance)) * + (is_center_masked ? 1.0f : -1.0f); + + /* Add the erode/dilate distance and divide by the inset amount as described in the + * discussion, then clamp to the [0, 1] range. */ + float value = math::clamp((signed_minimum_distance + distance) / inset, 0.0f, 1.0f); + + output.store_pixel(texel, float4(value)); + }); } /* See the discussion in the implementation for more information. */ @@ -282,7 +474,7 @@ class DilateErodeOperation : public NodeOperation { CMPNodeDilateErodeMethod get_method() { - return (CMPNodeDilateErodeMethod)bnode().custom1; + return static_cast(bnode().custom1); } };