From 71e971700cd66609bd3707fe4e896f2029c82b80 Mon Sep 17 00:00:00 2001 From: Omar Emara Date: Wed, 20 Nov 2024 15:57:39 +0200 Subject: [PATCH] Compositor: Implement Summed Area Table for new CPU compositor Reference #125968. --- .../COM_algorithm_summed_area_table.hh | 51 ++++++++++++++++ .../algorithms/intern/summed_area_table.cc | 59 +++++++++++++++++-- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/source/blender/compositor/realtime_compositor/algorithms/COM_algorithm_summed_area_table.hh b/source/blender/compositor/realtime_compositor/algorithms/COM_algorithm_summed_area_table.hh index cd75f60359e..0d8088f2092 100644 --- a/source/blender/compositor/realtime_compositor/algorithms/COM_algorithm_summed_area_table.hh +++ b/source/blender/compositor/realtime_compositor/algorithms/COM_algorithm_summed_area_table.hh @@ -4,6 +4,9 @@ #pragma once +#include "BLI_math_vector.hh" +#include "BLI_math_vector_types.hh" + #include "COM_context.hh" #include "COM_result.hh" @@ -26,4 +29,52 @@ void summed_area_table(Context &context, Result &output, SummedAreaTableOperation operation = SummedAreaTableOperation::Identity); +/* Computes the sum of the rectangular region defined by the given lower and upper bounds from the + * given summed area table. It is assumed that the given upper bound is larger than the given lower + * bound, otherwise, undefined behavior is invoked. 
Looking at the diagram below, in order to + * compute the sum of area X, we sample the table at each of the corners of the area X, to get: + * + * Upper Right -> A + B + C + X (1) + * Upper Left -> A + B (2) + * Lower Right -> B + C (3) + * Lower Left -> B (4) + * + * We start from (1) and subtract (2) and (3) to get rid of A and C to get: + * + * (A + B + C + X) - (A + B) - (B + C) = (X - B) + * + * To get rid of B, we add (4) to get: + * + * (X - B) + B = X + * + * ^ + * | + * +-------+-----+ + * | | | + * | A | X | + * | | | + * +-------+-----+ + * | | | + * | B | C | + * | | | + * o-------+-----+------> + * + * The aforementioned equation eliminates the edges between regions X, C, and A since they get + * subtracted along with C and A. To avoid this, we subtract 1 from the lower bound and fall back + * to zero for out of bounds samples. The upper bound is clamped to the last pixel of the table. */ +inline float4 summed_area_table_sum(const Result &table, + const int2 &lower_bound, + const int2 &upper_bound) +{ + int2 corrected_lower_bound = lower_bound - int2(1); + int2 corrected_upper_bound = math::min(table.domain().size - int2(1), upper_bound); + float4 addend = table.load_pixel_fallback(corrected_upper_bound, float4(0.0f)) + + table.load_pixel_fallback(corrected_lower_bound, float4(0.0f)); + float4 subtrahend = table.load_pixel_fallback( + int2(corrected_lower_bound.x, corrected_upper_bound.y), float4(0.0f)) + + table.load_pixel_fallback( + int2(corrected_upper_bound.x, corrected_lower_bound.y), float4(0.0f)); + return addend - subtrahend; +} + } // namespace blender::realtime_compositor diff --git a/source/blender/compositor/realtime_compositor/algorithms/intern/summed_area_table.cc b/source/blender/compositor/realtime_compositor/algorithms/intern/summed_area_table.cc index c01c61fc220..e28619dec88 100644 --- a/source/blender/compositor/realtime_compositor/algorithms/intern/summed_area_table.cc +++ b/source/blender/compositor/realtime_compositor/algorithms/intern/summed_area_table.cc @@ -3,9 +3,11 @@ * 
SPDX-License-Identifier: GPL-2.0-or-later */ #include "BLI_assert.h" +#include "BLI_index_range.hh" #include "BLI_math_base.hh" #include "BLI_math_vector.hh" #include "BLI_math_vector_types.hh" +#include "BLI_task.hh" #include "GPU_compute.hh" #include "GPU_shader.hh" @@ -199,10 +201,10 @@ static void compute_complete_blocks(Context &context, output.unbind_as_image(); } -void summed_area_table(Context &context, - Result &input, - Result &output, - SummedAreaTableOperation operation) +static void summed_area_table_gpu(Context &context, + Result &input, + Result &output, + SummedAreaTableOperation operation) { Result incomplete_x_prologues = context.create_result(ResultType::Color, ResultPrecision::Full); Result incomplete_y_prologues = context.create_result(ResultType::Color, ResultPrecision::Full); @@ -228,4 +230,53 @@ void summed_area_table(Context &context, complete_y_prologues.release(); } +/* Computes the summed area table as a cascade of a horizontal summing pass followed by a vertical + * summing pass. The operation is only applied to the input pixels in the horizontal pass. */ +static void summed_area_table_cpu(Result &input, + Result &output, + SummedAreaTableOperation operation) +{ + output.allocate_texture(input.domain()); + + /* Horizontal summing pass. Each row is accumulated independently along increasing x. */ + const int2 size = input.domain().size; + threading::parallel_for(IndexRange(size.y), 1, [&](const IndexRange range_y) { + for (const int y : range_y) { + float4 accumulated_color = float4(0.0f); + for (const int x : IndexRange(size.x)) { + const int2 texel = int2(x, y); + const float4 color = input.load_pixel(texel); + accumulated_color += operation == SummedAreaTableOperation::Square ? color * color : color; + output.store_pixel(texel, accumulated_color); + } + } + }); + + /* Vertical summing pass. Each output column is accumulated in place along increasing y. 
*/ + threading::parallel_for(IndexRange(size.x), 1, [&](const IndexRange range_x) { + for (const int x : range_x) { + float4 accumulated_color = float4(0.0f); + for (const int y : IndexRange(size.y)) { + const int2 texel = int2(x, y); + const float4 color = output.load_pixel(texel); + accumulated_color += color; + output.store_pixel(texel, accumulated_color); + } + } + }); +} + +void summed_area_table(Context &context, + Result &input, + Result &output, + SummedAreaTableOperation operation) +{ + if (context.use_gpu()) { + summed_area_table_gpu(context, input, output, operation); + } + else { + summed_area_table_cpu(input, output, operation); + } +} + } // namespace blender::realtime_compositor