Files
test/source/blender/blenlib/intern/index_mask_expression.cc
2024-03-21 10:02:53 +11:00

1362 lines
55 KiB
C++

/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/**
* Expression evaluation has multiple phases:
* 1. A coarse evaluation that tries to find segments which can be trivially evaluated. For
* example, taking the union of two overlapping ranges can be done in O(1) time.
* 2. For all segments which can't be fully evaluated using coarse evaluation, an exact evaluation
* is done. This uses either an index-based or bit-based approach depending on a heuristic.
* 3. Construct the final index mask based on the resulting intermediate segments.
*/
#include "BLI_array.hh"
#include "BLI_bit_group_vector.hh"
#include "BLI_bit_span_ops.hh"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_index_mask_expression.hh"
#include "BLI_stack.hh"
#include "BLI_strict_flags.h"
#include "BLI_task.hh"
#include "BLI_timeit.hh"
namespace blender::index_mask {
/**
 * Number of expression terms which don't require extra allocations in some places.
 * Used as the inline-buffer size of per-expression result arrays below, so expressions with up to
 * this many sub-expressions avoid heap allocations.
 */
constexpr int64_t inline_expr_array_size = 16;
/**
 * The result of the coarse evaluation for a specific index range.
 */
struct CoarseSegment {
  enum class Type {
    /**
     * Coarse evaluation couldn't fully resolve this segment. The segment requires another
     * evaluation that is more detailed.
     */
    Unknown,
    /** All indices in the segment are part of the result. */
    Full,
    /** The evaluated result of this segment is just the copy of an input index mask. */
    Copy,
  };
  Type type = Type::Unknown;
  /** The index range this segment covers. */
  IndexRange bounds;
  /** Mask used when the type is #Copy. */
  const IndexMask *mask = nullptr;
};
/** Contains the result of a coarse evaluation split into potentially many segments. */
struct CoarseResult {
  /* Segments are appended in increasing index order by the evaluation functions below. */
  Vector<CoarseSegment> segments;
};
/** Used during coarse evaluation to split the full range into multiple segments. */
/* NOTE(review): "Course" here is presumably a typo for "Coarse" (compare #CoarseSegment);
 * renaming would touch all users, so the spelling is kept. */
struct CourseBoundary {
  /**
   * The position of the boundary. The boundary is right before this index. So if this boundary is
   * a beginning of a segment, the index marks the first element. If it is the end, the index marks
   * the one-after-last position.
   */
  int64_t index;
  /** Whether this boundary is the beginning or end of the segment below. */
  bool is_begin;
  /** The segment this boundary comes from. */
  const CoarseSegment *segment;
};
/** For the difference operation, we need to know if a boundary belongs to the main term or not. */
struct DifferenceCourseBoundary : public CourseBoundary {
  /** True if this boundary comes from the first (main) term of the difference. */
  bool is_main;
};
/**
 * Result of the expression evaluation within a specific index range. Sometimes this can be derived
 * directly from the coarse evaluation, but sometimes an additional exact evaluation is necessary.
 */
struct EvaluatedSegment {
  enum class Type {
    /** All indices in this segment are part of the evaluated index mask. */
    Full,
    /** The result in this segment is the same as what is contained in the #copy_mask below. */
    Copy,
    /** The result comes from exact evaluation and is a new set of indices. */
    Indices,
  };
  Type type = Type::Indices;
  /** The index range this segment covers. */
  IndexRange bounds;
  /** Only used when the type is #Type::Copy. */
  const IndexMask *copy_mask = nullptr;
  /** Only used when the type is #Type::Indices. */
  IndexMaskSegment indices;
};
/**
 * There are different ways to do the exact evaluation. Depending on the expression or data, one
 * or the other is more efficient.
 */
enum class ExactEvalMode {
  /**
   * Does the evaluation by working directly with arrays of sorted indices. This is usually best
   * when the expression does not have intermediate results, i.e. it is very simple.
   */
  Indices,
  /**
   * The evaluation works with bits. There is extra overhead to convert the input masks to bit
   * arrays and to convert the final result back into indices. In exchange, the actual expression
   * evaluation is significantly cheaper because it's just a bunch of bit operations. For larger
   * expressions, this is typically much more efficient.
   */
  Bits,
};
/** Sort the boundaries by position so they can be processed by a single left-to-right sweep. */
static void sort_course_boundaries(MutableSpan<CourseBoundary> boundaries)
{
  const auto index_less = [](const CourseBoundary &left, const CourseBoundary &right) {
    return left.index < right.index;
  };
  std::sort(boundaries.begin(), boundaries.end(), index_less);
}
/** Sort the difference-boundaries by position for the left-to-right sweep. */
static void sort_course_boundaries(MutableSpan<DifferenceCourseBoundary> boundaries)
{
  const auto index_less = [](const DifferenceCourseBoundary &left,
                             const DifferenceCourseBoundary &right) {
    return left.index < right.index;
  };
  std::sort(boundaries.begin(), boundaries.end(), index_less);
}
/** Smaller segments should generally be merged together. Segments below this size are absorbed
 * into their neighbor by the #add_coarse_segment__* functions below. */
static constexpr int64_t segment_size_threshold = 32;
/**
 * Extends a previous full segment or appends a new one.
 * \return The segment that now covers the range up to \a current_boundary_index.
 */
static CoarseSegment &add_coarse_segment__full(CoarseSegment *prev_segment,
                                               const int64_t prev_boundary_index,
                                               const int64_t current_boundary_index,
                                               CoarseResult &result)
{
  const int64_t new_size = current_boundary_index - prev_boundary_index;
  if (prev_segment != nullptr) {
    const bool prev_is_adjacent_full = prev_segment->type == CoarseSegment::Type::Full &&
                                       prev_segment->bounds.one_after_last() ==
                                           prev_boundary_index;
    if (prev_is_adjacent_full) {
      /* Simply grow the adjacent full segment. */
      prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index);
      return *prev_segment;
    }
    const bool merged_fits = current_boundary_index - prev_segment->bounds.start() <
                             max_segment_size;
    const bool merged_is_tiny = prev_segment->bounds.size() + new_size < segment_size_threshold;
    if (merged_fits && merged_is_tiny) {
      /* The previous segment is so small that it's better to merge it with this one. The merged
       * segment has to be evaluated exactly later, so it becomes an unknown segment. */
      prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index);
      prev_segment->type = CoarseSegment::Type::Unknown;
      return *prev_segment;
    }
  }
  result.segments.append(
      {CoarseSegment::Type::Full, IndexRange::from_begin_size(prev_boundary_index, new_size)});
  return result.segments.last();
}
/**
 * Extends a previous unknown segment or appends a new one.
 * \return The segment that now covers the range up to \a current_boundary_index.
 */
static CoarseSegment &add_coarse_segment__unknown(CoarseSegment *prev_segment,
                                                  const int64_t prev_boundary_index,
                                                  const int64_t current_boundary_index,
                                                  CoarseResult &result)
{
  const bool can_absorb_prev = prev_segment != nullptr &&
                               prev_segment->bounds.start() + segment_size_threshold >=
                                   prev_boundary_index;
  if (can_absorb_prev) {
    /* The previous segment is very short, so extend it instead of adding a new segment. */
    prev_segment->type = CoarseSegment::Type::Unknown;
    prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index);
    return *prev_segment;
  }
  result.segments.append(
      {CoarseSegment::Type::Unknown,
       IndexRange::from_begin_end(prev_boundary_index, current_boundary_index)});
  return result.segments.last();
}
/**
 * Extends a previous copy segment or appends a new one.
 * \return The segment that now covers the range up to \a current_boundary_index.
 */
static CoarseSegment &add_coarse_segment__copy(CoarseSegment *prev_segment,
                                               const int64_t prev_boundary_index,
                                               const int64_t current_boundary_index,
                                               const IndexMask &copy_from_mask,
                                               CoarseResult &result)
{
  if (prev_segment != nullptr) {
    const bool continues_same_copy = prev_segment->type == CoarseSegment::Type::Copy &&
                                     prev_segment->bounds.one_after_last() ==
                                         prev_boundary_index &&
                                     prev_segment->mask == &copy_from_mask;
    if (continues_same_copy) {
      /* Can extend the previous copy segment because it references the same mask. */
      prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index);
      return *prev_segment;
    }
    const bool merged_is_tiny = prev_segment->bounds.start() + segment_size_threshold >=
                                current_boundary_index;
    if (merged_is_tiny) {
      /* The previous and this segment together are very short, so better merge them together.
       * The merged segment has to be evaluated exactly, so it becomes unknown. */
      prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index);
      prev_segment->type = CoarseSegment::Type::Unknown;
      return *prev_segment;
    }
  }
  result.segments.append({CoarseSegment::Type::Copy,
                          IndexRange::from_begin_end(prev_boundary_index, current_boundary_index),
                          &copy_from_mask});
  return result.segments.last();
}
/**
 * Sweep over the sorted \a boundaries and build the coarse union result. Between two consecutive
 * boundary positions the set of active input segments is constant, so the result type for that
 * sub-range can be derived in constant time from the active segments.
 */
static void evaluate_coarse_union(const Span<CourseBoundary> boundaries, CoarseResult &r_result)
{
  if (boundaries.is_empty()) {
    return;
  }
  CoarseResult &result = r_result;
  /* The most recently emitted segment; used by #add_coarse_segment__* to merge/extend. */
  CoarseSegment *prev_segment = nullptr;
  /* Input segments overlapping the current sweep position. */
  Vector<const CoarseSegment *, 16> active_segments;
  int64_t prev_boundary_index = boundaries[0].index;
  for (const CourseBoundary &boundary : boundaries) {
    if (prev_boundary_index < boundary.index) {
      /* Compute some properties of the input segments that were active between the current and the
       * previous boundary. */
      bool has_full = false;
      bool has_unknown = false;
      bool copy_from_single_mask = true;
      const IndexMask *copy_from_mask = nullptr;
      for (const CoarseSegment *active_segment : active_segments) {
        switch (active_segment->type) {
          case CoarseSegment::Type::Unknown: {
            has_unknown = true;
            break;
          }
          case CoarseSegment::Type::Full: {
            has_full = true;
            break;
          }
          case CoarseSegment::Type::Copy: {
            if (!ELEM(copy_from_mask, nullptr, active_segment->mask)) {
              copy_from_single_mask = false;
            }
            copy_from_mask = active_segment->mask;
            break;
          }
        }
      }
      /* Determine the resulting coarse segment type based on the properties computed above. */
      if (has_full) {
        /* Any full input makes the union full in this range. */
        prev_segment = &add_coarse_segment__full(
            prev_segment, prev_boundary_index, boundary.index, result);
      }
      else if (has_unknown || !copy_from_single_mask) {
        prev_segment = &add_coarse_segment__unknown(
            prev_segment, prev_boundary_index, boundary.index, result);
      }
      else if (copy_from_mask != nullptr && copy_from_single_mask) {
        /* All active segments copy from the same mask, so the union is a copy of it too.
         * If no segment is active (a gap), nothing is emitted. */
        prev_segment = &add_coarse_segment__copy(
            prev_segment, prev_boundary_index, boundary.index, *copy_from_mask, result);
      }
      prev_boundary_index = boundary.index;
    }
    /* Update active segments. */
    if (boundary.is_begin) {
      active_segments.append(boundary.segment);
    }
    else {
      active_segments.remove_first_occurrence_and_reorder(boundary.segment);
    }
  }
}
/**
 * Sweep over the sorted \a boundaries and build the coarse intersection result of \a terms_num
 * input terms. A sub-range can only contain anything if exactly one segment per term is active.
 */
static void evaluate_coarse_intersection(const Span<CourseBoundary> boundaries,
                                         const int64_t terms_num,
                                         CoarseResult &r_result)
{
  if (boundaries.is_empty()) {
    return;
  }
  CoarseResult &result = r_result;
  /* The most recently emitted segment; used by #add_coarse_segment__* to merge/extend. */
  CoarseSegment *prev_segment = nullptr;
  /* Input segments overlapping the current sweep position. */
  Vector<const CoarseSegment *, 16> active_segments;
  int64_t prev_boundary_index = boundaries[0].index;
  for (const CourseBoundary &boundary : boundaries) {
    if (prev_boundary_index < boundary.index) {
      /* Only if one segment of each term is active, it's possible that the output contains
       * anything. */
      if (active_segments.size() == terms_num) {
        /* Compute some properties of the input segments that were active between the current and
         * previous boundary. */
        int full_count = 0;
        int unknown_count = 0;
        int copy_count = 0;
        bool copy_from_single_mask = true;
        const IndexMask *copy_from_mask = nullptr;
        for (const CoarseSegment *active_segment : active_segments) {
          switch (active_segment->type) {
            case CoarseSegment::Type::Unknown: {
              unknown_count++;
              break;
            }
            case CoarseSegment::Type::Full: {
              full_count++;
              break;
            }
            case CoarseSegment::Type::Copy: {
              copy_count++;
              if (!ELEM(copy_from_mask, nullptr, active_segment->mask)) {
                copy_from_single_mask = false;
              }
              copy_from_mask = active_segment->mask;
              break;
            }
          }
        }
        /* Determine the resulting coarse segment type based on the properties computed above. */
        BLI_assert(full_count + unknown_count + copy_count == terms_num);
        if (full_count == terms_num) {
          /* All inputs are full, so the intersection is full too. */
          prev_segment = &add_coarse_segment__full(
              prev_segment, prev_boundary_index, boundary.index, result);
        }
        else if (unknown_count > 0 || copy_count < terms_num || !copy_from_single_mask) {
          prev_segment = &add_coarse_segment__unknown(
              prev_segment, prev_boundary_index, boundary.index, result);
        }
        else if (copy_count == terms_num && copy_from_single_mask) {
          /* All inputs copy from the same mask; intersecting a mask with itself is a no-op. */
          prev_segment = &add_coarse_segment__copy(
              prev_segment, prev_boundary_index, boundary.index, *copy_from_mask, result);
        }
      }
      prev_boundary_index = boundary.index;
    }
    /* Update active segments. */
    if (boundary.is_begin) {
      active_segments.append(boundary.segment);
    }
    else {
      active_segments.remove_first_occurrence_and_reorder(boundary.segment);
    }
  }
}
/**
 * Sweep over the sorted \a boundaries and build the coarse result of subtracting all
 * subtract-terms from the single main term. The result can only contain anything where a main
 * segment is active.
 */
static void evaluate_coarse_difference(const Span<DifferenceCourseBoundary> boundaries,
                                       CoarseResult &r_result)
{
  if (boundaries.is_empty()) {
    return;
  }
  CoarseResult &result = r_result;
  /* The most recently emitted segment; used by #add_coarse_segment__* to merge/extend. */
  CoarseSegment *prev_segment = nullptr;
  Vector<const CoarseSegment *> active_main_segments;
  Vector<const CoarseSegment *, 16> active_subtract_segments;
  int64_t prev_boundary_index = boundaries[0].index;
  for (const DifferenceCourseBoundary &boundary : boundaries) {
    if (prev_boundary_index < boundary.index) {
      /* There is only one main term, so at most one main segment can be active at once. */
      BLI_assert(active_main_segments.size() <= 1);
      if (active_main_segments.size() == 1) {
        const CoarseSegment &active_main_segment = *active_main_segments[0];
        /* Compute some properties of the input segments that were active between the current and
         * the previous boundary. */
        bool has_subtract_full = false;
        bool has_subtract_same_mask = false;
        for (const CoarseSegment *active_subtract_segment : active_subtract_segments) {
          switch (active_subtract_segment->type) {
            case CoarseSegment::Type::Unknown: {
              break;
            }
            case CoarseSegment::Type::Full: {
              has_subtract_full = true;
              break;
            }
            case CoarseSegment::Type::Copy: {
              if (active_main_segment.type == CoarseSegment::Type::Copy) {
                if (active_main_segment.mask == active_subtract_segment->mask) {
                  has_subtract_same_mask = true;
                }
              }
              break;
            }
          }
        }
        /* Determine the resulting coarse segment type based on the properties computed above. */
        if (has_subtract_full) {
          /* Do nothing, the resulting segment is empty for the current range. */
        }
        else {
          switch (active_main_segment.type) {
            case CoarseSegment::Type::Unknown: {
              prev_segment = &add_coarse_segment__unknown(
                  prev_segment, prev_boundary_index, boundary.index, result);
              break;
            }
            case CoarseSegment::Type::Full: {
              if (active_subtract_segments.is_empty()) {
                /* Nothing is subtracted in this range, so the result stays full. */
                prev_segment = &add_coarse_segment__full(
                    prev_segment, prev_boundary_index, boundary.index, result);
              }
              else {
                prev_segment = &add_coarse_segment__unknown(
                    prev_segment, prev_boundary_index, boundary.index, result);
              }
              break;
            }
            case CoarseSegment::Type::Copy: {
              if (active_subtract_segments.is_empty()) {
                /* Nothing is subtracted in this range, so the copy passes through unchanged. */
                prev_segment = &add_coarse_segment__copy(prev_segment,
                                                         prev_boundary_index,
                                                         boundary.index,
                                                         *active_main_segment.mask,
                                                         result);
              }
              else if (has_subtract_same_mask) {
                /* Do nothing, subtracting a mask from itself results in an empty mask. */
              }
              else {
                prev_segment = &add_coarse_segment__unknown(
                    prev_segment, prev_boundary_index, boundary.index, result);
              }
              break;
            }
          }
        }
      }
      prev_boundary_index = boundary.index;
    }
    /* Update active segments. */
    if (boundary.is_main) {
      if (boundary.is_begin) {
        active_main_segments.append(boundary.segment);
      }
      else {
        active_main_segments.remove_first_occurrence_and_reorder(boundary.segment);
      }
    }
    else {
      if (boundary.is_begin) {
        active_subtract_segments.append(boundary.segment);
      }
      else {
        active_subtract_segments.remove_first_occurrence_and_reorder(boundary.segment);
      }
    }
  }
}
/**
 * The coarse evaluation only looks at the index masks as a whole within the given bounds. This
 * limitation allows it to do many operations in constant time independent of the number of indices
 * within each mask. For example, it can detect that two full index masks that overlap result in a
 * new full index mask when the union or intersection is computed.
 *
 * For more complex index-masks, coarse evaluation outputs segments with type
 * #CoarseSegment::Type::Unknown. Those segments can be evaluated in more detail afterwards.
 *
 * \param root_expression: Expression to be evaluated.
 * \param eval_order: Pre-computed evaluation order.
 *   All children of a term must come before the term itself.
 * \param eval_bounds: If given, the evaluation is restricted to those bounds.
 *   Otherwise, the full referenced masks are used.
 */
static CoarseResult evaluate_coarse(const Expr &root_expression,
                                    const Span<const Expr *> eval_order,
                                    const std::optional<IndexRange> eval_bounds = std::nullopt)
{
  /* An expression result for each intermediate expression. */
  Array<std::optional<CoarseResult>, inline_expr_array_size> expression_results(
      root_expression.expression_array_size());
  /* Process expressions in a pre-determined order. */
  for (const Expr *expression : eval_order) {
    CoarseResult &expr_result = expression_results[expression->index].emplace();
    switch (expression->type) {
      case Expr::Type::Atomic: {
        const AtomicExpr &expr = expression->as_atomic();
        IndexMask mask;
        if (eval_bounds.has_value()) {
          mask = expr.mask->slice_content(*eval_bounds);
        }
        else {
          mask = *expr.mask;
        }
        if (!mask.is_empty()) {
          const IndexRange bounds = mask.bounds();
          /* A mask that is a single contiguous range becomes a full segment; anything else
           * becomes a copy of the input mask. */
          if (const std::optional<IndexRange> range = mask.to_range()) {
            expr_result.segments.append({CoarseSegment::Type::Full, bounds});
          }
          else {
            expr_result.segments.append({CoarseSegment::Type::Copy, bounds, expr.mask});
          }
        }
        break;
      }
      case Expr::Type::Union: {
        const UnionExpr &expr = expression->as_union();
        /* Gather begin/end boundaries of all term segments for the sweep. */
        Vector<CourseBoundary, 16> boundaries;
        for (const Expr *term : expr.terms) {
          const CoarseResult &term_result = *expression_results[term->index];
          for (const CoarseSegment &segment : term_result.segments) {
            boundaries.append({segment.bounds.first(), true, &segment});
            boundaries.append({segment.bounds.one_after_last(), false, &segment});
          }
        }
        sort_course_boundaries(boundaries);
        evaluate_coarse_union(boundaries, expr_result);
        break;
      }
      case Expr::Type::Intersection: {
        const IntersectionExpr &expr = expression->as_intersection();
        /* Gather begin/end boundaries of all term segments for the sweep. */
        Vector<CourseBoundary, 16> boundaries;
        for (const Expr *term : expr.terms) {
          const CoarseResult &term_result = *expression_results[term->index];
          for (const CoarseSegment &segment : term_result.segments) {
            boundaries.append({segment.bounds.first(), true, &segment});
            boundaries.append({segment.bounds.one_after_last(), false, &segment});
          }
        }
        sort_course_boundaries(boundaries);
        evaluate_coarse_intersection(boundaries, expr.terms.size(), expr_result);
        break;
      }
      case Expr::Type::Difference: {
        const DifferenceExpr &expr = expression->as_difference();
        /* Gather boundaries, tagging whether each comes from the main term (terms[0]). */
        Vector<DifferenceCourseBoundary, 16> boundaries;
        const CoarseResult &main_term_result = *expression_results[expr.terms[0]->index];
        for (const CoarseSegment &segment : main_term_result.segments) {
          boundaries.append({{segment.bounds.first(), true, &segment}, true});
          boundaries.append({{segment.bounds.one_after_last(), false, &segment}, true});
        }
        for (const Expr *term : expr.terms.as_span().drop_front(1)) {
          const CoarseResult &term_result = *expression_results[term->index];
          for (const CoarseSegment &segment : term_result.segments) {
            boundaries.append({{segment.bounds.first(), true, &segment}, false});
            boundaries.append({{segment.bounds.one_after_last(), false, &segment}, false});
          }
        }
        sort_course_boundaries(boundaries);
        evaluate_coarse_difference(boundaries, expr_result);
        break;
      }
    }
  }
  CoarseResult &final_result = *expression_results[root_expression.index];
  return std::move(final_result);
}
/**
 * Convert a bit span into an array of indices (one per set bit), allocated from \a allocator.
 */
static Span<int16_t> bits_to_indices(const BoundedBitSpan bits, LinearAllocator<> &allocator)
{
  /* TODO: Could first count the number of set bits. */
  Vector<int16_t, max_segment_size> indices_vec;
  bits::foreach_1_index(bits, [&](const int64_t i) {
    /* Every set-bit index is below #max_segment_size, so at most #max_segment_size values are
     * appended, which matches the inline capacity of #indices_vec. */
    BLI_assert(i < max_segment_size);
    indices_vec.append_unchecked(int16_t(i));
  });
  return allocator.construct_array_copy<int16_t>(indices_vec);
}
/**
 * Does an exact evaluation of the expression within the given bounds. The evaluation generally
 * works in three steps:
 * 1. Convert input indices into bit spans.
 * 2. Use bit operations to evaluate the expression.
 * 3. Convert resulting bit span back to indices.
 *
 * The trade-off here is that the actual expression evaluation is much faster but the conversions
 * take some extra time. Therefore, this approach is best when the evaluation would otherwise take
 * longer than the conversions which is usually the case for non-trivial expressions.
 */
static IndexMaskSegment evaluate_exact_with_bits(const Expr &root_expression,
                                                 LinearAllocator<> &allocator,
                                                 const IndexRange bounds,
                                                 const Span<const Expr *> eval_order)
{
  BLI_assert(bounds.size() <= max_segment_size);
  const int64_t bounds_min = bounds.start();
  const int expr_array_size = root_expression.expression_array_size();
  /* Make bit span sizes a multiple of `BitsPerInt`. This allows the bit-wise operations to run a
   * bit more efficiently, because only full integers are processed. */
  const int64_t ints_in_bounds = ceil_division(bounds.size(), bits::BitsPerInt);
  /* One bit span per (intermediate) expression, all initialized to zero. */
  BitGroupVector<16 * 1024> expression_results(
      expr_array_size, ints_in_bounds * bits::BitsPerInt, false);
  for (const Expr *expression : eval_order) {
    MutableBoundedBitSpan expr_result = expression_results[expression->index];
    switch (expression->type) {
      case Expr::Type::Atomic: {
        /* Convert the input mask into bits, shifted so that `bounds.start()` maps to bit 0. */
        const AtomicExpr &expr = expression->as_atomic();
        const IndexMask mask = expr.mask->slice_content(bounds);
        mask.to_bits(expr_result, -bounds_min);
        break;
      }
      case Expr::Type::Union: {
        /* OR all terms together; `expr_result` starts out all-zero. */
        for (const Expr *term : expression->terms) {
          expr_result |= expression_results[term->index];
        }
        break;
      }
      case Expr::Type::Intersection: {
        /* Start from the first term, then AND the remaining terms in. */
        bits::copy_from_or(expr_result, expression_results[expression->terms[0]->index]);
        for (const Expr *term : expression->terms.as_span().drop_front(1)) {
          expr_result &= expression_results[term->index];
        }
        break;
      }
      case Expr::Type::Difference: {
        /* Start from the main term, then clear the bits of every subtract-term (`a & ~b`). */
        bits::copy_from_or(expr_result, expression_results[expression->terms[0]->index]);
        for (const Expr *term : expression->terms.as_span().drop_front(1)) {
          bits::mix_into_first_expr(
              [](const bits::BitInt a, const bits::BitInt b) { return a & ~b; },
              expr_result,
              expression_results[term->index]);
        }
        break;
      }
    }
  }
  const BoundedBitSpan final_bits = expression_results[root_expression.index];
  const Span<int16_t> indices = bits_to_indices(final_bits, allocator);
  return IndexMaskSegment(bounds_min, indices);
}
/**
 * Compute a new set of indices that is the union of the given segments.
 *
 * \param bounds_min: Offset of the evaluated range; the returned segment uses it as its offset
 *   and the written values are relative to it.
 * \param r_values: Destination buffer for the resulting indices; must be large enough for the
 *   final result.
 */
static IndexMaskSegment union_index_mask_segments(const Span<IndexMaskSegment> segments,
                                                  const int64_t bounds_min,
                                                  int16_t *r_values)
{
  if (segments.is_empty()) {
    return {};
  }
  if (segments.size() == 1) {
    /* Nothing to union; the input segment is returned unchanged. */
    return segments[0];
  }
  if (segments.size() == 2) {
    const IndexMaskSegment a = segments[0].shift(-bounds_min);
    const IndexMaskSegment b = segments[1].shift(-bounds_min);
    const int64_t size = std::set_union(a.begin(), a.end(), b.begin(), b.end(), r_values) -
                         r_values;
    return {bounds_min, {r_values, size}};
  }
  /* Sort input segments by their size, so that smaller segments are unioned first. This results in
   * smaller intermediate arrays and thus less work overall. */
  Vector<IndexMaskSegment> sorted_segments(segments);
  std::sort(
      sorted_segments.begin(),
      sorted_segments.end(),
      [](const IndexMaskSegment &a, const IndexMaskSegment &b) { return a.size() < b.size(); });
  std::array<int16_t, max_segment_size> tmp_indices;
  /* Can use r_values for temporary values because if it's large enough for the final result, it's
   * also large enough for intermediate results. */
  int16_t *buffer_a = r_values;
  int16_t *buffer_b = tmp_indices.data();
  if (sorted_segments.size() % 2 == 1) {
    /* Swap buffers so that the result is in #r_values in the end. The loop below swaps once per
     * iteration, so the parity of the segment count determines which buffer ends up last. */
    std::swap(buffer_a, buffer_b);
  }
  int64_t count = 0;
  {
    /* Initial union. */
    const IndexMaskSegment a = sorted_segments[0].shift(-bounds_min);
    const IndexMaskSegment b = sorted_segments[1].shift(-bounds_min);
    int16_t *dst = buffer_a;
    count = std::set_union(a.begin(), a.end(), b.begin(), b.end(), dst) - dst;
  }
  /* Union one input into the result at a time. In theory, one could write an algorithm that unions
   * multiple sorted arrays at once, but that's more complex and it's not obvious that it would be
   * faster in the end. */
  for (const int64_t segment_i : sorted_segments.index_range().drop_front(2)) {
    const int16_t *a = buffer_a;
    const IndexMaskSegment b = sorted_segments[segment_i].shift(-bounds_min);
    int16_t *dst = buffer_b;
    count = std::set_union(a, a + count, b.begin(), b.end(), dst) - dst;
    /* After the swap, the current result is in #buffer_a again. */
    std::swap(buffer_a, buffer_b);
  }
  return {bounds_min, {r_values, count}};
}
/**
 * Compute a new set of indices that is the intersection of the given segments.
 *
 * \param bounds_min: Offset of the evaluated range; the returned segment uses it as its offset
 *   and the written values are relative to it.
 * \param r_values: Destination buffer for the resulting indices; must be large enough for the
 *   final result.
 */
static IndexMaskSegment intersect_index_mask_segments(const Span<IndexMaskSegment> segments,
                                                      const int64_t bounds_min,
                                                      int16_t *r_values)
{
  if (segments.is_empty()) {
    return {};
  }
  if (segments.size() == 1) {
    /* Nothing to intersect; the input segment is returned unchanged. */
    return segments[0];
  }
  if (segments.size() == 2) {
    const IndexMaskSegment a = segments[0].shift(-bounds_min);
    const IndexMaskSegment b = segments[1].shift(-bounds_min);
    const int64_t size = std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), r_values) -
                         r_values;
    return {bounds_min, {r_values, size}};
  }
  /* Intersect smaller segments first, because then the intermediate results will generally be
   * smaller. */
  Vector<IndexMaskSegment> sorted_segments(segments);
  std::sort(
      sorted_segments.begin(),
      sorted_segments.end(),
      [](const IndexMaskSegment &a, const IndexMaskSegment &b) { return a.size() < b.size(); });
  std::array<int16_t, max_segment_size> tmp_indices_1;
  std::array<int16_t, max_segment_size> tmp_indices_2;
  int16_t *buffer_a = tmp_indices_1.data();
  int16_t *buffer_b = tmp_indices_2.data();
  int64_t count = 0;
  {
    /* Initial intersection. */
    const IndexMaskSegment a = sorted_segments[0].shift(-bounds_min);
    const IndexMaskSegment b = sorted_segments[1].shift(-bounds_min);
    int16_t *dst = buffer_a;
    count = std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), dst) - dst;
  }
  /* Intersect one input into the result at a time. */
  for (const int64_t segment_i : sorted_segments.index_range().drop_front(2)) {
    const int16_t *a = buffer_a;
    const IndexMaskSegment b = sorted_segments[segment_i].shift(-bounds_min);
    /* The result of the final intersection should be written directly to #r_values to avoid an
     * additional copy in the end. */
    int16_t *dst = (segment_i == sorted_segments.size() - 1) ? r_values : buffer_b;
    count = std::set_intersection(a, a + count, b.begin(), b.end(), dst) - dst;
    std::swap(buffer_a, buffer_b);
  }
  return {bounds_min, {r_values, count}};
}
/**
 * Compute a new set of indices that is the difference between the main-segment and all the
 * subtract-segments.
 *
 * \param bounds_min: Offset of the evaluated range; the returned segment uses it as its offset
 *   and the written values are relative to it.
 * \param r_values: Destination buffer for the resulting indices; must be large enough for the
 *   final result.
 */
static IndexMaskSegment difference_index_mask_segments(
    const IndexMaskSegment main_segment,
    const Span<IndexMaskSegment> subtract_segments,
    const int64_t bounds_min,
    int16_t *r_values)
{
  if (main_segment.is_empty()) {
    return {};
  }
  if (subtract_segments.is_empty()) {
    /* Nothing to subtract; the main segment is returned unchanged. */
    return main_segment;
  }
  if (subtract_segments.size() == 1) {
    const IndexMaskSegment shifted_main_segment = main_segment.shift(-bounds_min);
    const IndexMaskSegment subtract_segment = subtract_segments[0].shift(-bounds_min);
    const int64_t size = std::set_difference(shifted_main_segment.begin(),
                                             shifted_main_segment.end(),
                                             subtract_segment.begin(),
                                             subtract_segment.end(),
                                             r_values) -
                         r_values;
    return {bounds_min, {r_values, size}};
  }
  int64_t subtract_count = 0;
  for (const IndexMaskSegment &segment : subtract_segments) {
    subtract_count += segment.size();
  }
  if (subtract_count < main_segment.size() / 2) {
    /* Can be more efficient to union all the subtract indices first before computing the
     * difference. This avoids potentially multiple larger intermediate arrays. */
    std::array<int16_t, max_segment_size> union_indices;
    const IndexMaskSegment shifted_main_segment = main_segment.shift(-bounds_min);
    const IndexMaskSegment unioned_subtract_segment =
        union_index_mask_segments(subtract_segments, bounds_min, union_indices.data())
            .shift(-bounds_min);
    const int64_t size = std::set_difference(shifted_main_segment.begin(),
                                             shifted_main_segment.end(),
                                             unioned_subtract_segment.begin(),
                                             unioned_subtract_segment.end(),
                                             r_values) -
                         r_values;
    return {bounds_min, {r_values, size}};
  }
  /* Sort larger segments to the front. This way the intermediate arrays are likely smaller. */
  Vector<IndexMaskSegment> sorted_subtract_segments(subtract_segments);
  std::sort(
      sorted_subtract_segments.begin(),
      sorted_subtract_segments.end(),
      [](const IndexMaskSegment &a, const IndexMaskSegment &b) { return a.size() > b.size(); });
  std::array<int16_t, max_segment_size> tmp_indices_1;
  std::array<int16_t, max_segment_size> tmp_indices_2;
  int16_t *buffer_a = tmp_indices_1.data();
  int16_t *buffer_b = tmp_indices_2.data();
  int64_t count = 0;
  {
    /* Initial difference. */
    const IndexMaskSegment shifted_main_segment = main_segment.shift(-bounds_min);
    const IndexMaskSegment subtract_segment = sorted_subtract_segments[0].shift(-bounds_min);
    int16_t *dst = buffer_a;
    count = std::set_difference(shifted_main_segment.begin(),
                                shifted_main_segment.end(),
                                subtract_segment.begin(),
                                subtract_segment.end(),
                                dst) -
            dst;
  }
  /* Subtract one input from the result at a time. */
  for (const int64_t segment_i : sorted_subtract_segments.index_range().drop_front(1)) {
    const IndexMaskSegment &subtract_segment = sorted_subtract_segments[segment_i].shift(
        -bounds_min);
    /* The final result should be written directly to #r_values to avoid an additional copy. */
    int16_t *dst = (segment_i == sorted_subtract_segments.size() - 1) ? r_values : buffer_b;
    count = std::set_difference(buffer_a,
                                buffer_a + count,
                                subtract_segment.begin(),
                                subtract_segment.end(),
                                dst) -
            dst;
    std::swap(buffer_a, buffer_b);
  }
  return {bounds_min, {r_values, count}};
}
/**
 * Does an exact evaluation of the expression within the given bounds. The evaluation builds on
 * top of algorithms like `std::set_union`. This approach is especially useful if the expression is
 * simple and doesn't have many intermediate values.
 */
static IndexMaskSegment evaluate_exact_with_indices(const Expr &root_expression,
                                                    LinearAllocator<> &allocator,
                                                    const IndexRange bounds,
                                                    const Span<const Expr *> eval_order)
{
  BLI_assert(bounds.size() <= max_segment_size);
  const int64_t bounds_min = bounds.start();
  const int expr_array_size = root_expression.expression_array_size();
  /* One evaluated segment per (intermediate) expression. */
  Array<IndexMaskSegment, inline_expr_array_size> results(expr_array_size);
  for (const Expr *expression : eval_order) {
    switch (expression->type) {
      case Expr::Type::Atomic: {
        const AtomicExpr &expr = expression->as_atomic();
        const IndexMask mask = expr.mask->slice_content(bounds);
        /* The caller should make sure that the bounds are aligned to segment bounds. */
        BLI_assert(mask.segments_num() <= 1);
        if (mask.segments_num() == 1) {
          results[expression->index] = mask.segment(0);
        }
        break;
      }
      case Expr::Type::Union: {
        const UnionExpr &expr = expression->as_union();
        Array<IndexMaskSegment> term_segments(expr.terms.size());
        int64_t result_size_upper_bound = 0;
        bool used_short_circuit = false;
        for (const int64_t term_i : expr.terms.index_range()) {
          const Expr &term = *expr.terms[term_i];
          const IndexMaskSegment term_segment = results[term.index];
          if (term_segment.size() == bounds.size()) {
            /* Can skip computing the union if we know that one of the inputs contains all possible
             * indices already. */
            results[expression->index] = term_segment;
            used_short_circuit = true;
            break;
          }
          term_segments[term_i] = term_segment;
          result_size_upper_bound += term_segment.size();
        }
        if (used_short_circuit) {
          break;
        }
        /* The union can't contain more indices than the bounds. */
        result_size_upper_bound = std::min(result_size_upper_bound, bounds.size());
        MutableSpan<int16_t> dst = allocator.allocate_array<int16_t>(result_size_upper_bound);
        const IndexMaskSegment result_segment = union_index_mask_segments(
            term_segments, bounds_min, dst.data());
        /* Give back the over-allocated tail that the actual result didn't need. */
        allocator.free_end_of_previous_allocation(dst.size_in_bytes(),
                                                  result_segment.base_span().end());
        results[expression->index] = result_segment;
        break;
      }
      case Expr::Type::Intersection: {
        const IntersectionExpr &expr = expression->as_intersection();
        Array<IndexMaskSegment> term_segments(expr.terms.size());
        int64_t result_size_upper_bound = bounds.size();
        bool used_short_circuit = false;
        for (const int64_t term_i : expr.terms.index_range()) {
          const Expr &term = *expr.terms[term_i];
          const IndexMaskSegment term_segment = results[term.index];
          if (term_segment.is_empty()) {
            /* Can skip computing the intersection if we know that one of the inputs is empty. */
            results[expression->index] = {};
            used_short_circuit = true;
            break;
          }
          /* The intersection can't be larger than its smallest input. */
          result_size_upper_bound = std::min(result_size_upper_bound, term_segment.size());
          term_segments[term_i] = term_segment;
        }
        if (used_short_circuit) {
          break;
        }
        MutableSpan<int16_t> dst = allocator.allocate_array<int16_t>(result_size_upper_bound);
        const IndexMaskSegment result_segment = intersect_index_mask_segments(
            term_segments, bounds_min, dst.data());
        /* Give back the over-allocated tail that the actual result didn't need. */
        allocator.free_end_of_previous_allocation(dst.size_in_bytes(),
                                                  result_segment.base_span().end());
        results[expression->index] = result_segment;
        break;
      }
      case Expr::Type::Difference: {
        const DifferenceExpr &expr = expression->as_difference();
        const Expr &main_term = *expr.terms[0];
        const IndexMaskSegment main_segment = results[main_term.index];
        if (main_segment.is_empty()) {
          /* Can skip the computation if the main segment is empty. */
          results[expression->index] = {};
          break;
        }
        int64_t result_size_upper_bound = main_segment.size();
        bool used_short_circuit = false;
        Array<IndexMaskSegment> subtract_segments(expr.terms.size() - 1);
        for (const int64_t term_i : expr.terms.index_range().drop_front(1)) {
          const Expr &subtract_term = *expr.terms[term_i];
          const IndexMaskSegment term_segment = results[subtract_term.index];
          if (term_segment.size() == bounds.size()) {
            /* Can skip computing the difference if we know that one of the subtract-terms is
             * full. */
            results[expression->index] = {};
            used_short_circuit = true;
            break;
          }
          /* The result is disjoint from the subtract-term, so it fits in the remaining space. */
          result_size_upper_bound = std::min(result_size_upper_bound,
                                             bounds.size() - term_segment.size());
          subtract_segments[term_i - 1] = term_segment;
        }
        if (used_short_circuit) {
          break;
        }
        MutableSpan<int16_t> dst = allocator.allocate_array<int16_t>(result_size_upper_bound);
        const IndexMaskSegment result_segment = difference_index_mask_segments(
            main_segment, subtract_segments, bounds_min, dst.data());
        /* Give back the over-allocated tail that the actual result didn't need. */
        allocator.free_end_of_previous_allocation(dst.size_in_bytes(),
                                                  result_segment.base_span().end());
        results[expression->index] = result_segment;
        break;
      }
    }
  }
  return results[root_expression.index];
}
/**
 * Convert the evaluated segments into plain index mask segments, from which the final index mask
 * is constructed.
 */
static Vector<IndexMaskSegment> build_result_mask_segments(
    const Span<EvaluatedSegment> evaluated_segments)
{
  const std::array<int16_t, max_segment_size> &static_indices = get_static_indices_array();
  Vector<IndexMaskSegment> segments;
  for (const EvaluatedSegment &eval_segment : evaluated_segments) {
    switch (eval_segment.type) {
      case EvaluatedSegment::Type::Full: {
        /* A full range may be larger than a single segment, so emit it in chunks that reference
         * the shared static indices array. */
        const int64_t total = eval_segment.bounds.size();
        int64_t offset = 0;
        while (offset < total) {
          const int64_t chunk_size = std::min<int64_t>(max_segment_size, total - offset);
          segments.append(IndexMaskSegment(eval_segment.bounds.first() + offset,
                                           Span(static_indices).take_front(chunk_size)));
          offset += chunk_size;
        }
        break;
      }
      case EvaluatedSegment::Type::Copy: {
        /* Reuse the segments of the copied mask restricted to the bounds. */
        const IndexMask sliced = eval_segment.copy_mask->slice_content(eval_segment.bounds);
        sliced.foreach_segment(
            [&](const IndexMaskSegment &segment) { segments.append(segment); });
        break;
      }
      case EvaluatedSegment::Type::Indices: {
        /* The exact indices have already been computed. */
        segments.append(eval_segment.indices);
        break;
      }
    }
  }
  return segments;
}
/**
 * Computes an evaluation order for the expression tree. The key invariant is that every term
 * appears in the order before any expression that uses it.
 */
static Vector<const Expr *, inline_expr_array_size> compute_eval_order(const Expr &root_expression)
{
  Vector<const Expr *, inline_expr_array_size> eval_order;
  /* A lone atomic expression has no dependencies and needs no traversal. */
  if (root_expression.type == Expr::Type::Atomic) {
    eval_order.append(&root_expression);
    return eval_order;
  }
  Array<bool, inline_expr_array_size> evaluated(root_expression.expression_array_size(), false);
  Stack<const Expr *, inline_expr_array_size> to_process;
  to_process.push(&root_expression);
  while (!to_process.is_empty()) {
    const Expr &current = *to_process.peek();
    bool &current_evaluated = evaluated[current.index];
    if (current_evaluated) {
      /* Already scheduled (can happen when an expression is referenced multiple times). */
      to_process.pop();
      continue;
    }
    bool children_done = true;
    for (const Expr *child : current.terms) {
      bool &child_evaluated = evaluated[child->index];
      if (child_evaluated) {
        continue;
      }
      if (child->type == Expr::Type::Atomic) {
        /* Atomic terms have no dependencies and can be scheduled immediately. */
        eval_order.append(child);
        child_evaluated = true;
      }
      else {
        /* Process the child first; revisit the current expression afterwards. */
        to_process.push(child);
        children_done = false;
      }
    }
    if (children_done) {
      eval_order.append(&current);
      current_evaluated = true;
      to_process.pop();
    }
  }
  return eval_order;
}
/** Uses a heuristic to decide which exact evaluation mode probably works best. */
static ExactEvalMode determine_exact_eval_mode(const Expr &root_expression)
{
  /* Use bits when there are nested expressions as this is often faster. */
  const bool has_nested_terms = std::any_of(
      root_expression.terms.begin(), root_expression.terms.end(), [](const Expr *term) {
        return !term->terms.is_empty();
      });
  return has_nested_terms ? ExactEvalMode::Bits : ExactEvalMode::Indices;
}
static void evaluate_coarse_and_split_until_segments_are_short(
const Expr &root_expression,
const Span<const Expr *> eval_order,
Vector<EvaluatedSegment, 16> &r_evaluated_segments,
Vector<IndexRange, 16> &r_short_unknown_segments)
{
/* Coarse evaluation splits the full range into segments. Long segments are split up and get
* another coarse evaluation. Short segments will be evaluated exactly. */
Stack<IndexRange, 16> long_unknown_segments;
/* The point at which a range starts being "short". */
const int64_t coarse_segment_size_threshold = max_segment_size;
/* Checks the coarse results and inserts its segments into either `long_unknown_segments` for
* further coarse evaluation, `r_short_unknown_segments` for exact evaluation or
* `r_evaluated_segments` if no further evaluation is necessary. */
auto handle_coarse_result = [&](const CoarseResult &coarse_result) {
for (const CoarseSegment &segment : coarse_result.segments) {
switch (segment.type) {
case CoarseSegment::Type::Unknown: {
if (segment.bounds.size() > coarse_segment_size_threshold) {
long_unknown_segments.push(segment.bounds);
}
else {
r_short_unknown_segments.append(segment.bounds);
}
break;
}
case CoarseSegment::Type::Copy: {
BLI_assert(segment.mask);
r_evaluated_segments.append(
{EvaluatedSegment::Type::Copy, segment.bounds, segment.mask});
break;
}
case CoarseSegment::Type::Full: {
r_evaluated_segments.append({EvaluatedSegment::Type::Full, segment.bounds});
break;
}
}
}
};
/* Initial coarse evaluation without any explicit bounds. The bounds are implied by the index
* masks used in the expression. */
const CoarseResult initial_coarse_result = evaluate_coarse(root_expression, eval_order);
handle_coarse_result(initial_coarse_result);
/* Do coarse evaluation until all unknown segments are short enough to do exact evaluation. */
while (!long_unknown_segments.is_empty()) {
const IndexRange unknown_bounds = long_unknown_segments.pop();
const int64_t split_pos = unknown_bounds.size() / 2;
const IndexRange left_half = unknown_bounds.take_front(split_pos);
const IndexRange right_half = unknown_bounds.drop_front(split_pos);
const CoarseResult left_result = evaluate_coarse(root_expression, eval_order, left_half);
const CoarseResult right_result = evaluate_coarse(root_expression, eval_order, right_half);
handle_coarse_result(left_result);
handle_coarse_result(right_result);
}
}
/**
 * Evaluates the expression exactly within each of the given short segments and appends the
 * resulting index segments to #r_evaluated_segments. Multi-threading is used when there are
 * enough segments to outweigh the threading overhead.
 */
static void evaluate_short_unknown_segments_exactly(
    const Expr &root_expression,
    const ExactEvalMode exact_eval_mode,
    const Span<const Expr *> eval_order,
    const Span<IndexRange> short_unknown_segments,
    IndexMaskMemory &memory,
    Vector<EvaluatedSegment, 16> &r_evaluated_segments)
{
  /* Evaluate a segment exactly. */
  auto evaluate_unknown_segment = [&](const IndexRange bounds,
                                      LinearAllocator<> &allocator,
                                      Vector<EvaluatedSegment, 16> &r_local_evaluated_segments) {
    /* Use the predetermined evaluation mode. */
    switch (exact_eval_mode) {
      case ExactEvalMode::Bits: {
        const IndexMaskSegment indices = evaluate_exact_with_bits(
            root_expression, allocator, bounds, eval_order);
        /* Empty results are dropped; they would not contribute to the final mask. */
        if (!indices.is_empty()) {
          r_local_evaluated_segments.append(
              {EvaluatedSegment::Type::Indices, bounds, nullptr, indices});
        }
        break;
      }
      case ExactEvalMode::Indices: {
        /* #evaluate_exact_with_indices requires that all index masks have a single segment in the
         * provided bounds. So split up the range into sub-ranges first if necessary. */
        Vector<int64_t, 16> split_indices;
        /* Always adding the beginning and end of the bounds simplifies the code below. */
        split_indices.extend({bounds.first(), bounds.one_after_last()});
        for (const int64_t eval_order_i : eval_order.index_range()) {
          const Expr &expr = *eval_order[eval_order_i];
          if (expr.type != Expr::Type::Atomic) {
            continue;
          }
          const AtomicExpr &atomic_expr = expr.as_atomic();
          const IndexMask mask = atomic_expr.mask->slice_content(bounds);
          const int64_t segments_num = mask.segments_num();
          if (segments_num <= 1) {
            /* This mask only has a single segment in the bounds anyway, so no extra split-position
             * is necessary. */
            continue;
          }
          /* Split at the beginning of each segment. Skipping the first, because that does not need
           * an extra split position. Alternatively, one could also split at the end of each
           * segment except the last one. It doesn't matter much. */
          for (const int64_t segment_i : IndexRange(segments_num).drop_front(1)) {
            const IndexMaskSegment segment = mask.segment(segment_i);
            split_indices.append(segment[0]);
          }
        }
        /* Sort so that consecutive split positions form the sub-ranges processed below. */
        std::sort(split_indices.begin(), split_indices.end());
        for (const int64_t boundary_i : split_indices.index_range().drop_back(1)) {
          const IndexRange sub_bounds = IndexRange::from_begin_end(split_indices[boundary_i],
                                                                   split_indices[boundary_i + 1]);
          /* Duplicate split positions lead to empty sub-ranges which can be skipped. */
          if (sub_bounds.is_empty()) {
            continue;
          }
          const IndexMaskSegment indices = evaluate_exact_with_indices(
              root_expression, allocator, sub_bounds, eval_order);
          if (!indices.is_empty()) {
            r_local_evaluated_segments.append(
                {EvaluatedSegment::Type::Indices, sub_bounds, nullptr, indices});
          }
        }
        break;
      }
    }
  };
  /* Decide whether multi-threading should be used or not. There is some extra overhead even when
   * just attempting to use multi-threading. */
  const int64_t unknown_segment_eval_grain_size = 8;
  if (short_unknown_segments.size() < unknown_segment_eval_grain_size) {
    /* Few segments: evaluate single-threaded, allocating directly from #memory. */
    for (const IndexRange &bounds : short_unknown_segments) {
      evaluate_unknown_segment(bounds, memory, r_evaluated_segments);
    }
  }
  else {
    /* Do exact evaluation in multiple threads. The allocators and evaluated segments created by
     * each thread are merged in the end. */
    struct LocalData {
      LinearAllocator<> allocator;
      Vector<EvaluatedSegment, 16> evaluated_segments;
    };
    threading::EnumerableThreadSpecific<LocalData> data_by_thread;
    threading::parallel_for(short_unknown_segments.index_range(),
                            unknown_segment_eval_grain_size,
                            [&](const IndexRange range) {
                              LocalData &data = data_by_thread.local();
                              for (const IndexRange &bounds : short_unknown_segments.slice(range))
                              {
                                evaluate_unknown_segment(
                                    bounds, data.allocator, data.evaluated_segments);
                              }
                            });
    for (LocalData &data : data_by_thread) {
      if (!data.evaluated_segments.is_empty()) {
        r_evaluated_segments.extend(data.evaluated_segments);
        /* Keep the thread-local allocations alive for as long as the result mask by moving
         * ownership of them into #memory. */
        memory.transfer_ownership_from(data.allocator);
      }
    }
  }
}
/**
 * Builds the final index mask from the (possibly unsorted) evaluated segments.
 */
static IndexMask evaluated_segments_to_index_mask(MutableSpan<EvaluatedSegment> evaluated_segments,
                                                  IndexMaskMemory &memory)
{
  if (evaluated_segments.is_empty()) {
    return {};
  }
  if (evaluated_segments.size() == 1) {
    /* A single segment can be converted directly, without sorting. */
    const EvaluatedSegment &segment = evaluated_segments[0];
    switch (segment.type) {
      case EvaluatedSegment::Type::Full: {
        return IndexMask(IndexRange(segment.bounds));
      }
      case EvaluatedSegment::Type::Copy: {
        return segment.copy_mask->slice_content(segment.bounds);
      }
      case EvaluatedSegment::Type::Indices: {
        return IndexMask::from_segments({segment.indices}, memory);
      }
    }
  }
  /* The segments may have been produced in arbitrary order, so sort them by their start. */
  std::sort(evaluated_segments.begin(),
            evaluated_segments.end(),
            [](const EvaluatedSegment &a, const EvaluatedSegment &b) {
              return a.bounds.start() < b.bounds.start();
            });
  const Vector<IndexMaskSegment> result_segments = build_result_mask_segments(evaluated_segments);
  return IndexMask::from_segments(result_segments, memory);
}
/**
 * Evaluates the expression in three phases: coarse evaluation, exact evaluation of the remaining
 * short unknown segments, and construction of the result mask.
 */
static IndexMask evaluate_expression_impl(const Expr &root_expression,
                                          IndexMaskMemory &memory,
                                          const ExactEvalMode exact_eval_mode)
{
  /* The evaluation order is computed once up-front because it is potentially needed many times
   * below. */
  const Vector<const Expr *, inline_expr_array_size> order = compute_eval_order(root_expression);
  /* Non-overlapping evaluated segments which become the resulting index mask. Note that these
   * segments are only sorted at the very end. */
  Vector<EvaluatedSegment, 16> segments;
  Vector<IndexRange, 16> unknown_segments;
  /* Phase 1: coarse evaluation with recursive splitting. */
  evaluate_coarse_and_split_until_segments_are_short(
      root_expression, order, segments, unknown_segments);
  /* Phase 2: exact evaluation of the short unknown segments. */
  evaluate_short_unknown_segments_exactly(
      root_expression, exact_eval_mode, order, unknown_segments, memory, segments);
  /* Phase 3: build the final mask. */
  return evaluated_segments_to_index_mask(segments, memory);
}
/**
 * Public entry point: evaluates the expression into an index mask.
 */
IndexMask evaluate_expression(const Expr &expression, IndexMaskMemory &memory)
{
  const ExactEvalMode eval_mode = determine_exact_eval_mode(expression);
  IndexMask result = evaluate_expression_impl(expression, memory, eval_mode);
#ifndef NDEBUG
  {
    /* In debug builds, also evaluate with the other exact mode and check that both agree. */
    const ExactEvalMode alternative_mode = (eval_mode == ExactEvalMode::Bits) ?
                                               ExactEvalMode::Indices :
                                               ExactEvalMode::Bits;
    IndexMask alternative_result = evaluate_expression_impl(expression, memory, alternative_mode);
    BLI_assert(result == alternative_result);
  }
#endif
  return result;
}
/**
 * Builds a union expression from the given terms.
 */
const UnionExpr &ExprBuilder::merge(const Span<Term> terms)
{
  /* Convert all terms into expressions first. */
  Vector<const Expr *> exprs;
  for (const Term &term : terms) {
    exprs.append(&this->term_to_expr(term));
  }
  UnionExpr &union_expr = scope_.construct<UnionExpr>();
  union_expr.type = Expr::Type::Union;
  union_expr.index = expr_count_++;
  union_expr.terms = std::move(exprs);
  return union_expr;
}
/**
 * Builds a difference expression: all #subtract_terms are removed from #main_term.
 */
const DifferenceExpr &ExprBuilder::subtract(const Term &main_term, const Span<Term> subtract_terms)
{
  /* The first stored term is the main term; all following terms are subtracted from it. */
  Vector<const Expr *> exprs;
  exprs.append(&this->term_to_expr(main_term));
  for (const Term &subtract_term : subtract_terms) {
    exprs.append(&this->term_to_expr(subtract_term));
  }
  DifferenceExpr &difference_expr = scope_.construct<DifferenceExpr>();
  difference_expr.type = Expr::Type::Difference;
  difference_expr.index = expr_count_++;
  difference_expr.terms = std::move(exprs);
  return difference_expr;
}
/**
 * Builds an intersection expression from the given terms.
 */
const IntersectionExpr &ExprBuilder::intersect(const Span<Term> terms)
{
  Vector<const Expr *> term_expressions;
  for (const Term &term : terms) {
    term_expressions.append(&this->term_to_expr(term));
  }
  IntersectionExpr &expr = scope_.construct<IntersectionExpr>();
  expr.type = Expr::Type::Intersection;
  /* Assign (not `+=`) the next unique expression index. Using `+=` would depend on the previous
   * value of `index` and is inconsistent with #merge and #subtract. */
  expr.index = expr_count_++;
  expr.terms = std::move(term_expressions);
  return expr;
}
/**
 * Returns an expression for the given term, wrapping masks and ranges in new atomic expressions.
 */
const Expr &ExprBuilder::term_to_expr(const Term &term)
{
  /* If the term already is an expression, no new expression has to be created. */
  if (const Expr *const *existing = std::get_if<const Expr *>(&term)) {
    return **existing;
  }
  AtomicExpr &atomic_expr = scope_.construct<AtomicExpr>();
  atomic_expr.type = Expr::Type::Atomic;
  atomic_expr.index = expr_count_++;
  if (const IndexRange *range = std::get_if<IndexRange>(&term)) {
    /* Wrap the range in an index mask that lives in the builder's scope. */
    atomic_expr.mask = &scope_.construct<IndexMask>(*range);
  }
  else {
    atomic_expr.mask = std::get<const IndexMask *>(term);
  }
  return atomic_expr;
}
} // namespace blender::index_mask