From ee1fa8e1ca62506a6dd6fd5d053648569c672faf Mon Sep 17 00:00:00 2001
From: Jacques Lucke
Date: Sun, 17 Mar 2024 09:52:32 +0100
Subject: [PATCH] BLI: support set operations on index masks

The `IndexMask` data structure was designed to allow us to implement set
operations like `union`, `intersection` and `difference` efficiently
(2cfcb8b0b805401a0ffb252fd41750fadb0392ab). This patch adds an evaluator for
arbitrary expressions built from these operations.

The evaluator makes use of the design of the `IndexMask` data structure to be
quite efficient. In some common cases it runs in constant time, so it is very
fast even if the mask contains many millions of indices. If possible, the
evaluator works on entire segments at once instead of looking at the
individual indices. This results in a very low constant factor even if the
evaluation time is linear. If the evaluator does have to look at the
individual indices to perform an operation, it can make use of
multi-threading.

The evaluation consists of the following steps:
1. A coarse evaluation that looks at entire segments at once.
2. All segments that couldn't be fully evaluated by the coarse evaluation are
   evaluated exactly by looking at the actual indices. There are two
   evaluators for this case: one is based on `std::set_union` etc.; the other
   first converts the index masks to bit spans, evaluates the expression with
   bit operations, and then converts the bits back into indices. Depending on
   the expression, one or the other is more efficient.
3. Construct an index mask from the evaluated segments.

Showing the performance of the evaluator is difficult because it depends
heavily on the input data. Comparing the performance to something that does
not short-circuit when there are full ranges is meaningless, because one can
construct an example where the new evaluator is arbitrarily faster. I'm still
working on a case where performance can be compared to e.g. using
`std::set_union`. This comparison is only fair when the input data is
constructed so that the new evaluator can't short-circuit.

One of the main remaining bottlenecks is the calls to `slice_content` on large
index masks. I think the impact of those can still be reduced.

We are not using this evaluator much yet, except through
`IndexMask::complement` calls. I intend to use it when I get to refactoring
the field evaluator for geometry nodes to optimize the evaluation of
selections.
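For illustration only (not part of the patch), here is a minimal usage sketch
of the new API, based on the `ExprBuilder` and `evaluate_expression()`
declarations added in `BLI_index_mask_expression.hh` below. The masks `a`, `b`
and `c` are hypothetical inputs, and the sketch assumes that
`ExprBuilder::Term` accepts both `const IndexMask *` and `const Expr *`:

  #include "BLI_index_mask_expression.hh"

  using namespace blender::index_mask;

  /* Sketch: evaluate (a union b) minus c with the expression evaluator. */
  static IndexMask union_without_c(const IndexMask &a,
                                   const IndexMask &b,
                                   const IndexMask &c,
                                   IndexMaskMemory &memory)
  {
    ExprBuilder builder;
    /* The builder owns the intermediate expression nodes. */
    const UnionExpr &a_or_b = builder.merge({&a, &b});
    const DifferenceExpr &expr = builder.subtract(&a_or_b, {&c});
    /* Coarse and, where necessary, exact evaluation happen here; the
     * resulting mask is allocated from `memory`. */
    return evaluate_expression(expr, memory);
  }

The patch itself uses the same pattern internally: `IndexMask::complement`
becomes `builder.subtract(&universe_mask, {this})` and `IndexMask::from_union`
becomes `builder.merge({&mask_a, &mask_b})`.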
Pull Request: https://projects.blender.org/blender/blender/pulls/117805 --- source/blender/blenlib/BLI_index_mask.hh | 24 +- .../blenlib/BLI_index_mask_expression.hh | 94 ++ source/blender/blenlib/BLI_index_range.hh | 8 + .../blender/blenlib/BLI_linear_allocator.hh | 32 + source/blender/blenlib/CMakeLists.txt | 3 + source/blender/blenlib/intern/index_mask.cc | 204 +-- .../blenlib/intern/index_mask_expression.cc | 1360 +++++++++++++++++ .../tests/BLI_index_mask_expression_test.cc | 269 ++++ 8 files changed, 1818 insertions(+), 176 deletions(-) create mode 100644 source/blender/blenlib/BLI_index_mask_expression.hh create mode 100644 source/blender/blenlib/intern/index_mask_expression.cc create mode 100644 source/blender/blenlib/tests/BLI_index_mask_expression_test.cc diff --git a/source/blender/blenlib/BLI_index_mask.hh b/source/blender/blenlib/BLI_index_mask.hh index 9ec96385339..7d5ae8a7ad3 100644 --- a/source/blender/blenlib/BLI_index_mask.hh +++ b/source/blender/blenlib/BLI_index_mask.hh @@ -128,6 +128,12 @@ class IndexMaskSegment : public OffsetSpan { IndexMaskSegment slice(const IndexRange &range) const; IndexMaskSegment slice(const int64_t start, const int64_t size) const; + + /** + * Get a new segment where each index is modified by the given amount. This works in constant + * time, because only the offset value is changed. + */ + IndexMaskSegment shift(const int64_t shift) const; }; /** @@ -423,7 +429,7 @@ class IndexMask : private IndexMaskData { /** * Set the bits at indices in the mask to 1 and all other bits to 0. */ - void to_bits(MutableBitSpan r_bits) const; + void to_bits(MutableBitSpan r_bits, int64_t offset = 0) const; /** * Set the bools at indices in the mask to true and all others to false. */ @@ -534,6 +540,16 @@ inline void masked_fill(MutableSpan data, const T &value, const IndexMask &ma */ template void build_reverse_map(const IndexMask &mask, MutableSpan r_map); +/** + * Joins segments together based on heuristics. Generally, one wants as few segments as possible, + * but one also wants full-range-segments if possible and we don't want to copy too many indices + * around to reduce the number of segments. + * + * \return Number of consolidated segments. Those are ordered to the beginning of the span. 
+ */ +int64_t consolidate_index_mask_segments(MutableSpan segments, + IndexMaskMemory &memory); + /* -------------------------------------------------------------------- */ /** \name #RawMaskIterator Inline Methods * \{ */ @@ -568,6 +584,12 @@ inline IndexMaskSegment IndexMaskSegment::slice(const int64_t start, const int64 static_cast *>(this)->slice(start, size)); } +inline IndexMaskSegment IndexMaskSegment::shift(const int64_t shift) const +{ + BLI_assert(this->is_empty() || (*this)[0] + shift >= 0); + return IndexMaskSegment(this->offset() + shift, this->base_span()); +} + /* -------------------------------------------------------------------- */ /** \name #IndexMask Inline Methods * \{ */ diff --git a/source/blender/blenlib/BLI_index_mask_expression.hh b/source/blender/blenlib/BLI_index_mask_expression.hh new file mode 100644 index 00000000000..a11875349f7 --- /dev/null +++ b/source/blender/blenlib/BLI_index_mask_expression.hh @@ -0,0 +1,94 @@ +/* SPDX-FileCopyrightText: 2024 Blender Authors + * + * SPDX-License-Identifier: GPL-2.0-or-later */ + +#pragma once + +#include "BLI_index_mask.hh" +#include "BLI_resource_scope.hh" + +namespace blender::index_mask { + +struct AtomicExpr; +struct UnionExpr; +struct IntersectionExpr; +struct DifferenceExpr; + +struct Expr { + enum class Type { + Atomic, + Union, + Intersection, + Difference, + }; + + Type type; + int index; + Vector terms; + + int expression_array_size() const; + + const AtomicExpr &as_atomic() const; + const UnionExpr &as_union() const; + const IntersectionExpr &as_intersection() const; + const DifferenceExpr &as_difference() const; +}; + +struct AtomicExpr : public Expr { + const IndexMask *mask; +}; + +struct UnionExpr : public Expr {}; + +struct IntersectionExpr : public Expr {}; + +struct DifferenceExpr : public Expr {}; + +class ExprBuilder { + private: + ResourceScope scope_; + int expr_count_ = 0; + + public: + using Term = std::variant; + + const UnionExpr &merge(const Span terms); + const DifferenceExpr &subtract(const Term &main_term, const Span subtract_terms); + const IntersectionExpr &intersect(const Span terms); + + private: + const Expr &term_to_expr(const Term &term); +}; + +IndexMask evaluate_expression(const Expr &expression, IndexMaskMemory &memory); + +inline int Expr::expression_array_size() const +{ + return this->index + 1; +} + +inline const AtomicExpr &Expr::as_atomic() const +{ + BLI_assert(this->type == Type::Atomic); + return static_cast(*this); +} + +inline const UnionExpr &Expr::as_union() const +{ + BLI_assert(this->type == Type::Union); + return static_cast(*this); +} + +inline const IntersectionExpr &Expr::as_intersection() const +{ + BLI_assert(this->type == Type::Intersection); + return static_cast(*this); +} + +inline const DifferenceExpr &Expr::as_difference() const +{ + BLI_assert(this->type == Type::Difference); + return static_cast(*this); +} + +} // namespace blender::index_mask diff --git a/source/blender/blenlib/BLI_index_range.hh b/source/blender/blenlib/BLI_index_range.hh index 59a70fe96be..933be715f75 100644 --- a/source/blender/blenlib/BLI_index_range.hh +++ b/source/blender/blenlib/BLI_index_range.hh @@ -167,6 +167,14 @@ class IndexRange { return size_ == 0; } + /** + * Creates a new index range with the same beginning but a different end. + */ + constexpr IndexRange with_new_end(const int64_t new_end) const + { + return IndexRange::from_begin_end(start_, new_end); + } + /** * Create a new range starting at the end of the current one. 
*/ diff --git a/source/blender/blenlib/BLI_linear_allocator.hh b/source/blender/blenlib/BLI_linear_allocator.hh index 36151998e8c..5c8cb344367 100644 --- a/source/blender/blenlib/BLI_linear_allocator.hh +++ b/source/blender/blenlib/BLI_linear_allocator.hh @@ -213,6 +213,38 @@ template class LinearAllocator : NonCopya this->provide_buffer(aligned_buffer.ptr(), Size); } + /** + * Some algorithms can be implemented more efficiently by over-allocating the destination memory + * a bit. This allows the algorithm not to worry about having enough memory. Generally, this can + * be a useful strategy if the actual required memory is not known in advance, but an upper bound + * can be found. Ideally, one can free the over-allocated memory in the end again to reduce + * memory consumption. + * + * A linear allocator generally does not allow freeing any memory. However, there is one exception. + * One can free the end of the last allocation (but not any previous allocation). While uses of + * this approach are quite limited, it's still the best option in some situations. + */ + void free_end_of_previous_allocation(const int64_t original_allocation_size, + const void *free_after) + { + /* If the original allocation size was large, it might have been separately allocated. In this + * case, we can't free the end of it anymore. */ + if (original_allocation_size <= large_buffer_threshold) { + const int64_t new_begin = uintptr_t(free_after); + BLI_assert(new_begin <= current_begin_); +#ifndef NDEBUG + /* This condition is not really necessary but it helps find the cases where memory was + * freed. */ + const int64_t freed_bytes_num = current_begin_ - new_begin; + if (freed_bytes_num > 0) { + current_begin_ = new_begin; + } +#else + current_begin_ = new_begin; +#endif + } + } + /** * This allocator takes ownership of the buffers owned by `other`. Therefor, when `other` is * destructed, memory allocated using it is not freed. diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt index 230e69c122b..e8ef92025c4 100644 --- a/source/blender/blenlib/CMakeLists.txt +++ b/source/blender/blenlib/CMakeLists.txt @@ -82,6 +82,7 @@ set(SRC intern/hash_tables.cc intern/implicit_sharing.cc intern/index_mask.cc + intern/index_mask_expression.cc intern/index_range.cc intern/jitter_2d.c intern/kdtree_1d.c @@ -252,6 +253,7 @@ set(SRC BLI_implicit_sharing_ptr.hh BLI_index_mask.hh BLI_index_mask_fwd.hh + BLI_index_mask_expression.hh BLI_index_range.hh BLI_inplace_priority_queue.hh BLI_iterator.h @@ -516,6 +518,7 @@ if(WITH_GTESTS) tests/BLI_heap_test.cc tests/BLI_implicit_sharing_test.cc tests/BLI_index_mask_test.cc + tests/BLI_index_mask_expression_test.cc tests/BLI_index_range_test.cc tests/BLI_inplace_priority_queue_test.cc tests/BLI_kdopbvh_test.cc diff --git a/source/blender/blenlib/intern/index_mask.cc b/source/blender/blenlib/intern/index_mask.cc index 6c1a290dfde..d5039cc63d7 100644 --- a/source/blender/blenlib/intern/index_mask.cc +++ b/source/blender/blenlib/intern/index_mask.cc @@ -10,6 +10,7 @@ #include "BLI_bit_vector.hh" #include "BLI_enumerable_thread_specific.hh" #include "BLI_index_mask.hh" +#include "BLI_index_mask_expression.hh" #include "BLI_math_base.hh" #include "BLI_set.hh" #include "BLI_sort.hh" @@ -211,15 +212,11 @@ IndexMask IndexMask::shift(const int64_t offset, IndexMaskMemory &memory) const return shifted_mask; } -/** - * Merges consecutive segments in some cases.
Having fewer but larger segments generally allows for - * better performance when using the mask later on. - */ -static void consolidate_segments(Vector &segments, - IndexMaskMemory & /*memory*/) +int64_t consolidate_index_mask_segments(MutableSpan segments, + IndexMaskMemory & /*memory*/) { if (segments.is_empty()) { - return; + return 0; } const Span static_indices = get_static_indices_array(); @@ -268,7 +265,13 @@ static void consolidate_segments(Vector &segments, finish_group(segments.size() - 1); /* Remove all segments that have been merged into previous segments. */ - segments.remove_if([](const IndexMaskSegment segment) { return segment.is_empty(); }); + const int64_t new_segments_num = std::remove_if(segments.begin(), + segments.end(), + [](const IndexMaskSegment segment) { + return segment.is_empty(); + }) - + segments.begin(); + return new_segments_num; } IndexMask IndexMask::from_segments(const Span segments, IndexMaskMemory &memory) @@ -389,162 +392,12 @@ struct ParallelSegmentsCollector { } }; -/** - * Convert a range to potentially multiple index mask segments. - */ -static void range_to_segments(const IndexRange range, Vector &r_segments) -{ - const Span static_indices = get_static_indices_array(); - for (int64_t start = 0; start < range.size(); start += max_segment_size) { - const int64_t size = std::min(max_segment_size, range.size() - start); - r_segments.append_as(range.start() + start, static_indices.take_front(size)); - } -} - -static int64_t get_size_before_gap(const Span indices) -{ - BLI_assert(indices.size() >= 2); - if (indices[1] > indices[0] + 1) { - /* For sparse indices, often the next gap is just after the next index. - * In this case we can skip the logarithmic check below. */ - return 1; - } - return unique_sorted_indices::find_size_of_next_range(indices); -} - -static void inverted_indices_to_segments(const IndexMaskSegment segment, - LinearAllocator<> &allocator, - Vector &r_segments) -{ - constexpr int64_t range_threshold = 64; - const int64_t offset = segment.offset(); - const Span static_indices = get_static_indices_array(); - - int64_t inverted_index_count = 0; - std::array inverted_indices_array; - auto add_indices = [&](const int16_t start, const int16_t num) { - int16_t *new_indices_begin = inverted_indices_array.data() + inverted_index_count; - std::iota(new_indices_begin, new_indices_begin + num, start); - inverted_index_count += num; - }; - - auto finish_indices = [&]() { - if (inverted_index_count == 0) { - return; - } - MutableSpan offset_indices = allocator.allocate_array(inverted_index_count); - offset_indices.copy_from(Span(inverted_indices_array).take_front(inverted_index_count)); - r_segments.append_as(offset, offset_indices); - inverted_index_count = 0; - }; - - Span indices = segment.base_span(); - while (indices.size() > 1) { - const int64_t size_before_gap = get_size_before_gap(indices); - if (size_before_gap == indices.size()) { - break; - } - - const int16_t gap_first = indices[size_before_gap - 1] + 1; - const int16_t next = indices[size_before_gap]; - const int16_t gap_size = next - gap_first; - if (gap_size > range_threshold) { - finish_indices(); - r_segments.append_as(offset + gap_first, static_indices.take_front(gap_size)); - } - else { - add_indices(gap_first, gap_size); - } - - indices = indices.drop_front(size_before_gap); - } - - finish_indices(); -} - -static void invert_segments(const IndexMask &mask, - const IndexRange segment_range, - LinearAllocator<> &allocator, - Vector &r_segments) -{ - for (const int64_t segment_i 
: segment_range) { - const IndexMaskSegment segment = mask.segment(segment_i); - inverted_indices_to_segments(segment, allocator, r_segments); - - const IndexMaskSegment next_segment = mask.segment(segment_i + 1); - const int64_t between_start = segment.last() + 1; - const int64_t size_between_segments = next_segment[0] - segment.last() - 1; - const IndexRange range_between_segments(between_start, size_between_segments); - if (!range_between_segments.is_empty()) { - range_to_segments(range_between_segments, r_segments); - } - } -} - IndexMask IndexMask::complement(const IndexRange universe, IndexMaskMemory &memory) const { - if (this->is_empty()) { - return universe; - } - if (universe.is_empty()) { - return {}; - } - const std::optional this_range = this->to_range(); - if (this_range) { - const bool first_in_range = this_range->first() <= universe.first(); - const bool last_in_range = this_range->last() >= universe.last(); - if (first_in_range && last_in_range) { - /* This mask fills the entire universe, so the complement is empty. */ - return {}; - } - if (first_in_range) { - /* This mask is a range that contains the start of the universe. - * The complement is a range that contains the end of the universe. */ - return IndexRange::from_begin_end(this_range->one_after_last(), universe.one_after_last()); - } - if (last_in_range) { - /* This mask is a range that contains the end of the universe. - * The complement is a range that contains the start of the universe. */ - return IndexRange::from_begin_end(universe.first(), this_range->first()); - } - } - - Vector segments; - - if (universe.start() < this->first()) { - range_to_segments(universe.take_front(this->first() - universe.start()), segments); - } - - if (!this_range) { - const int64_t segments_num = this->segments_num(); - - constexpr int64_t min_grain_size = 16; - constexpr int64_t max_grain_size = 4096; - const int64_t threads_num = BLI_system_thread_count(); - const int64_t grain_size = std::clamp( - segments_num / threads_num, min_grain_size, max_grain_size); - - const IndexRange non_last_segments = IndexRange(segments_num).drop_back(1); - if (segments_num < min_grain_size) { - invert_segments(*this, non_last_segments, memory, segments); - } - else { - ParallelSegmentsCollector segments_collector; - threading::parallel_for(non_last_segments, grain_size, [&](const IndexRange range) { - ParallelSegmentsCollector::LocalData &local_data = - segments_collector.data_by_thread.local(); - invert_segments(*this, range, local_data.allocator, local_data.segments); - }); - segments_collector.reduce(memory, segments); - } - inverted_indices_to_segments(this->segment(segments_num - 1), memory, segments); - } - - if (universe.last() > this->first()) { - range_to_segments(universe.take_back(universe.last() - this->last()), segments); - } - - return IndexMask::from_segments(segments, memory); + ExprBuilder builder; + const IndexMask universe_mask{universe}; + const Expr &expr = builder.subtract(&universe_mask, {this}); + return evaluate_expression(expr, memory); } template @@ -580,7 +433,8 @@ IndexMask IndexMask::from_indices(const Span indices, IndexMaskMemory &memory }); segments_collector.reduce(memory, segments); } - consolidate_segments(segments, memory); + const int64_t consolidated_segments_num = consolidate_index_mask_segments(segments, memory); + segments.resize(consolidated_segments_num); return IndexMask::from_segments(segments, memory); } @@ -636,13 +490,9 @@ IndexMask IndexMask::from_union(const IndexMask &mask_a, const IndexMask 
&mask_b, IndexMaskMemory &memory) { - const int64_t new_size = math::max(mask_a.min_array_size(), mask_b.min_array_size()); - Array tmp(new_size, false); - mask_a.foreach_index_optimized(GrainSize(2048), - [&](const int64_t i) { tmp[i] = true; }); - mask_b.foreach_index_optimized(GrainSize(2048), - [&](const int64_t i) { tmp[i] = true; }); - return IndexMask::from_bools(tmp, memory); + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_a, &mask_b}); + return evaluate_expression(expr, memory); } IndexMask IndexMask::from_initializers(const Span initializers, @@ -684,17 +534,20 @@ template void IndexMask::to_indices(MutableSpan r_indices) const }); } -void IndexMask::to_bits(MutableBitSpan r_bits) const +void IndexMask::to_bits(MutableBitSpan r_bits, const int64_t offset) const { - BLI_assert(r_bits.size() >= this->min_array_size()); + BLI_assert(r_bits.size() >= this->min_array_size() + offset); r_bits.reset_all(); this->foreach_segment_optimized([&](const auto segment) { if constexpr (std::is_same_v, IndexRange>) { const IndexRange range = segment; - r_bits.slice(range).set_all(); + const IndexRange shifted_range = range.shift(offset); + r_bits.slice(shifted_range).set_all(); } else { - for (const int64_t i : segment) { + const IndexMaskSegment indices = segment; + const IndexMaskSegment shifted_indices = indices.shift(offset); + for (const int64_t i : shifted_indices) { r_bits[i].set(); } } @@ -785,7 +638,8 @@ IndexMask from_predicate_impl( segments_collector.reduce(memory, segments); } - consolidate_segments(segments, memory); + const int64_t consolidated_segments_num = consolidate_index_mask_segments(segments, memory); + segments.resize(consolidated_segments_num); return IndexMask::from_segments(segments, memory); } } // namespace detail diff --git a/source/blender/blenlib/intern/index_mask_expression.cc b/source/blender/blenlib/intern/index_mask_expression.cc new file mode 100644 index 00000000000..4252d912d32 --- /dev/null +++ b/source/blender/blenlib/intern/index_mask_expression.cc @@ -0,0 +1,1360 @@ +/* SPDX-FileCopyrightText: 2024 Blender Authors + * + * SPDX-License-Identifier: GPL-2.0-or-later */ + +/** + * Expression evaluation has multiple phases: + * 1. A coarse evaluation that tries to find segments which can be trivially evaluated. For + * example, taking the union of two overlapping ranges can be done in O(1) time. + * 2. For all segments which can't be fully evaluated using coarse evaluation, an exact evaluation + * is done. This uses either an index-based or bit-based approach depending on a heuristic. + * 3. Construct the final index mask based on the resulting intermediate segments. + */ + +#include "BLI_array.hh" +#include "BLI_bit_group_vector.hh" +#include "BLI_bit_span_ops.hh" +#include "BLI_enumerable_thread_specific.hh" +#include "BLI_index_mask_expression.hh" +#include "BLI_stack.hh" +#include "BLI_strict_flags.h" +#include "BLI_task.hh" +#include "BLI_timeit.hh" + +namespace blender::index_mask { + +/** + * Number of expression terms which don't require extra allocations in some places. + */ +constexpr int64_t inline_expr_array_size = 16; + +/** + * The result of the coarse evaluation for a specific index range. + */ +struct CoarseSegment { + enum class Type { + /** + * Coarse evaluation couldn't fully resolve this segment. The segment requires another + * evaluation that is more detailed. + */ + Unknown, + /** All indices in the segment are part of the result. 
*/ + Full, + /** The evaluated result of this segment is just the copy of an input index mask. */ + Copy, + }; + Type type = Type::Unknown; + IndexRange bounds; + /** Mask used when the type is #Copy. */ + const IndexMask *mask = nullptr; +}; + +/** Contains the result of a coarse evaluation split into potentially many segments. */ +struct CoarseResult { + Vector segments; +}; + +/** Used during coarse evaluation to split the full range into multiple segments. */ +struct CourseBoundary { + /** + * The position of the boundary. The boundary is right before this index. So if this boundary is + * a beginning of a segment, the index marks the first element. If it is the end, the index marks + * the one-after-last position. + */ + int64_t index; + /** Whether this boundary is the beginning or end of the segment below. */ + bool is_begin; + /** The segment this boundary comes from. */ + const CoarseSegment *segment; +}; + +/** For the difference operation, we need to know if a boundary belongs to the main term or not. */ +struct DifferenceCourseBoundary : public CourseBoundary { + bool is_main; +}; + +/** + * Result of the expression evaluation within a specific index range. Sometimes this can be derived + * directly from the coarse evaluation, but sometimes an additional exact evaluation is necessary. + */ +struct EvaluatedSegment { + enum class Type { + /** All indices in this segment are part of the evaluated index mask. */ + Full, + /** The result in this segment is the same as what is contained in the #copy_mask below. */ + Copy, + /** The result comes from exact evaluation and is a new set of indices. */ + Indices, + }; + + Type type = Type::Indices; + IndexRange bounds; + /** Only used when the type is #Type::Copy. */ + const IndexMask *copy_mask = nullptr; + /** Only used when the type is #Type::Indices. */ + IndexMaskSegment indices; +}; + +/** + * There are different ways to do the exact evaluation. Depending on the expression or data, one + * or the other is more efficient. + */ +enum class ExactEvalMode { + /** + * Does the evaluation by working directly with arrays of sorted indices. This is usually best + * when the expression does not have intermediate results, i.e. it is very simple. + */ + Indices, + /** + * The evaluation works with bits. There is extra overhead to convert the input masks to bit + * arrays and to convert the final result back into indices. In exchange, the actual expression + * evaluation is significantly cheaper because it's just a bunch of bit operations. For larger + * expressions, this is typically much more efficient. + */ + Bits, +}; + +static void sort_course_boundaries(MutableSpan boundaries) +{ + std::sort(boundaries.begin(), + boundaries.end(), + [](const CourseBoundary &a, const CourseBoundary &b) { return a.index < b.index; }); +} + +static void sort_course_boundaries(MutableSpan boundaries) +{ + std::sort(boundaries.begin(), + boundaries.end(), + [](const DifferenceCourseBoundary &a, const DifferenceCourseBoundary &b) { + return a.index < b.index; + }); +} + +/** Smaller segments should generally be merged together. */ +static constexpr int64_t segment_size_threshold = 32; + +/** Extends a previous full segment or appends a new one. 
*/ +static CoarseSegment &add_coarse_segment__full(CoarseSegment *prev_segment, + const int64_t prev_boundary_index, + const int64_t current_boundary_index, + CoarseResult &result) +{ + const int64_t size = current_boundary_index - prev_boundary_index; + if (prev_segment) { + if (prev_segment->type == CoarseSegment::Type::Full && + prev_segment->bounds.one_after_last() == prev_boundary_index) + { + prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index); + return *prev_segment; + } + if (current_boundary_index - prev_segment->bounds.start() < max_segment_size) { + if (prev_segment->bounds.size() + size < segment_size_threshold) { + /* Extend the previous segment because it's so small and change it into an unknown one. */ + prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index); + prev_segment->type = CoarseSegment::Type::Unknown; + return *prev_segment; + } + } + } + result.segments.append( + {CoarseSegment::Type::Full, IndexRange::from_begin_size(prev_boundary_index, size)}); + return result.segments.last(); +} + +/** Extends a previous unknown segment or appends a new one. */ +static CoarseSegment &add_coarse_segment__unknown(CoarseSegment *prev_segment, + const int64_t prev_boundary_index, + const int64_t current_boundary_index, + CoarseResult &result) +{ + if (prev_segment) { + if (prev_segment->bounds.start() + segment_size_threshold >= prev_boundary_index) { + /* The previous segment is very short, so extend it. */ + prev_segment->type = CoarseSegment::Type::Unknown; + prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index); + return *prev_segment; + } + } + result.segments.append( + {CoarseSegment::Type::Unknown, + IndexRange::from_begin_end(prev_boundary_index, current_boundary_index)}); + return result.segments.last(); +} + +/** Extends a previous copy segment or appends a new one. */ +static CoarseSegment &add_coarse_segment__copy(CoarseSegment *prev_segment, + const int64_t prev_boundary_index, + const int64_t current_boundary_index, + const IndexMask ©_from_mask, + CoarseResult &result) +{ + if (prev_segment) { + if (prev_segment->type == CoarseSegment::Type::Copy && + prev_segment->bounds.one_after_last() == prev_boundary_index && + prev_segment->mask == ©_from_mask) + { + /* Can extend the previous copy segment. */ + prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index); + return *prev_segment; + } + if (prev_segment->bounds.start() + segment_size_threshold >= current_boundary_index) { + /* The previous and this segment together are very short, so better merge them together. */ + prev_segment->bounds = prev_segment->bounds.with_new_end(current_boundary_index); + prev_segment->type = CoarseSegment::Type::Unknown; + return *prev_segment; + } + } + result.segments.append({CoarseSegment::Type::Copy, + IndexRange::from_begin_end(prev_boundary_index, current_boundary_index), + ©_from_mask}); + return result.segments.last(); +} + +static void evaluate_coarse_union(const Span boundaries, CoarseResult &r_result) +{ + if (boundaries.is_empty()) { + return; + } + + CoarseResult &result = r_result; + CoarseSegment *prev_segment = nullptr; + Vector active_segments; + int64_t prev_boundary_index = boundaries[0].index; + + for (const CourseBoundary &boundary : boundaries) { + if (prev_boundary_index < boundary.index) { + /* Compute some properties of the input segments that were active between the current and the + * previous boundary. 
*/ + bool has_full = false; + bool has_unknown = false; + bool copy_from_single_mask = true; + const IndexMask *copy_from_mask = nullptr; + for (const CoarseSegment *active_segment : active_segments) { + switch (active_segment->type) { + case CoarseSegment::Type::Unknown: { + has_unknown = true; + break; + } + case CoarseSegment::Type::Full: { + has_full = true; + break; + } + case CoarseSegment::Type::Copy: { + if (copy_from_mask != nullptr && copy_from_mask != active_segment->mask) { + copy_from_single_mask = false; + } + copy_from_mask = active_segment->mask; + break; + } + } + } + /* Determine the resulting coarse segment type based on the properties computed above. */ + if (has_full) { + prev_segment = &add_coarse_segment__full( + prev_segment, prev_boundary_index, boundary.index, result); + } + else if (has_unknown || !copy_from_single_mask) { + prev_segment = &add_coarse_segment__unknown( + prev_segment, prev_boundary_index, boundary.index, result); + } + else if (copy_from_mask != nullptr && copy_from_single_mask) { + prev_segment = &add_coarse_segment__copy( + prev_segment, prev_boundary_index, boundary.index, *copy_from_mask, result); + } + + prev_boundary_index = boundary.index; + } + + /* Update active segments. */ + if (boundary.is_begin) { + active_segments.append(boundary.segment); + } + else { + active_segments.remove_first_occurrence_and_reorder(boundary.segment); + } + } +} + +static void evaluate_coarse_intersection(const Span boundaries, + const int64_t terms_num, + CoarseResult &r_result) +{ + if (boundaries.is_empty()) { + return; + } + + CoarseResult &result = r_result; + CoarseSegment *prev_segment = nullptr; + Vector active_segments; + int64_t prev_boundary_index = boundaries[0].index; + + for (const CourseBoundary &boundary : boundaries) { + if (prev_boundary_index < boundary.index) { + /* Only if one segment of each term is active, it's possible that the output contains + * anything. */ + if (active_segments.size() == terms_num) { + /* Compute some properties of the input segments that were active between the current and + * previous boundary. */ + int full_count = 0; + int unknown_count = 0; + int copy_count = 0; + bool copy_from_single_mask = true; + const IndexMask *copy_from_mask = nullptr; + for (const CoarseSegment *active_segment : active_segments) { + switch (active_segment->type) { + case CoarseSegment::Type::Unknown: { + unknown_count++; + break; + } + case CoarseSegment::Type::Full: { + full_count++; + break; + } + case CoarseSegment::Type::Copy: { + copy_count++; + if (copy_from_mask != nullptr && copy_from_mask != active_segment->mask) { + copy_from_single_mask = false; + } + copy_from_mask = active_segment->mask; + break; + } + } + } + /* Determine the resulting coarse segment type based on the properties computed above. */ + BLI_assert(full_count + unknown_count + copy_count == terms_num); + if (full_count == terms_num) { + prev_segment = &add_coarse_segment__full( + prev_segment, prev_boundary_index, boundary.index, result); + } + else if (unknown_count > 0 || copy_count < terms_num || !copy_from_single_mask) { + prev_segment = &add_coarse_segment__unknown( + prev_segment, prev_boundary_index, boundary.index, result); + } + else if (copy_count == terms_num && copy_from_single_mask) { + prev_segment = &add_coarse_segment__copy( + prev_segment, prev_boundary_index, boundary.index, *copy_from_mask, result); + } + } + + prev_boundary_index = boundary.index; + } + + /* Update active segments. 
*/ + if (boundary.is_begin) { + active_segments.append(boundary.segment); + } + else { + active_segments.remove_first_occurrence_and_reorder(boundary.segment); + } + } +} + +static void evaluate_coarse_difference(const Span boundaries, + CoarseResult &r_result) +{ + if (boundaries.is_empty()) { + return; + } + + CoarseResult &result = r_result; + CoarseSegment *prev_segment = nullptr; + Vector active_main_segments; + Vector active_subtract_segments; + int64_t prev_boundary_index = boundaries[0].index; + + for (const DifferenceCourseBoundary &boundary : boundaries) { + if (prev_boundary_index < boundary.index) { + /* There is only one main term, so at most one main segment can be active at once. */ + BLI_assert(active_main_segments.size() <= 1); + if (active_main_segments.size() == 1) { + const CoarseSegment &active_main_segment = *active_main_segments[0]; + /* Compute some properties of the input segments that were active between the current and + * the previous boundary. */ + bool has_subtract_full = false; + bool has_subtract_same_mask = false; + for (const CoarseSegment *active_subtract_segment : active_subtract_segments) { + switch (active_subtract_segment->type) { + case CoarseSegment::Type::Unknown: { + break; + } + case CoarseSegment::Type::Full: { + has_subtract_full = true; + break; + } + case CoarseSegment::Type::Copy: { + if (active_main_segment.type == CoarseSegment::Type::Copy) { + if (active_main_segment.mask == active_subtract_segment->mask) { + has_subtract_same_mask = true; + } + } + break; + } + } + } + /* Determine the resulting coarse segment type based on the properties computed above. */ + if (has_subtract_full) { + /* Do nothing, the resulting segment is empty for the current range. */ + } + else { + switch (active_main_segment.type) { + case CoarseSegment::Type::Unknown: { + prev_segment = &add_coarse_segment__unknown( + prev_segment, prev_boundary_index, boundary.index, result); + break; + } + case CoarseSegment::Type::Full: { + if (active_subtract_segments.is_empty()) { + prev_segment = &add_coarse_segment__full( + prev_segment, prev_boundary_index, boundary.index, result); + } + else { + prev_segment = &add_coarse_segment__unknown( + prev_segment, prev_boundary_index, boundary.index, result); + } + break; + } + case CoarseSegment::Type::Copy: { + if (active_subtract_segments.is_empty()) { + prev_segment = &add_coarse_segment__copy(prev_segment, + prev_boundary_index, + boundary.index, + *active_main_segment.mask, + result); + } + else if (has_subtract_same_mask) { + /* Do nothing, subtracting a mask from itself results in an empty mask. */ + } + else { + prev_segment = &add_coarse_segment__unknown( + prev_segment, prev_boundary_index, boundary.index, result); + } + break; + } + } + } + } + + prev_boundary_index = boundary.index; + } + + /* Update active segments. */ + if (boundary.is_main) { + if (boundary.is_begin) { + active_main_segments.append(boundary.segment); + } + else { + active_main_segments.remove_first_occurrence_and_reorder(boundary.segment); + } + } + else { + if (boundary.is_begin) { + active_subtract_segments.append(boundary.segment); + } + else { + active_subtract_segments.remove_first_occurrence_and_reorder(boundary.segment); + } + } + } +} + +/** + * The coarse evaluation only looks at the index masks as a whole within the given bounds. This + * limitation allows it to do many operations in constant time independent of the number of indices + * within each mask. 
For example, it can detect that two full index masks that overlap result in a + * new full index mask when the union or intersection is computed. + * + * For more complex index masks, coarse evaluation outputs segments with type + * #CoarseSegment::Type::Unknown. Those segments can be evaluated in more detail afterwards. + * + * \param root_expression: Expression to be evaluated. + * \param eval_order: Pre-computed evaluation order. All children of a term must come before + * the term itself. + * \param eval_bounds: If given, the evaluation is restricted to those bounds. Otherwise, the full + * referenced masks are used. + */ +static CoarseResult evaluate_coarse(const Expr &root_expression, + const Span eval_order, + const std::optional eval_bounds = std::nullopt) +{ + /* An expression result for each intermediate expression. */ + Array, inline_expr_array_size> expression_results( + root_expression.expression_array_size()); + + /* Process expressions in a pre-determined order. */ + for (const Expr *expression : eval_order) { + CoarseResult &expr_result = expression_results[expression->index].emplace(); + switch (expression->type) { + case Expr::Type::Atomic: { + const AtomicExpr &expr = expression->as_atomic(); + + IndexMask mask; + if (eval_bounds.has_value()) { + mask = expr.mask->slice_content(*eval_bounds); + } + else { + mask = *expr.mask; + } + + if (!mask.is_empty()) { + const IndexRange bounds = mask.bounds(); + if (const std::optional range = mask.to_range()) { + expr_result.segments.append({CoarseSegment::Type::Full, bounds}); + } + else { + expr_result.segments.append({CoarseSegment::Type::Copy, bounds, expr.mask}); + } + } + break; + } + case Expr::Type::Union: { + const UnionExpr &expr = expression->as_union(); + Vector boundaries; + for (const Expr *term : expr.terms) { + const CoarseResult &term_result = *expression_results[term->index]; + for (const CoarseSegment &segment : term_result.segments) { + boundaries.append({segment.bounds.first(), true, &segment}); + boundaries.append({segment.bounds.one_after_last(), false, &segment}); + } + } + sort_course_boundaries(boundaries); + evaluate_coarse_union(boundaries, expr_result); + break; + } + case Expr::Type::Intersection: { + const IntersectionExpr &expr = expression->as_intersection(); + Vector boundaries; + for (const Expr *term : expr.terms) { + const CoarseResult &term_result = *expression_results[term->index]; + for (const CoarseSegment &segment : term_result.segments) { + boundaries.append({segment.bounds.first(), true, &segment}); + boundaries.append({segment.bounds.one_after_last(), false, &segment}); + } + } + sort_course_boundaries(boundaries); + evaluate_coarse_intersection(boundaries, expr.terms.size(), expr_result); + break; + } + case Expr::Type::Difference: { + const DifferenceExpr &expr = expression->as_difference(); + Vector boundaries; + const CoarseResult &main_term_result = *expression_results[expr.terms[0]->index]; + for (const CoarseSegment &segment : main_term_result.segments) { + boundaries.append({{segment.bounds.first(), true, &segment}, true}); + boundaries.append({{segment.bounds.one_after_last(), false, &segment}, true}); + } + for (const Expr *term : expr.terms.as_span().drop_front(1)) { + const CoarseResult &term_result = *expression_results[term->index]; + for (const CoarseSegment &segment : term_result.segments) { + boundaries.append({{segment.bounds.first(), true, &segment}, false}); + boundaries.append({{segment.bounds.one_after_last(), false, &segment}, false}); + } + } + 
sort_course_boundaries(boundaries); + evaluate_coarse_difference(boundaries, expr_result); + break; + } + } + } + + CoarseResult &final_result = *expression_results[root_expression.index]; + return std::move(final_result); +} + +static Span bits_to_indices(const BoundedBitSpan bits, LinearAllocator<> &allocator) +{ + /* TODO: Could first count the number of set bits. */ + Vector indices_vec; + bits::foreach_1_index(bits, [&](const int64_t i) { + BLI_assert(i < max_segment_size); + indices_vec.append_unchecked(int16_t(i)); + }); + return allocator.construct_array_copy(indices_vec); +} + +/** + * Does an exact evaluation of the expression within the given bounds. The evaluation generally + * works in three steps: + * 1. Convert input indices into bit spans. + * 2. Use bit operations to evaluate the expression. + * 3. Convert resulting bit span back to indices. + * + * The trade-off here is that the actual expression evaluation is much faster but the conversions + * take some extra time. Therefore, this approach is best when the evaluation would otherwise take + * longer than the conversions which is usually the case for non-trivial expressions. + */ +static IndexMaskSegment evaluate_exact_with_bits(const Expr &root_expression, + LinearAllocator<> &allocator, + const IndexRange bounds, + const Span eval_order) +{ + BLI_assert(bounds.size() <= max_segment_size); + const int64_t bounds_min = bounds.start(); + const int expr_array_size = root_expression.expression_array_size(); + + /* Make bit span sizes a multiple of `BitsPerInt`. This allows the bit-wise operations to run a + * bit more efficiently, because only full integers are processed. */ + const int64_t ints_in_bounds = ceil_division(bounds.size(), bits::BitsPerInt); + BitGroupVector<16 * 1024> expression_results( + expr_array_size, ints_in_bounds * bits::BitsPerInt, false); + + for (const Expr *expression : eval_order) { + MutableBoundedBitSpan expr_result = expression_results[expression->index]; + switch (expression->type) { + case Expr::Type::Atomic: { + const AtomicExpr &expr = expression->as_atomic(); + const IndexMask mask = expr.mask->slice_content(bounds); + mask.to_bits(expr_result, -bounds_min); + break; + } + case Expr::Type::Union: { + for (const Expr *term : expression->terms) { + expr_result |= expression_results[term->index]; + } + break; + } + case Expr::Type::Intersection: { + bits::copy_from_or(expr_result, expression_results[expression->terms[0]->index]); + for (const Expr *term : expression->terms.as_span().drop_front(1)) { + expr_result &= expression_results[term->index]; + } + break; + } + case Expr::Type::Difference: { + bits::copy_from_or(expr_result, expression_results[expression->terms[0]->index]); + for (const Expr *term : expression->terms.as_span().drop_front(1)) { + bits::mix_into_first_expr( + [](const bits::BitInt a, const bits::BitInt b) { return a & ~b; }, + expr_result, + expression_results[term->index]); + } + break; + } + } + } + const BoundedBitSpan final_bits = expression_results[root_expression.index]; + const Span indices = bits_to_indices(final_bits, allocator); + return IndexMaskSegment(bounds_min, indices); +} + +/** Compute a new set of indices that is the union of the given segments. 
*/ +static IndexMaskSegment union_index_mask_segments(const Span segments, + const int64_t bounds_min, + int16_t *r_values) +{ + if (segments.is_empty()) { + return {}; + } + if (segments.size() == 1) { + return segments[0]; + } + if (segments.size() == 2) { + const IndexMaskSegment a = segments[0].shift(-bounds_min); + const IndexMaskSegment b = segments[1].shift(-bounds_min); + const int64_t size = std::set_union(a.begin(), a.end(), b.begin(), b.end(), r_values) - + r_values; + return {bounds_min, {r_values, size}}; + } + + /* Sort input segments by their size, so that smaller segments are unioned first. This results in + * smaller intermediate arrays and thus less work overall. */ + Vector sorted_segments(segments); + std::sort( + sorted_segments.begin(), + sorted_segments.end(), + [](const IndexMaskSegment &a, const IndexMaskSegment &b) { return a.size() < b.size(); }); + + std::array tmp_indices; + /* Can use r_values for temporary values because if it's large enough for the final result, it's + * also large enough for intermediate results. */ + int16_t *buffer_a = r_values; + int16_t *buffer_b = tmp_indices.data(); + + if (sorted_segments.size() % 2 == 1) { + /* Swap buffers so that the result is in #r_values in the end. */ + std::swap(buffer_a, buffer_b); + } + + int64_t count = 0; + { + /* Initial union. */ + const IndexMaskSegment a = sorted_segments[0].shift(-bounds_min); + const IndexMaskSegment b = sorted_segments[1].shift(-bounds_min); + int16_t *dst = buffer_a; + count = std::set_union(a.begin(), a.end(), b.begin(), b.end(), dst) - dst; + } + + /* Union one input into the result at a time. In theory, one could write an algorithm that unions + * multiple sorted arrays at once, but that's more complex and it's not obvious that it would be + * faster in the end. */ + for (const int64_t segment_i : sorted_segments.index_range().drop_front(2)) { + const int16_t *a = buffer_a; + const IndexMaskSegment b = sorted_segments[segment_i].shift(-bounds_min); + int16_t *dst = buffer_b; + count = std::set_union(a, a + count, b.begin(), b.end(), dst) - dst; + std::swap(buffer_a, buffer_b); + } + return {bounds_min, {r_values, count}}; +} + +/** Compute a new set of indices that is the intersection of the given segments. */ +static IndexMaskSegment intersect_index_mask_segments(const Span segments, + const int64_t bounds_min, + int16_t *r_values) +{ + if (segments.is_empty()) { + return {}; + } + if (segments.size() == 1) { + return segments[0]; + } + if (segments.size() == 2) { + const IndexMaskSegment a = segments[0].shift(-bounds_min); + const IndexMaskSegment b = segments[1].shift(-bounds_min); + const int64_t size = std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), r_values) - + r_values; + return {bounds_min, {r_values, size}}; + } + + /* Intersect smaller segments first, because then the intermediate results will generally be + * smaller. */ + Vector sorted_segments(segments); + std::sort( + sorted_segments.begin(), + sorted_segments.end(), + [](const IndexMaskSegment &a, const IndexMaskSegment &b) { return a.size() < b.size(); }); + + std::array tmp_indices_1; + std::array tmp_indices_2; + int16_t *buffer_a = tmp_indices_1.data(); + int16_t *buffer_b = tmp_indices_2.data(); + + int64_t count = 0; + { + /* Initial intersection. 
*/ + const IndexMaskSegment a = sorted_segments[0].shift(-bounds_min); + const IndexMaskSegment b = sorted_segments[1].shift(-bounds_min); + int16_t *dst = buffer_a; + count = std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), dst) - dst; + } + + for (const int64_t segment_i : sorted_segments.index_range().drop_front(2)) { + const int16_t *a = buffer_a; + const IndexMaskSegment b = sorted_segments[segment_i].shift(-bounds_min); + /* The result of the final intersection should be written directly to #r_values to avoid an + * additional copy in the end. */ + int16_t *dst = (segment_i == sorted_segments.size() - 1) ? r_values : buffer_b; + count = std::set_intersection(a, a + count, b.begin(), b.end(), dst) - dst; + std::swap(buffer_a, buffer_b); + } + return {bounds_min, {r_values, count}}; +} + +/** + * Compute a new set of indices that is the difference between the main-segment and all the + * subtract-segments. + */ +static IndexMaskSegment difference_index_mask_segments( + const IndexMaskSegment main_segment, + const Span subtract_segments, + const int64_t bounds_min, + int16_t *r_values) +{ + if (main_segment.is_empty()) { + return {}; + } + if (subtract_segments.is_empty()) { + return main_segment; + } + if (subtract_segments.size() == 1) { + const IndexMaskSegment shifted_main_segment = main_segment.shift(-bounds_min); + const IndexMaskSegment subtract_segment = subtract_segments[0].shift(-bounds_min); + const int64_t size = std::set_difference(shifted_main_segment.begin(), + shifted_main_segment.end(), + subtract_segment.begin(), + subtract_segment.end(), + r_values) - + r_values; + return {bounds_min, {r_values, size}}; + } + + int64_t subtract_count = 0; + for (const IndexMaskSegment &segment : subtract_segments) { + subtract_count += segment.size(); + } + if (subtract_count < main_segment.size() / 2) { + /* Can be more efficient to union all the subtract indices first before computing the + * difference. This avoids potentially multiple larger intermediate arrays. */ + std::array union_indices; + const IndexMaskSegment shifted_main_segment = main_segment.shift(-bounds_min); + const IndexMaskSegment unioned_subtract_segment = + union_index_mask_segments(subtract_segments, bounds_min, union_indices.data()) + .shift(-bounds_min); + const int64_t size = std::set_difference(shifted_main_segment.begin(), + shifted_main_segment.end(), + unioned_subtract_segment.begin(), + unioned_subtract_segment.end(), + r_values) - + r_values; + return {bounds_min, {r_values, size}}; + } + + /* Sort larger segments to the front. This way the intermediate arrays are likely smaller. */ + Vector sorted_subtract_segments(subtract_segments); + std::sort( + sorted_subtract_segments.begin(), + sorted_subtract_segments.end(), + [](const IndexMaskSegment &a, const IndexMaskSegment &b) { return a.size() > b.size(); }); + + std::array tmp_indices_1; + std::array tmp_indices_2; + int16_t *buffer_a = tmp_indices_1.data(); + int16_t *buffer_b = tmp_indices_2.data(); + + int64_t count = 0; + { + /* Initial difference. 
*/ + const IndexMaskSegment shifted_main_segment = main_segment.shift(-bounds_min); + const IndexMaskSegment subtract_segment = sorted_subtract_segments[0].shift(-bounds_min); + int16_t *dst = buffer_a; + count = std::set_difference(shifted_main_segment.begin(), + shifted_main_segment.end(), + subtract_segment.begin(), + subtract_segment.end(), + dst) - + dst; + } + + for (const int64_t segment_i : sorted_subtract_segments.index_range().drop_front(1)) { + const IndexMaskSegment &subtract_segment = sorted_subtract_segments[segment_i].shift( + -bounds_min); + /* The final result should be written directly to #r_values to avoid an additional copy. */ + int16_t *dst = (segment_i == sorted_subtract_segments.size() - 1) ? r_values : buffer_b; + count = std::set_difference(buffer_a, + buffer_a + count, + subtract_segment.begin(), + subtract_segment.end(), + dst) - + dst; + std::swap(buffer_a, buffer_b); + } + return {bounds_min, {r_values, count}}; +} + +/** + * Does an exact evaluation of the expression with in the given bounds. The evaluation builds on + * top of algorithms like `std::set_union`. This approach is especially useful if the expression is + * simple and doesn't have many intermediate values. + */ +static IndexMaskSegment evaluate_exact_with_indices(const Expr &root_expression, + LinearAllocator<> &allocator, + const IndexRange bounds, + const Span eval_order) +{ + BLI_assert(bounds.size() <= max_segment_size); + const int64_t bounds_min = bounds.start(); + const int expr_array_size = root_expression.expression_array_size(); + Array results(expr_array_size); + for (const Expr *expression : eval_order) { + switch (expression->type) { + case Expr::Type::Atomic: { + const AtomicExpr &expr = expression->as_atomic(); + const IndexMask mask = expr.mask->slice_content(bounds); + /* The caller should make sure that the bounds are aligned to segment bounds. */ + BLI_assert(mask.segments_num() <= 1); + if (mask.segments_num() == 1) { + results[expression->index] = mask.segment(0); + } + break; + } + case Expr::Type::Union: { + const UnionExpr &expr = expression->as_union(); + Array term_segments(expr.terms.size()); + int64_t result_size_upper_bound = 0; + bool used_short_circuit = false; + for (const int64_t term_i : expr.terms.index_range()) { + const Expr &term = *expr.terms[term_i]; + const IndexMaskSegment term_segment = results[term.index]; + if (term_segment.size() == bounds.size()) { + /* Can skip computing the union if we know that one of the inputs contains all possible + * indices already. 
*/ + results[expression->index] = term_segment; + used_short_circuit = true; + break; + } + term_segments[term_i] = term_segment; + result_size_upper_bound += term_segment.size(); + } + if (used_short_circuit) { + break; + } + result_size_upper_bound = std::min(result_size_upper_bound, bounds.size()); + MutableSpan dst = allocator.allocate_array(result_size_upper_bound); + const IndexMaskSegment result_segment = union_index_mask_segments( + term_segments, bounds_min, dst.data()); + allocator.free_end_of_previous_allocation(dst.size_in_bytes(), + result_segment.base_span().end()); + results[expression->index] = result_segment; + break; + } + case Expr::Type::Intersection: { + const IntersectionExpr &expr = expression->as_intersection(); + Array term_segments(expr.terms.size()); + int64_t result_size_upper_bound = bounds.size(); + bool used_short_circuit = false; + for (const int64_t term_i : expr.terms.index_range()) { + const Expr &term = *expr.terms[term_i]; + const IndexMaskSegment term_segment = results[term.index]; + if (term_segment.is_empty()) { + /* Can skip computing the intersection if we know that one of the inputs is empty. */ + results[expression->index] = {}; + used_short_circuit = true; + break; + } + result_size_upper_bound = std::min(result_size_upper_bound, term_segment.size()); + term_segments[term_i] = term_segment; + } + if (used_short_circuit) { + break; + } + MutableSpan dst = allocator.allocate_array(result_size_upper_bound); + const IndexMaskSegment result_segment = intersect_index_mask_segments( + term_segments, bounds_min, dst.data()); + allocator.free_end_of_previous_allocation(dst.size_in_bytes(), + result_segment.base_span().end()); + results[expression->index] = result_segment; + break; + } + case Expr::Type::Difference: { + const DifferenceExpr &expr = expression->as_difference(); + const Expr &main_term = *expr.terms[0]; + const IndexMaskSegment main_segment = results[main_term.index]; + if (main_segment.is_empty()) { + /* Can skip the computation if the main segment is empty. */ + results[expression->index] = {}; + break; + } + int64_t result_size_upper_bound = main_segment.size(); + bool used_short_circuit = false; + Array subtract_segments(expr.terms.size() - 1); + for (const int64_t term_i : expr.terms.index_range().drop_front(1)) { + const Expr &subtract_term = *expr.terms[term_i]; + const IndexMaskSegment term_segment = results[subtract_term.index]; + if (term_segment.size() == bounds.size()) { + /* Can skip computing the difference if we know that one of the subtract-terms is + * full. */ + results[expression->index] = {}; + used_short_circuit = true; + break; + } + result_size_upper_bound = std::min(result_size_upper_bound, + bounds.size() - term_segment.size()); + subtract_segments[term_i - 1] = term_segment; + } + if (used_short_circuit) { + break; + } + MutableSpan dst = allocator.allocate_array(result_size_upper_bound); + const IndexMaskSegment result_segment = difference_index_mask_segments( + main_segment, subtract_segments, bounds_min, dst.data()); + allocator.free_end_of_previous_allocation(dst.size_in_bytes(), + result_segment.base_span().end()); + results[expression->index] = result_segment; + break; + } + } + } + return results[root_expression.index]; +} + +/** + * Turn the evaluated segments into index mask segments that are then used to initialize the + * resulting index mask. 
+ */ +static Vector build_result_mask_segments( + const Span evaluated_segments) +{ + const std::array &static_indices_array = get_static_indices_array(); + + Vector result_mask_segments; + for (const EvaluatedSegment &evaluated_segment : evaluated_segments) { + switch (evaluated_segment.type) { + case EvaluatedSegment::Type::Full: { + const int64_t full_size = evaluated_segment.bounds.size(); + for (int64_t i = 0; i < full_size; i += max_segment_size) { + const int64_t size = std::min(i + max_segment_size, full_size) - i; + result_mask_segments.append(IndexMaskSegment( + evaluated_segment.bounds.first() + i, Span(static_indices_array).take_front(size))); + } + break; + } + case EvaluatedSegment::Type::Copy: { + const IndexMask sliced_mask = evaluated_segment.copy_mask->slice_content( + evaluated_segment.bounds); + sliced_mask.foreach_segment( + [&](const IndexMaskSegment &segment) { result_mask_segments.append(segment); }); + break; + } + case EvaluatedSegment::Type::Indices: { + result_mask_segments.append(evaluated_segment.indices); + break; + } + } + } + return result_mask_segments; +} + +/** + * Computes an evaluation order of the expression. The important aspect is that all child terms + * come before the term that uses them. + */ +static Vector compute_eval_order(const Expr &root_expression) +{ + Vector eval_order; + if (root_expression.type == Expr::Type::Atomic) { + eval_order.append(&root_expression); + return eval_order; + } + + Array is_evaluated_states(root_expression.expression_array_size(), + false); + Stack expr_stack; + expr_stack.push(&root_expression); + + while (!expr_stack.is_empty()) { + const Expr &expression = *expr_stack.peek(); + bool &is_evaluated = is_evaluated_states[expression.index]; + if (is_evaluated) { + expr_stack.pop(); + continue; + } + bool all_terms_evaluated = true; + for (const Expr *term : expression.terms) { + bool &term_evaluated = is_evaluated_states[term->index]; + if (!term_evaluated) { + if (term->type == Expr::Type::Atomic) { + eval_order.append(term); + term_evaluated = true; + } + else { + expr_stack.push(term); + all_terms_evaluated = false; + } + } + } + if (all_terms_evaluated) { + eval_order.append(&expression); + is_evaluated = true; + expr_stack.pop(); + } + } + + return eval_order; +} + +/** Uses a heuristic to decide which exact evaluation mode probably works best. */ +static ExactEvalMode determine_exact_eval_mode(const Expr &root_expression) +{ + for (const Expr *term : root_expression.terms) { + if (!term->terms.is_empty()) { + /* Use bits when there are nested expressions as this is often faster. */ + return ExactEvalMode::Bits; + } + } + return ExactEvalMode::Indices; +} + +static void evaluate_coarse_and_split_until_segments_are_short( + const Expr &root_expression, + const Span eval_order, + Vector &r_evaluated_segments, + Vector &r_short_unknown_segments) +{ + /* Coarse evaluation splits the full range into segments. Long segments are split up and get + * another coarse evaluation. Short segments will be evaluated exactly. */ + Stack long_unknown_segments; + + /* The point at which a range starts being "short". */ + const int64_t coarse_segment_size_threshold = max_segment_size; + + /* Checks the coarse results and inserts its segments into either `long_unknown_segments` for + * further coarse evaluation, `r_short_unknown_segments` for exact evaluation or + * `r_evaluated_segments` if no further evaluation is necessary. 
*/ + auto handle_coarse_result = [&](const CoarseResult &coarse_result) { + for (const CoarseSegment &segment : coarse_result.segments) { + switch (segment.type) { + case CoarseSegment::Type::Unknown: { + if (segment.bounds.size() > coarse_segment_size_threshold) { + long_unknown_segments.push(segment.bounds); + } + else { + r_short_unknown_segments.append(segment.bounds); + } + break; + } + case CoarseSegment::Type::Copy: { + BLI_assert(segment.mask); + r_evaluated_segments.append( + {EvaluatedSegment::Type::Copy, segment.bounds, segment.mask}); + break; + } + case CoarseSegment::Type::Full: { + r_evaluated_segments.append({EvaluatedSegment::Type::Full, segment.bounds}); + break; + } + } + } + }; + + /* Initial coarse evaluation without any explicit bounds. The bounds are implied by the index + * masks used in the expression. */ + const CoarseResult initial_coarse_result = evaluate_coarse(root_expression, eval_order); + handle_coarse_result(initial_coarse_result); + + /* Do coarse evaluation until all unknown segments are short enough to do exact evaluation. */ + while (!long_unknown_segments.is_empty()) { + const IndexRange unknown_bounds = long_unknown_segments.pop(); + const int64_t split_pos = unknown_bounds.size() / 2; + const IndexRange left_half = unknown_bounds.take_front(split_pos); + const IndexRange right_half = unknown_bounds.drop_front(split_pos); + const CoarseResult left_result = evaluate_coarse(root_expression, eval_order, left_half); + const CoarseResult right_result = evaluate_coarse(root_expression, eval_order, right_half); + handle_coarse_result(left_result); + handle_coarse_result(right_result); + } +} + +static void evaluate_short_unknown_segments_exactly( + const Expr &root_expression, + const ExactEvalMode exact_eval_mode, + const Span eval_order, + const Span short_unknown_segments, + IndexMaskMemory &memory, + Vector &r_evaluated_segments) +{ + /* Evaluate a segment exactly. */ + auto evaluate_unknown_segment = [&](const IndexRange bounds, + LinearAllocator<> &allocator, + Vector &r_local_evaluated_segments) { + /* Use the predetermined evaluation mode. */ + switch (exact_eval_mode) { + case ExactEvalMode::Bits: { + const IndexMaskSegment indices = evaluate_exact_with_bits( + root_expression, allocator, bounds, eval_order); + if (!indices.is_empty()) { + r_local_evaluated_segments.append( + {EvaluatedSegment::Type::Indices, bounds, nullptr, indices}); + } + break; + } + case ExactEvalMode::Indices: { + /* #evaluate_exact_with_indices requires that all index masks have a single segment in the + * provided bounds. So split up the range into subranges first if necessary. */ + Vector split_indices; + /* Always adding the beginning and end of the bounds simplifies the code below. */ + split_indices.extend({bounds.first(), bounds.one_after_last()}); + for (const int64_t eval_order_i : eval_order.index_range()) { + const Expr &expr = *eval_order[eval_order_i]; + if (expr.type != Expr::Type::Atomic) { + continue; + } + const AtomicExpr &atomic_expr = expr.as_atomic(); + const IndexMask mask = atomic_expr.mask->slice_content(bounds); + const int64_t segments_num = mask.segments_num(); + if (segments_num <= 1) { + /* This mask only has a single segment in the bounds anyway, so no extra split-position + * is necessary. */ + continue; + } + /* Split at the beginning of each segment. Skipping the first, because that does not need + * an extra split position. Alternatively, one could also split at the end of each + * segment except the last one. It doesn't matter much. 
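+ * For example, if one of the input masks has two segments within the bounds and the second + * segment starts at index 100, then 100 is added as a split position and the bounds are + * evaluated as the two sub-ranges [bounds.first(), 100) and [100, bounds.one_after_last()).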
*/ + for (const int64_t segment_i : IndexRange(segments_num).drop_front(1)) { + const IndexMaskSegment segment = mask.segment(segment_i); + split_indices.append(segment[0]); + } + } + std::sort(split_indices.begin(), split_indices.end()); + for (const int64_t boundary_i : split_indices.index_range().drop_back(1)) { + const IndexRange sub_bounds = IndexRange::from_begin_end(split_indices[boundary_i], + split_indices[boundary_i + 1]); + if (sub_bounds.is_empty()) { + continue; + } + const IndexMaskSegment indices = evaluate_exact_with_indices( + root_expression, allocator, sub_bounds, eval_order); + if (!indices.is_empty()) { + r_local_evaluated_segments.append( + {EvaluatedSegment::Type::Indices, sub_bounds, nullptr, indices}); + } + } + break; + } + } + }; + + /* Decide whether multi-threading should be used or not. There is some extra overhead even when + * just attempting to use multi-threading. */ + const int64_t unknown_segment_eval_grain_size = 8; + if (short_unknown_segments.size() < unknown_segment_eval_grain_size) { + for (const IndexRange &bounds : short_unknown_segments) { + evaluate_unknown_segment(bounds, memory, r_evaluated_segments); + } + } + else { + /* Do exact evaluation in multiple threads. The allocators and evaluated segments created by + * each thread are merged in the end. */ + struct LocalData { + LinearAllocator<> allocator; + Vector evaluated_segments; + }; + threading::EnumerableThreadSpecific data_by_thread; + threading::parallel_for(short_unknown_segments.index_range(), + unknown_segment_eval_grain_size, + [&](const IndexRange range) { + LocalData &data = data_by_thread.local(); + for (const IndexRange &bounds : short_unknown_segments.slice(range)) + { + evaluate_unknown_segment( + bounds, data.allocator, data.evaluated_segments); + } + }); + for (LocalData &data : data_by_thread) { + if (!data.evaluated_segments.is_empty()) { + r_evaluated_segments.extend(data.evaluated_segments); + memory.transfer_ownership_from(data.allocator); + } + } + } +} + +static IndexMask evaluated_segments_to_index_mask(MutableSpan evaluated_segments, + IndexMaskMemory &memory) +{ + if (evaluated_segments.is_empty()) { + return {}; + } + if (evaluated_segments.size() == 1) { + const EvaluatedSegment &evaluated_segment = evaluated_segments[0]; + switch (evaluated_segment.type) { + case EvaluatedSegment::Type::Full: { + return IndexMask(IndexRange(evaluated_segment.bounds)); + } + case EvaluatedSegment::Type::Copy: { + return evaluated_segment.copy_mask->slice_content(evaluated_segment.bounds); + } + case EvaluatedSegment::Type::Indices: { + return IndexMask::from_segments({evaluated_segment.indices}, memory); + } + } + } + + std::sort(evaluated_segments.begin(), + evaluated_segments.end(), + [](const EvaluatedSegment &a, const EvaluatedSegment &b) { + return a.bounds.start() < b.bounds.start(); + }); + + Vector result_segments = build_result_mask_segments(evaluated_segments); + return IndexMask::from_segments(result_segments, memory); +} + +static IndexMask evaluate_expression_impl(const Expr &root_expression, + IndexMaskMemory &memory, + const ExactEvalMode exact_eval_mode) +{ + /* Precompute the evaluation order here, because it's used potentially many times throughout the + * algorithm. */ + const Vector eval_order = compute_eval_order( + root_expression); + + /* Non-overlapping evaluated segments which become the resulting index mask in the end. Note that + * these segments are only sorted in the end. 
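+ * They can be appended out of order, e.g. by the multi-threaded exact evaluation, and are sorted + * by the start of their bounds in #evaluated_segments_to_index_mask.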
*/ + Vector<EvaluatedSegment> evaluated_segments; + Vector<IndexRange> short_unknown_segments; + + evaluate_coarse_and_split_until_segments_are_short( + root_expression, eval_order, evaluated_segments, short_unknown_segments); + evaluate_short_unknown_segments_exactly(root_expression, + exact_eval_mode, + eval_order, + short_unknown_segments, + memory, + evaluated_segments); + return evaluated_segments_to_index_mask(evaluated_segments, memory); +} + +IndexMask evaluate_expression(const Expr &expression, IndexMaskMemory &memory) +{ + const ExactEvalMode exact_eval_mode = determine_exact_eval_mode(expression); + IndexMask mask = evaluate_expression_impl(expression, memory, exact_eval_mode); +#ifndef NDEBUG + { + /* Check that both exact eval modes have the same result. */ + const ExactEvalMode other_exact_eval_mode = (exact_eval_mode == ExactEvalMode::Bits) ? + ExactEvalMode::Indices : + ExactEvalMode::Bits; + IndexMask other_mask = evaluate_expression_impl(expression, memory, other_exact_eval_mode); + BLI_assert(mask == other_mask); + } +#endif + return mask; +} + +const UnionExpr &ExprBuilder::merge(const Span<Term> terms) +{ + Vector<const Expr *> term_expressions; + for (const Term &term : terms) { + term_expressions.append(&this->term_to_expr(term)); + } + UnionExpr &expr = scope_.construct<UnionExpr>(); + expr.type = Expr::Type::Union; + expr.index = expr_count_++; + expr.terms = std::move(term_expressions); + return expr; +} + +const DifferenceExpr &ExprBuilder::subtract(const Term &main_term, const Span<Term> subtract_terms) +{ + Vector<const Expr *> term_expressions; + term_expressions.append(&this->term_to_expr(main_term)); + for (const Term &subtract_term : subtract_terms) { + term_expressions.append(&this->term_to_expr(subtract_term)); + } + DifferenceExpr &expr = scope_.construct<DifferenceExpr>(); + expr.type = Expr::Type::Difference; + expr.index = expr_count_++; + expr.terms = std::move(term_expressions); + return expr; +} + +const IntersectionExpr &ExprBuilder::intersect(const Span<Term> terms) +{ + Vector<const Expr *> term_expressions; + for (const Term &term : terms) { + term_expressions.append(&this->term_to_expr(term)); + } + IntersectionExpr &expr = scope_.construct<IntersectionExpr>(); + expr.type = Expr::Type::Intersection; + expr.index = expr_count_++; + expr.terms = std::move(term_expressions); + return expr; +} + +const Expr &ExprBuilder::term_to_expr(const Term &term) +{ + if (const Expr *const *expr = std::get_if<const Expr *>(&term)) { + return **expr; + } + AtomicExpr &expr = scope_.construct<AtomicExpr>(); + expr.type = Expr::Type::Atomic; + expr.index = expr_count_++; + if (const IndexRange *range = std::get_if<IndexRange>(&term)) { + expr.mask = &scope_.construct<IndexMask>(*range); + } + else { + expr.mask = std::get<const IndexMask *>(term); + } + return expr; +} + +} // namespace blender::index_mask diff --git a/source/blender/blenlib/tests/BLI_index_mask_expression_test.cc b/source/blender/blenlib/tests/BLI_index_mask_expression_test.cc new file mode 100644 index 00000000000..d8cb53117f9 --- /dev/null +++ b/source/blender/blenlib/tests/BLI_index_mask_expression_test.cc @@ -0,0 +1,269 @@ +/* SPDX-FileCopyrightText: 2024 Blender Authors + * + * SPDX-License-Identifier: Apache-2.0 */ + +#include "BLI_array.hh" +#include "BLI_index_mask_expression.hh" +#include "BLI_rand.hh" +#include "BLI_set.hh" +#include "BLI_strict_flags.h" +#include "BLI_timeit.hh" + +#include "testing/testing.h" + +namespace blender::index_mask::tests { + +TEST(index_mask_expression, Union) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({5, IndexRange(50, 100), 100'000}, memory); + const IndexMask mask_b =
IndexMask::from_initializers({IndexRange(10, 10), 60, 200}, memory); + + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_a, &mask_b}); + const IndexMask union_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(union_mask, + IndexMask::from_initializers( + {5, IndexRange(10, 10), IndexRange(50, 100), 200, 100'000}, memory)); +} + +TEST(index_mask_expression, UnionMulti) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({3, 5, 6, 8, 9}, memory); + const IndexMask mask_b = IndexMask::from_initializers({4, 6, 7, 12}, memory); + const IndexMask mask_c = IndexMask::from_initializers({0, 5}, memory); + const IndexMask mask_d = IndexMask::from_initializers({6, 7, 10}, memory); + + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_a, &mask_b, &mask_c, &mask_d}); + const IndexMask union_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(union_mask, IndexMask::from_initializers({0, 3, 4, 5, 6, 7, 8, 9, 10, 12}, memory)); +} + +TEST(index_mask_expression, IntersectMulti) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({3, 5, 6, 8, 9}, memory); + const IndexMask mask_b = IndexMask::from_initializers({2, 5, 6, 10}, memory); + const IndexMask mask_c = IndexMask::from_initializers({4, 5, 6}, memory); + const IndexMask mask_d = IndexMask::from_initializers({1, 5, 10}, memory); + + ExprBuilder builder; + const Expr &expr = builder.intersect({&mask_a, &mask_b, &mask_c, &mask_d}); + const IndexMask intersect_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(intersect_mask, IndexMask::from_initializers({5}, memory)); +} + +TEST(index_mask_expression, DifferenceMulti) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({1, 2, 3, 5, 6, 7, 9, 10}, memory); + const IndexMask mask_b = IndexMask::from_initializers({2, 5, 6, 10}, memory); + const IndexMask mask_c = IndexMask::from_initializers({4, 5, 6}, memory); + const IndexMask mask_d = IndexMask::from_initializers({1, 5, 10}, memory); + + ExprBuilder builder; + const Expr &expr = builder.subtract(&mask_a, {&mask_b, &mask_c, &mask_d}); + const IndexMask difference_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(difference_mask, IndexMask::from_initializers({3, 7, 9}, memory)); +} + +TEST(index_mask_expression, Intersection) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({5, IndexRange(50, 100), 100'000}, memory); + const IndexMask mask_b = IndexMask::from_initializers( + {5, 6, IndexRange(100, 100), 80000, 100'000}, memory); + + ExprBuilder builder; + const Expr &expr = builder.intersect({&mask_a, &mask_b}); + const IndexMask intersection_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(intersection_mask, + IndexMask::from_initializers({5, IndexRange(100, 50), 100'000}, memory)); +} + +TEST(index_mask_expression, Difference) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({5, IndexRange(50, 100), 100'000}, memory); + const IndexMask mask_b = IndexMask::from_initializers({5, 60, IndexRange(100, 20)}, memory); + + ExprBuilder builder; + const Expr &expr = builder.subtract(&mask_a, {&mask_b}); + const IndexMask difference_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(difference_mask, + IndexMask::from_initializers( + {IndexRange(50, 10), IndexRange(61, 39), IndexRange(120, 30), 100'000}, memory)); +} + +TEST(index_mask_expression, FizzBuzz) +{ + IndexMaskMemory memory; + const IndexMask mask_3 = 
IndexMask::from_every_nth(3, 11, 0, memory); /* 0 - 30 */ + const IndexMask mask_5 = IndexMask::from_every_nth(5, 11, 0, memory); /* 0 - 50 */ + + { + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_3, &mask_5}); + const IndexMask result = evaluate_expression(expr, memory); + EXPECT_EQ( + result, + IndexMask::from_initializers( + {0, 3, 5, 6, 9, 10, 12, 15, 18, 20, 21, 24, 25, 27, 30, 35, 40, 45, 50}, memory)); + } + { + ExprBuilder builder; + const Expr &expr = builder.intersect({&mask_3, &mask_5}); + const IndexMask result = evaluate_expression(expr, memory); + EXPECT_EQ(result, IndexMask::from_initializers({0, 15, 30}, memory)); + } + { + ExprBuilder builder; + const Expr &expr = builder.subtract(&mask_3, {&mask_5}); + const IndexMask result = evaluate_expression(expr, memory); + EXPECT_EQ(result, IndexMask::from_initializers({3, 6, 9, 12, 18, 21, 24, 27}, memory)); + } + { + ExprBuilder builder; + const Expr &expr = builder.merge( + {&builder.intersect({&mask_3, &mask_5}), &builder.subtract(&mask_3, {&mask_5})}); + const IndexMask &result = evaluate_expression(expr, memory); + EXPECT_EQ(result, + IndexMask::from_initializers({0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30}, memory)); + } +} + +TEST(index_mask_expression, UnionToFullRange) +{ + IndexMaskMemory memory; + const IndexMask mask_1 = IndexMask::from_initializers({2, 4, 5, 7}, memory); + const IndexMask mask_2 = IndexMask::from_initializers({6, 8}, memory); + const IndexMask mask_3 = IndexMask::from_initializers({1, 3}, memory); + + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_1, &mask_2, &mask_3}); + const IndexMask result = evaluate_expression(expr, memory); + EXPECT_TRUE(result.to_range().has_value()); + EXPECT_EQ(*result.to_range(), IndexRange::from_begin_end_inclusive(1, 8)); + EXPECT_EQ(result.segments_num(), 1); +} + +TEST(index_mask_expression, UnionIndividualIndices) +{ + IndexMaskMemory memory; + const IndexMask mask_1 = IndexMask::from_initializers({3}, memory); + const IndexMask mask_2 = IndexMask::from_initializers({6}, memory); + const IndexMask mask_3 = IndexMask::from_initializers({5}, memory); + + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_1, &mask_2, &mask_3}); + const IndexMask result = evaluate_expression(expr, memory); + EXPECT_EQ(result, IndexMask::from_initializers({3, 5, 6}, memory)); + EXPECT_EQ(result.segments_num(), 1); +} + +TEST(index_mask_expression, UnionLargeRanges) +{ + IndexMaskMemory memory; + const IndexMask mask_a(IndexRange(0, 1'000'000)); + const IndexMask mask_b(IndexRange(900'000, 1'100'000)); + + ExprBuilder builder; + const Expr &expr = builder.merge({&mask_a, &mask_b}); + const IndexMask result_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(result_mask, IndexMask(IndexRange(0, 2'000'000))); +} + +TEST(index_mask_expression, SubtractSmall) +{ + IndexMaskMemory memory; + const IndexMask mask_a = IndexMask::from_initializers({3, 4, 5, 6, 7, 8, 9}, memory); + const IndexMask mask_b = IndexMask::from_initializers({5, 7}, memory); + const IndexMask mask_c = IndexMask::from_initializers({8}, memory); + + ExprBuilder builder; + const Expr &expr = builder.subtract(&mask_a, {&mask_b, &mask_c}); + const IndexMask result = evaluate_expression(expr, memory); + + EXPECT_EQ(result, IndexMask::from_initializers({3, 4, 6, 9}, memory)); + EXPECT_EQ(result.segments_num(), 1); +} + +TEST(index_mask_expression, RangeTerms) +{ + IndexMaskMemory memory; + ExprBuilder builder; + + const IndexRange range_a = IndexRange::from_begin_end(30'000, 50'000); + 
const IndexRange range_b = IndexRange::from_begin_end(40'000, 100'000); + const IndexRange range_c = IndexRange::from_begin_end(45'000, 48'000); + + const Expr &expr = builder.subtract(&builder.merge({range_a, range_b}), {range_c}); + const IndexMask result_mask = evaluate_expression(expr, memory); + + EXPECT_EQ(result_mask, + IndexMask::from_initializers({IndexRange::from_begin_end(30'000, 45'000), + IndexRange::from_begin_end(48'000, 100'000)}, + memory)); +} + +TEST(index_mask_expression, SingleMask) +{ + IndexMaskMemory memory; + const IndexMask mask = IndexMask::from_initializers({5, 6, 8, 9}, memory); + + ExprBuilder builder; + const Expr &expr = builder.merge({&mask}); + const IndexMask result = evaluate_expression(expr, memory); + + EXPECT_EQ(result, mask); +} + +TEST(index_mask_expression, SubtractSelf) +{ + IndexMaskMemory memory; + const IndexMask mask = IndexMask ::from_initializers({6, 8, 10, 100}, memory); + + ExprBuilder builder; + const Expr &expr = builder.subtract(&mask, {&mask}); + const IndexMask result = evaluate_expression(expr, memory); + + EXPECT_TRUE(result.is_empty()); +} + +/* Disable benchmark by default. */ +#if 0 +TEST(index_mask_expression, Benchmark) +{ +# ifdef NDEBUG + const int64_t iterations = 100; +# else + const int64_t iterations = 1; +# endif + + for ([[maybe_unused]] const int64_t _1 : IndexRange(5)) { + IndexMaskMemory m; + const IndexMask a = IndexMask::from_every_nth(3, 1'000'000, 0, m); + const IndexMask b = IndexMask::from_every_nth(100, 5'000, 0, m); + ExprBuilder builder; + const Expr &expr = builder.merge({&a, &b}); + + SCOPED_TIMER("benchmark"); + for ([[maybe_unused]] const int64_t _2 : IndexRange(iterations)) { + IndexMaskMemory memory; + const IndexMask result = evaluate_expression(expr, memory); + UNUSED_VARS(result); + } + } +} +#endif + +} // namespace blender::index_mask::tests
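
For reference, here is a minimal usage sketch of the new expression API, mirroring the calls exercised by the tests above. The helper function name and the chosen masks are illustrative only and not part of the patch:

#include "BLI_index_mask_expression.hh"

namespace blender::index_mask {

/* Evaluate "(a union b) minus c" into a new mask. Terms may be `IndexMask` pointers, `IndexRange`
 * values or nested expressions built with the same `ExprBuilder`. */
static IndexMask union_minus_c(const IndexMask &a,
                               const IndexMask &b,
                               const IndexMask &c,
                               IndexMaskMemory &memory)
{
  ExprBuilder builder;
  const Expr &expr = builder.subtract(&builder.merge({&a, &b}), {&c});
  /* Segments that have to be newly created are allocated in `memory`; where possible the result
   * re-uses ranges or segments from the inputs. */
  return evaluate_expression(expr, memory);
}

}  // namespace blender::index_mask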