Listing the "Blender Foundation" as the copyright holder implies that the Blender Foundation holds copyright to files which may include work from many developers. While keeping copyright in headers makes sense for isolated libraries, Blender's own code may be refactored or moved between files in a way that makes per-file copyright holders less meaningful. Copyright references to the "Blender Foundation" have been replaced with "Blender Authors", with the exception of `./extern/`: since this contains libraries which are more isolated, any changes to license headers there can be handled on a case-by-case basis. Some directories in `./intern/` have also been excluded:

- `./intern/cycles/`: its own `AUTHORS` file is planned.
- `./intern/opensubdiv/`.

An `AUTHORS` file has been added, using the Chromium project's AUTHORS file as a template.

Design task: #110784
Ref !110783.
/* SPDX-FileCopyrightText: 2023 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#include <mutex>

#include "BLI_array.hh"
#include "BLI_bit_vector.hh"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_index_mask.hh"
#include "BLI_math_base.hh"
#include "BLI_set.hh"
#include "BLI_sort.hh"
#include "BLI_strict_flags.h"
#include "BLI_task.hh"
#include "BLI_threads.h"
#include "BLI_timeit.hh"
#include "BLI_virtual_array.hh"

namespace blender::index_mask {

std::array<int16_t, max_segment_size> build_static_indices_array()
{
  std::array<int16_t, max_segment_size> data;
  for (int16_t i = 0; i < max_segment_size; i++) {
    data[size_t(i)] = i;
  }
  return data;
}

const IndexMask &get_static_index_mask_for_min_size(const int64_t min_size)
{
  static constexpr int64_t size_shift = 31;
  static constexpr int64_t max_size = (int64_t(1) << size_shift); /* 2'147'483'648 */
  static constexpr int64_t segments_num = max_size / max_segment_size; /* 131'072 */

  /* Make sure we are never requesting a size that's larger than what was statically allocated.
   * If that's ever needed, we can either increase #size_shift or dynamically allocate an even
   * larger mask. */
  BLI_assert(min_size <= max_size);
  UNUSED_VARS_NDEBUG(min_size);

  static IndexMask static_mask = []() {
    static Array<const int16_t *> indices_by_segment(segments_num);
    /* The offsets and cumulative segment sizes array contain the same values here, so just use a
     * single array for both. */
    static Array<int64_t> segment_offsets(segments_num + 1);

    static const int16_t *static_offsets = get_static_indices_array().data();

    /* Isolate because the mutex protecting the initialization of #static_mask is locked. */
    threading::isolate_task([&]() {
      threading::parallel_for(IndexRange(segments_num), 1024, [&](const IndexRange range) {
        for (const int64_t segment_i : range) {
          indices_by_segment[segment_i] = static_offsets;
          segment_offsets[segment_i] = segment_i * max_segment_size;
        }
      });
    });
    segment_offsets.last() = max_size;

    IndexMask mask;
    IndexMaskData &data = mask.data_for_inplace_construction();
    data.indices_num_ = max_size;
    data.segments_num_ = segments_num;
    data.indices_by_segment_ = indices_by_segment.data();
    data.segment_offsets_ = segment_offsets.data();
    data.cumulative_segment_sizes_ = segment_offsets.data();
    data.begin_index_in_segment_ = 0;
    data.end_index_in_segment_ = max_segment_size;

    return mask;
  }();
  return static_mask;
}

std::ostream &operator<<(std::ostream &stream, const IndexMask &mask)
{
  Array<int64_t> indices(mask.size());
  mask.to_indices<int64_t>(indices);
  Vector<std::variant<IndexRange, Span<int64_t>>> segments;
  unique_sorted_indices::split_to_ranges_and_spans<int64_t>(indices, 8, segments);
  stream << "(Size: " << mask.size() << " | ";
  for (const std::variant<IndexRange, Span<int64_t>> &segment : segments) {
    if (std::holds_alternative<IndexRange>(segment)) {
      const IndexRange range = std::get<IndexRange>(segment);
      stream << range;
    }
    else {
      const Span<int64_t> segment_indices = std::get<Span<int64_t>>(segment);
      stream << "[";
      for (const int64_t index : segment_indices) {
        stream << index << ",";
      }
      stream << "]";
    }
    stream << ", ";
  }
  stream << ")";
  return stream;
}

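/* Slicing does not copy any indices. The sliced mask reuses the same per-segment index arrays
 * and only adjusts the segment pointers and the begin/end positions within the first and last
 * segment. */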
IndexMask IndexMask::slice(const int64_t start, const int64_t size) const
{
  if (size == 0) {
    return {};
  }
  const RawMaskIterator first_it = this->index_to_iterator(start);
  const RawMaskIterator last_it = this->index_to_iterator(start + size - 1);

  IndexMask sliced = *this;
  sliced.indices_num_ = size;
  sliced.segments_num_ = last_it.segment_i - first_it.segment_i + 1;
  sliced.indices_by_segment_ += first_it.segment_i;
  sliced.segment_offsets_ += first_it.segment_i;
  sliced.cumulative_segment_sizes_ += first_it.segment_i;
  sliced.begin_index_in_segment_ = first_it.index_in_segment;
  sliced.end_index_in_segment_ = last_it.index_in_segment + 1;
  return sliced;
}

IndexMask IndexMask::slice_and_offset(const IndexRange range,
                                      const int64_t offset,
                                      IndexMaskMemory &memory) const
{
  return this->slice_and_offset(range.start(), range.size(), offset, memory);
}

IndexMask IndexMask::slice_and_offset(const int64_t start,
                                      const int64_t size,
                                      const int64_t offset,
                                      IndexMaskMemory &memory) const
{
  if (size == 0) {
    return {};
  }
  if (std::optional<IndexRange> range = this->to_range()) {
    return range->slice(start, size).shift(offset);
  }
  IndexMask sliced_mask = this->slice(start, size);
  if (offset == 0) {
    return sliced_mask;
  }
  if (std::optional<IndexRange> range = sliced_mask.to_range()) {
    return range->shift(offset);
  }
  MutableSpan<int64_t> new_segment_offsets = memory.allocate_array<int64_t>(
      sliced_mask.segments_num_);
  for (const int64_t i : new_segment_offsets.index_range()) {
    new_segment_offsets[i] = sliced_mask.segment_offsets_[i] + offset;
  }
  sliced_mask.segment_offsets_ = new_segment_offsets.data();
  return sliced_mask;
}

/**
 * Merges consecutive segments in some cases. Having fewer but larger segments generally allows
 * for better performance when using the mask later on.
 */
static void consolidate_segments(Vector<IndexMaskSegment, 16> &segments,
                                 IndexMaskMemory & /*memory*/)
{
  if (segments.is_empty()) {
    return;
  }

  const Span<int16_t> static_indices = get_static_indices_array();

  /* TODO: Support merging non-range segments in some cases as well. */
  int64_t group_start_segment_i = 0;
  int64_t group_first = segments[0][0];
  int64_t group_last = segments[0].last();
  bool group_as_range = unique_sorted_indices::non_empty_is_range(segments[0].base_span());

  auto finish_group = [&](const int64_t last_segment_i) {
    if (group_start_segment_i == last_segment_i) {
      return;
    }
    /* Join multiple ranges together into a bigger range. */
    const IndexRange range{group_first, group_last + 1 - group_first};
    segments[group_start_segment_i] = IndexMaskSegment(range[0],
                                                       static_indices.take_front(range.size()));
    for (int64_t i = group_start_segment_i + 1; i <= last_segment_i; i++) {
      segments[i] = {};
    }
  };

  for (const int64_t segment_i : segments.index_range().drop_front(1)) {
    const IndexMaskSegment segment = segments[segment_i];
    const std::optional<IndexRange> segment_base_range =
        unique_sorted_indices::non_empty_as_range_try(segment.base_span());
    const bool segment_is_range = segment_base_range.has_value();

    if (group_as_range && segment_is_range) {
      if (group_last + 1 == segment[0]) {
        if (segment.last() - group_first + 1 < max_segment_size) {
          /* Can combine previous and current range. */
          group_last = segment.last();
          continue;
        }
      }
    }
    finish_group(segment_i - 1);

    group_start_segment_i = segment_i;
    group_first = segment[0];
    group_last = segment.last();
    group_as_range = segment_is_range;
  }
  finish_group(segments.size() - 1);

  /* Remove all segments that have been merged into previous segments. */
  segments.remove_if([](const IndexMaskSegment segment) { return segment.is_empty(); });
}

/**
 * Create a new #IndexMask from the given segments. The provided segments are expected to be
 * owned by #memory already.
 */
static IndexMask mask_from_segments(const Span<IndexMaskSegment> segments, IndexMaskMemory &memory)
{
  if (segments.is_empty()) {
    return {};
  }
  const int64_t segments_num = segments.size();

  /* Allocate buffers for the mask. */
  MutableSpan<const int16_t *> indices_by_segment = memory.allocate_array<const int16_t *>(
      segments_num);
  MutableSpan<int64_t> segment_offsets = memory.allocate_array<int64_t>(segments_num);
  MutableSpan<int64_t> cumulative_segment_sizes = memory.allocate_array<int64_t>(segments_num + 1);

  /* Fill buffers. */
  cumulative_segment_sizes[0] = 0;
  for (const int64_t segment_i : segments.index_range()) {
    const IndexMaskSegment segment = segments[segment_i];
    indices_by_segment[segment_i] = segment.base_span().data();
    segment_offsets[segment_i] = segment.offset();
    cumulative_segment_sizes[segment_i + 1] = cumulative_segment_sizes[segment_i] + segment.size();
  }

  /* Initialize mask. */
  IndexMask mask;
  IndexMaskData &data = mask.data_for_inplace_construction();
  data.indices_num_ = cumulative_segment_sizes.last();
  data.segments_num_ = segments_num;
  data.indices_by_segment_ = indices_by_segment.data();
  data.segment_offsets_ = segment_offsets.data();
  data.cumulative_segment_sizes_ = cumulative_segment_sizes.data();
  data.begin_index_in_segment_ = 0;
  data.end_index_in_segment_ = segments.last().size();
  return mask;
}

/**
 * Split the indices into segments. Afterwards, the indices referenced by #r_segments are either
 * owned by #allocator or statically allocated.
 */
template<typename T, int64_t InlineBufferSize>
static void segments_from_indices(const Span<T> indices,
                                  LinearAllocator<> &allocator,
                                  Vector<IndexMaskSegment, InlineBufferSize> &r_segments)
{
  Vector<std::variant<IndexRange, Span<T>>, 16> segments;

  for (int64_t start = 0; start < indices.size(); start += max_segment_size) {
    /* Slice to make sure that each segment is no longer than #max_segment_size. */
    const Span<T> indices_slice = indices.slice_safe(start, max_segment_size);
    unique_sorted_indices::split_to_ranges_and_spans<T>(indices_slice, 64, segments);
  }

  const Span<int16_t> static_indices = get_static_indices_array();
  for (const auto &segment : segments) {
    if (std::holds_alternative<IndexRange>(segment)) {
      const IndexRange segment_range = std::get<IndexRange>(segment);
      r_segments.append_as(segment_range.start(), static_indices.take_front(segment_range.size()));
    }
    else {
      Span<T> segment_indices = std::get<Span<T>>(segment);
      MutableSpan<int16_t> offset_indices = allocator.allocate_array<int16_t>(
          segment_indices.size());
      while (!segment_indices.is_empty()) {
        const int64_t offset = segment_indices[0];
        const int64_t next_segment_size = binary_search::find_predicate_begin(
            segment_indices.take_front(max_segment_size),
            [&](const T value) { return value - offset >= max_segment_size; });
        for (const int64_t i : IndexRange(next_segment_size)) {
          const int64_t offset_index = segment_indices[i] - offset;
          BLI_assert(offset_index < max_segment_size);
          offset_indices[i] = int16_t(offset_index);
        }
        r_segments.append_as(offset, offset_indices.take_front(next_segment_size));
        segment_indices = segment_indices.drop_front(next_segment_size);
        offset_indices = offset_indices.drop_front(next_segment_size);
      }
    }
  }
}

/**
 * Utility to generate segments on multiple threads and to reduce the result in the end.
 */
struct ParallelSegmentsCollector {
  struct LocalData {
    LinearAllocator<> allocator;
    Vector<IndexMaskSegment, 16> segments;
  };

  threading::EnumerableThreadSpecific<LocalData> data_by_thread;

  /**
   * Move ownership of memory allocated from all threads to #main_allocator. Also, extend
   * #main_segments with the segments created on each thread. The segments are also sorted to make
   * sure that they are in the correct order.
   */
  void reduce(LinearAllocator<> &main_allocator, Vector<IndexMaskSegment, 16> &main_segments)
  {
    for (LocalData &data : this->data_by_thread) {
      main_allocator.transfer_ownership_from(data.allocator);
      main_segments.extend(data.segments);
    }
    parallel_sort(main_segments.begin(),
                  main_segments.end(),
                  [](const IndexMaskSegment a, const IndexMaskSegment b) { return a[0] < b[0]; });
  }
};

/**
 * Convert a range to potentially multiple index mask segments.
 */
static void range_to_segments(const IndexRange range, Vector<IndexMaskSegment, 16> &r_segments)
{
  const Span<int16_t> static_indices = get_static_indices_array();
  for (int64_t start = 0; start < range.size(); start += max_segment_size) {
    const int64_t size = std::min(max_segment_size, range.size() - start);
    r_segments.append_as(range.start() + start, static_indices.take_front(size));
  }
}

static int64_t get_size_before_gap(const Span<int16_t> indices)
{
  BLI_assert(indices.size() >= 2);
  if (indices[1] > indices[0] + 1) {
    /* For sparse indices, often the next gap is just after the next index.
     * In this case we can skip the logarithmic check below. */
    return 1;
  }
  return unique_sorted_indices::find_size_of_next_range(indices);
}

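/**
 * Create segments for the indices that are *not* in #segment, i.e. for the gaps between
 * consecutive indices. Large gaps become range segments that reference the static indices array,
 * while indices from smaller gaps are collected and copied into memory owned by #allocator.
 */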
static void inverted_indices_to_segments(const IndexMaskSegment segment,
                                         LinearAllocator<> &allocator,
                                         Vector<IndexMaskSegment, 16> &r_segments)
{
  constexpr int64_t range_threshold = 64;
  const int64_t offset = segment.offset();
  const Span<int16_t> static_indices = get_static_indices_array();

  int64_t inverted_index_count = 0;
  std::array<int16_t, max_segment_size> inverted_indices_array;
  auto add_indices = [&](const int16_t start, const int16_t num) {
    int16_t *new_indices_begin = inverted_indices_array.data() + inverted_index_count;
    std::iota(new_indices_begin, new_indices_begin + num, start);
    inverted_index_count += num;
  };

  auto finish_indices = [&]() {
    if (inverted_index_count == 0) {
      return;
    }
    MutableSpan<int16_t> offset_indices = allocator.allocate_array<int16_t>(inverted_index_count);
    offset_indices.copy_from(Span(inverted_indices_array).take_front(inverted_index_count));
    r_segments.append_as(offset, offset_indices);
    inverted_index_count = 0;
  };

  Span<int16_t> indices = segment.base_span();
  while (indices.size() > 1) {
    const int64_t size_before_gap = get_size_before_gap(indices);
    if (size_before_gap == indices.size()) {
      break;
    }

    const int16_t gap_first = indices[size_before_gap - 1] + 1;
    const int16_t next = indices[size_before_gap];
    const int16_t gap_size = next - gap_first;
    if (gap_size > range_threshold) {
      finish_indices();
      r_segments.append_as(offset + gap_first, static_indices.take_front(gap_size));
    }
    else {
      add_indices(gap_first, gap_size);
    }

    indices = indices.drop_front(size_before_gap);
  }

  finish_indices();
}

static void invert_segments(const IndexMask &mask,
                            const IndexRange segment_range,
                            LinearAllocator<> &allocator,
                            Vector<IndexMaskSegment, 16> &r_segments)
{
  for (const int64_t segment_i : segment_range) {
    const IndexMaskSegment segment = mask.segment(segment_i);
    inverted_indices_to_segments(segment, allocator, r_segments);

    const IndexMaskSegment next_segment = mask.segment(segment_i + 1);
    const int64_t between_start = segment.last() + 1;
    const int64_t size_between_segments = next_segment[0] - segment.last() - 1;
    const IndexRange range_between_segments(between_start, size_between_segments);
    if (!range_between_segments.is_empty()) {
      range_to_segments(range_between_segments, r_segments);
    }
  }
}

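/* Compute all indices in #universe that are not in this mask. The common cases where the mask is
 * empty or a single range at the beginning or end of the universe are handled without building
 * new segments. */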
IndexMask IndexMask::complement(const IndexRange universe, IndexMaskMemory &memory) const
{
  if (this->is_empty()) {
    return universe;
  }
  if (universe.is_empty()) {
    return {};
  }
  const std::optional<IndexRange> this_range = this->to_range();
  if (this_range) {
    const bool first_in_range = this_range->first() <= universe.first();
    const bool last_in_range = this_range->last() >= universe.last();
    if (first_in_range && last_in_range) {
      /* This mask fills the entire universe, so the complement is empty. */
      return {};
    }
    if (first_in_range) {
      /* This mask is a range that contains the start of the universe.
       * The complement is a range that contains the end of the universe. */
      const int64_t complement_start = this_range->one_after_last();
      const int64_t complement_size = universe.one_after_last() - complement_start;
      return IndexRange(complement_start, complement_size);
    }
    if (last_in_range) {
      /* This mask is a range that contains the end of the universe.
       * The complement is a range that contains the start of the universe. */
      const int64_t complement_start = universe.first();
      const int64_t complement_size = this_range->first() - complement_start;
      return IndexRange(complement_start, complement_size);
    }
  }

  Vector<IndexMaskSegment, 16> segments;

  if (universe.start() < this->first()) {
    range_to_segments(universe.take_front(this->first() - universe.start()), segments);
  }

  if (!this_range) {
    const int64_t segments_num = this->segments_num();

    constexpr int64_t min_grain_size = 16;
    constexpr int64_t max_grain_size = 4096;
    const int64_t threads_num = BLI_system_thread_count();
    const int64_t grain_size = std::clamp(
        segments_num / threads_num, min_grain_size, max_grain_size);

    const IndexRange non_last_segments = IndexRange(segments_num).drop_back(1);
    if (segments_num < min_grain_size) {
      invert_segments(*this, non_last_segments, memory, segments);
    }
    else {
      ParallelSegmentsCollector segments_collector;
      threading::parallel_for(non_last_segments, grain_size, [&](const IndexRange range) {
        ParallelSegmentsCollector::LocalData &local_data =
            segments_collector.data_by_thread.local();
        invert_segments(*this, range, local_data.allocator, local_data.segments);
      });
      segments_collector.reduce(memory, segments);
    }
    inverted_indices_to_segments(this->segment(segments_num - 1), memory, segments);
  }

  if (universe.last() > this->last()) {
    range_to_segments(universe.take_back(universe.last() - this->last()), segments);
  }

  return mask_from_segments(segments, memory);
}

template<typename T>
IndexMask IndexMask::from_indices(const Span<T> indices, IndexMaskMemory &memory)
{
  if (indices.is_empty()) {
    return {};
  }
  if (const std::optional<IndexRange> range = unique_sorted_indices::non_empty_as_range_try(
          indices)) {
    /* Fast case when the indices encode a single range. */
    return *range;
  }

  Vector<IndexMaskSegment, 16> segments;

  constexpr int64_t min_grain_size = 4096;
  constexpr int64_t max_grain_size = max_segment_size;
  if (indices.size() <= min_grain_size) {
    segments_from_indices(indices, memory, segments);
  }
  else {
    const int64_t threads_num = BLI_system_thread_count();
    /* Can be faster with a larger grain size, but only when there are enough indices. */
    const int64_t grain_size = std::clamp(
        indices.size() / (threads_num * 4), min_grain_size, max_grain_size);

    ParallelSegmentsCollector segments_collector;
    threading::parallel_for(indices.index_range(), grain_size, [&](const IndexRange range) {
      ParallelSegmentsCollector::LocalData &local_data = segments_collector.data_by_thread.local();
      segments_from_indices(indices.slice(range), local_data.allocator, local_data.segments);
    });
    segments_collector.reduce(memory, segments);
  }
  consolidate_segments(segments, memory);
  return mask_from_segments(segments, memory);
}

IndexMask IndexMask::from_bits(const BitSpan bits, IndexMaskMemory &memory)
{
  return IndexMask::from_bits(bits.index_range(), bits, memory);
}

IndexMask IndexMask::from_bits(const IndexMask &universe,
                               const BitSpan bits,
                               IndexMaskMemory &memory)
{
  return IndexMask::from_predicate(universe, GrainSize(1024), memory, [bits](const int64_t index) {
    return bits[index].test();
  });
}

IndexMask IndexMask::from_bools(Span<bool> bools, IndexMaskMemory &memory)
{
  return IndexMask::from_bools(bools.index_range(), bools, memory);
}

IndexMask IndexMask::from_bools(const VArray<bool> &bools, IndexMaskMemory &memory)
{
  return IndexMask::from_bools(bools.index_range(), bools, memory);
}

IndexMask IndexMask::from_bools(const IndexMask &universe,
                                Span<bool> bools,
                                IndexMaskMemory &memory)
{
  return IndexMask::from_predicate(
      universe, GrainSize(1024), memory, [bools](const int64_t index) { return bools[index]; });
}

IndexMask IndexMask::from_bools(const IndexMask &universe,
                                const VArray<bool> &bools,
                                IndexMaskMemory &memory)
{
  const CommonVArrayInfo info = bools.common_info();
  if (info.type == CommonVArrayInfo::Type::Single) {
    return *static_cast<const bool *>(info.data) ? universe : IndexMask();
  }
  if (info.type == CommonVArrayInfo::Type::Span) {
    const Span<bool> span(static_cast<const bool *>(info.data), bools.size());
    return IndexMask::from_bools(universe, span, memory);
  }
  return IndexMask::from_predicate(
      universe, GrainSize(512), memory, [&](const int64_t index) { return bools[index]; });
}

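/* The union is computed by scattering both masks into a temporary boolean array and converting
 * that array back into a mask. */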
IndexMask IndexMask::from_union(const IndexMask &mask_a,
                                const IndexMask &mask_b,
                                IndexMaskMemory &memory)
{
  const int64_t new_size = math::max(mask_a.min_array_size(), mask_b.min_array_size());
  Array<bool> tmp(new_size, false);
  mask_a.foreach_index_optimized<int64_t>(GrainSize(2048),
                                          [&](const int64_t i) { tmp[i] = true; });
  mask_b.foreach_index_optimized<int64_t>(GrainSize(2048),
                                          [&](const int64_t i) { tmp[i] = true; });
  return IndexMask::from_bools(tmp, memory);
}

template<typename T> void IndexMask::to_indices(MutableSpan<T> r_indices) const
{
  BLI_assert(this->size() == r_indices.size());
  this->foreach_index_optimized<int64_t>(
      GrainSize(1024), [r_indices = r_indices.data()](const int64_t i, const int64_t pos) {
        r_indices[pos] = T(i);
      });
}

void IndexMask::to_bits(MutableBitSpan r_bits) const
{
  BLI_assert(r_bits.size() >= this->min_array_size());
  r_bits.reset_all();
  this->foreach_segment_optimized([&](const auto segment) {
    if constexpr (std::is_same_v<std::decay_t<decltype(segment)>, IndexRange>) {
      const IndexRange range = segment;
      r_bits.slice(range).set_all();
    }
    else {
      for (const int64_t i : segment) {
        r_bits[i].set();
      }
    }
  });
}

void IndexMask::to_bools(MutableSpan<bool> r_bools) const
{
  BLI_assert(r_bools.size() >= this->min_array_size());
  r_bools.fill(false);
  this->foreach_index_optimized<int64_t>(GrainSize(2048),
                                         [&](const int64_t i) { r_bools[i] = true; });
}

Vector<IndexRange> IndexMask::to_ranges() const
{
  Vector<IndexRange> ranges;
  this->foreach_range([&](const IndexRange range) { ranges.append(range); });
  return ranges;
}

Vector<IndexRange> IndexMask::to_ranges_invert(const IndexRange universe) const
{
  IndexMaskMemory memory;
  return this->complement(universe, memory).to_ranges();
}

namespace detail {

/**
 * Filter the indices from #universe_segment using #filter_indices. Store the resulting indices as
 * segments.
 */
static void segments_from_predicate_filter(
    const IndexMaskSegment universe_segment,
    LinearAllocator<> &allocator,
    const FunctionRef<int64_t(IndexMaskSegment indices, int16_t *r_true_indices)> filter_indices,
    Vector<IndexMaskSegment, 16> &r_segments)
{
  std::array<int16_t, max_segment_size> indices_array;
  const int64_t true_indices_num = filter_indices(universe_segment, indices_array.data());
  if (true_indices_num == 0) {
    return;
  }
  const Span<int16_t> true_indices{indices_array.data(), true_indices_num};
  Vector<std::variant<IndexRange, Span<int16_t>>> true_segments;
  unique_sorted_indices::split_to_ranges_and_spans<int16_t>(true_indices, 64, true_segments);

  const Span<int16_t> static_indices = get_static_indices_array();

  for (const auto &true_segment : true_segments) {
    if (std::holds_alternative<IndexRange>(true_segment)) {
      const IndexRange segment_range = std::get<IndexRange>(true_segment);
      r_segments.append_as(universe_segment.offset(), static_indices.slice(segment_range));
    }
    else {
      const Span<int16_t> segment_indices = std::get<Span<int16_t>>(true_segment);
      r_segments.append_as(universe_segment.offset(),
                           allocator.construct_array_copy(segment_indices));
    }
  }
}

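/**
 * Evaluate #filter_indices on every segment of #universe (in parallel when the universe is larger
 * than the grain size) and build a new mask from the indices for which the predicate returned
 * true.
 */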
IndexMask from_predicate_impl(
    const IndexMask &universe,
    const GrainSize grain_size,
    IndexMaskMemory &memory,
    const FunctionRef<int64_t(IndexMaskSegment indices, int16_t *r_true_indices)> filter_indices)
{
  if (universe.is_empty()) {
    return {};
  }

  Vector<IndexMaskSegment, 16> segments;
  if (universe.size() <= grain_size.value) {
    for (const int64_t segment_i : IndexRange(universe.segments_num())) {
      const IndexMaskSegment universe_segment = universe.segment(segment_i);
      segments_from_predicate_filter(universe_segment, memory, filter_indices, segments);
    }
  }
  else {
    ParallelSegmentsCollector segments_collector;
    universe.foreach_segment(grain_size, [&](const IndexMaskSegment universe_segment) {
      ParallelSegmentsCollector::LocalData &data = segments_collector.data_by_thread.local();
      segments_from_predicate_filter(
          universe_segment, data.allocator, filter_indices, data.segments);
    });
    segments_collector.reduce(memory, segments);
  }

  consolidate_segments(segments, memory);
  return mask_from_segments(segments, memory);
}

}  // namespace detail

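/**
 * Find the position of #query_index in the mask using two binary searches: first over the
 * segments to find the one that may contain the index, then within that segment.
 */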
std::optional<RawMaskIterator> IndexMask::find(const int64_t query_index) const
{
  if (this->is_empty()) {
    return std::nullopt;
  }
  if (query_index < this->first()) {
    return std::nullopt;
  }
  if (query_index > this->last()) {
    return std::nullopt;
  }

  const int64_t segment_i = -1 + binary_search::find_predicate_begin(
                                     IndexRange(segments_num_), [&](const int64_t value) {
                                       return query_index < this->segment(value)[0];
                                     });

  const IndexMaskSegment segment = this->segment(segment_i);
  const Span<int16_t> local_segment = segment.base_span();
  const int64_t local_query_index = query_index - segment.offset();
  if (local_query_index > local_segment.last()) {
    return std::nullopt;
  }
  const int64_t index_in_segment = -1 + binary_search::find_predicate_begin(
                                            local_segment, [&](const int16_t value) {
                                              return local_query_index < value;
                                            });
  if (local_segment[index_in_segment] != local_query_index) {
    return std::nullopt;
  }
  return RawMaskIterator{segment_i, int16_t(index_in_segment)};
}

bool IndexMask::contains(const int64_t query_index) const
{
  return this->find(query_index).has_value();
}

template IndexMask IndexMask::from_indices(Span<int32_t>, IndexMaskMemory &);
template IndexMask IndexMask::from_indices(Span<int64_t>, IndexMaskMemory &);
template void IndexMask::to_indices(MutableSpan<int32_t>) const;
template void IndexMask::to_indices(MutableSpan<int64_t>) const;

}  // namespace blender::index_mask