diff --git a/source/blender/blenlib/intern/bit_bool_conversion.cc b/source/blender/blenlib/intern/bit_bool_conversion.cc
index 5955a5f4bca..dd637749f89 100644
--- a/source/blender/blenlib/intern/bit_bool_conversion.cc
+++ b/source/blender/blenlib/intern/bit_bool_conversion.cc
@@ -7,45 +7,50 @@
 
 namespace blender::bits {
 
-bool or_bools_into_bits(const Span<bool> bools,
+/**
+ * ORs a bit into #r_bits for every element of #bytes that #byte_to_bit maps to true. Bits that
+ * are already set are left untouched.
+ *
+ * \param allowed_overshoot: When at least 16, a trailing partial group of 16 bytes is still
+ *   processed with SSE2, which reads and may write past the end of #bytes and #r_bits.
+ * \param byte_to_bit: Provides `single(char)` for one byte and, when SSE2 is available,
+ *   `sse2_chunk(__m128i)` which returns one result bit for each of 16 bytes.
+ * \return True when at least one processed byte was mapped to true.
+ */
+template<typename ByteToBit>
+bool or_bytes_into_bits(const Span<char> bytes,
                         MutableBitSpan r_bits,
-                        const int64_t allowed_overshoot)
+                        const int64_t allowed_overshoot,
+                        const ByteToBit &byte_to_bit)
 {
-  BLI_assert(r_bits.size() >= bools.size());
-  if (bools.is_empty()) {
+  BLI_assert(r_bits.size() >= bytes.size());
+  if (bytes.is_empty()) {
     return false;
   }
 
-  int64_t bool_i = 0;
-  const bool *bools_ = bools.data();
+  int64_t byte_i = 0;
+  const char *bytes_ = bytes.data();
   bool any_true = false;
 
-/* Conversion from bools to bits can be way faster with intrinsics. That's because instead of
+/* Conversion from bytes to bits can be way faster with intrinsics. That's because instead of
  * processing one element at a time, we can process 16 at once. */
 #if BLI_HAVE_SSE2
-  /* Initialize zeros, so that we can compare against it. */
-  const __m128i zero_bytes = _mm_set1_epi8(0);
-  int64_t iteration_end = bools.size();
+  int64_t iteration_end = bytes.size();
   if (iteration_end % 16 > 0) {
     if (allowed_overshoot >= 16) {
       iteration_end = (iteration_end + 16) & ~15;
     }
   }
 
-  /* Iterate over chunks of booleans. */
-  for (; bool_i + 16 <= iteration_end; bool_i += 16) {
-    /* Load 16 bools at once. */
-    const __m128i group = _mm_loadu_si128(reinterpret_cast<const __m128i *>(bools_ + bool_i));
-    /* Compare them all against zero. The result is a mask of the form [0x00, 0xff, 0xff, ...]. */
-    const __m128i is_false_byte_mask = _mm_cmpeq_epi8(group, zero_bytes);
-    /* Compress the byte-mask into a bit mask. This takes one bit from each byte. */
-    const uint16_t is_false_mask = _mm_movemask_epi8(is_false_byte_mask);
-    /* Now we have a bit mask where each bit corresponds to an input boolean. */
-    const uint16_t is_true_mask = ~is_false_mask;
+  /* Iterate over chunks of bytes. */
+  for (; byte_i + 16 <= iteration_end; byte_i += 16) {
+    /* Load 16 bytes at once. */
+    const __m128i group = _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes_ + byte_i));
+    const uint16_t is_true_mask = byte_to_bit.sse2_chunk(group);
     any_true |= is_true_mask != 0;
 
-    const int start_bit_in_int = (r_bits.bit_range().start() + bool_i) & BitIndexMask;
-    BitInt *start_bit_int = int_containing_bit(r_bits.data(), r_bits.bit_range().start() + bool_i);
+    const int start_bit_in_int = (r_bits.bit_range().start() + byte_i) & BitIndexMask;
+    BitInt *start_bit_int = int_containing_bit(r_bits.data(), r_bits.bit_range().start() + byte_i);
 
     *start_bit_int |= BitInt(is_true_mask) << start_bit_in_int;
     if (start_bit_in_int > BitsPerInt - 16) {
@@ -55,14 +60,43 @@ bool or_bools_into_bits(const Span<bool> bools,
   }
 #endif
 
-  /* Process remaining bools. */
-  for (; bool_i < bools.size(); bool_i++) {
-    if (bools_[bool_i]) {
-      r_bits[bool_i].set();
+  /* Process remaining bytes. */
+  for (; byte_i < bytes.size(); byte_i++) {
+    if (byte_to_bit.single(bytes_[byte_i])) {
+      r_bits[byte_i].set();
       any_true = true;
     }
   }
   return any_true;
 }
 
+/** Conversion policy for #or_bytes_into_bits that treats every non-zero byte as true. */
+struct BoolToBit {
+  static bool single(const char c)
+  {
+    return bool(c);
+  }
+
+#if BLI_HAVE_SSE2
+  static uint16_t sse2_chunk(const __m128i chunk)
+  {
+    const __m128i zero_bytes = _mm_set1_epi8(0);
+    /* Compare them all against zero. The result is a mask of the form [0x00, 0xff, 0xff, ...]. */
+    const __m128i is_false_byte_mask = _mm_cmpeq_epi8(chunk, zero_bytes);
+    /* Compress the byte-mask into a bit mask. This takes one bit from each byte. */
+    const uint16_t is_false_mask = _mm_movemask_epi8(is_false_byte_mask);
+    /* Now we have a bit mask where each bit corresponds to an input byte. */
+    const uint16_t is_true_mask = ~is_false_mask;
+    return is_true_mask;
+  }
+#endif
+};
+
+bool or_bools_into_bits(const Span<bool> bools,
+                        MutableBitSpan r_bits,
+                        const int64_t allowed_overshoot)
+{
+  return or_bytes_into_bits(bools.cast<char>(), r_bits, allowed_overshoot, BoolToBit());
+}
+
 } // namespace blender::bits