/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup bli
*
* \brief Run length encoding for arrays.
*
* The intended use is to pre-process arrays before storing in #BArrayStore.
* This should be used in cases where arrays are likely to contain large contiguous spans
* of a single value (which don't de-duplicate so well).
*
* Intended for byte arrays as there is no special logic to handle alignment.
* Note that this could be supported and would be useful to de-duplicate
* repeating patterns of non-byte data.
*
* Notes:
* - For random data, the size overhead is only `sizeof(size_t[4])` (header & footer).
*
* - The main down-side in the case of random data is the cost of detecting that there are
* no spans to RLE encode, and of creating the "encoded" copy.
*
* - For an array made up of a single repeated value the resulting size
* will be `sizeof(size_t[3]) + sizeof(uint8_t)`.
*
* - This is not intended to be used for compression; it would be possible
* to use less memory by packing the size of short spans into fewer bits.
* This isn't done as it requires more computation when encoding.
*
* - This RLE implementation is a balance between working well for random bytes
* and for arrays containing large contiguous spans.
*
* There is *some* bias towards performing well with arrays containing contiguous spans,
* mainly because the benefits are greater there, and because RLE encoding is typically
* only used when the data is expected to be able to take advantage of RLE.
* Having said this - encoding random bytes must not be *slow* either.
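*
* A minimal round-trip sketch (illustrative only, the input buffer here is hypothetical):
*
* \code{.cc}
* const uint8_t bytes[4096] = {0};  // Hypothetical input, typically spans of repeated values.
* size_t data_enc_len = 0;
* uint8_t *data_enc = BLI_array_store_rle_encode(bytes, sizeof(bytes), 0, &data_enc_len);
* // Store/de-duplicate `data_enc` (e.g. in a #BArrayStore), later decode it back.
* uint8_t bytes_copy[sizeof(bytes)];
* BLI_array_store_rle_decode(data_enc, data_enc_len, bytes_copy, sizeof(bytes));
* BLI_assert(memcmp(bytes, bytes_copy, sizeof(bytes)) == 0);
* MEM_freeN(data_enc);
* \endcode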
*/
#include <cstdlib>
#include <cstring>
#include "MEM_guardedalloc.h"
#include "BLI_assert.h"
#include "BLI_utildefines.h"
#include "BLI_array_store.h" /* Own include. */
#include "BLI_strict_flags.h" /* IWYU pragma: keep. Keep last. */
/* -------------------------------------------------------------------- */
/** \name Internal Utilities
* \{ */
/**
* Use a faster method of scanning for a changed byte by stepping over larger (word-sized) values.
*
* NOTE(@ideasman42) In practice this gives ~3.5x overall speedup when encoding large arrays.
* For random data the performance is worse, about ~5% slower.
*/
#define USE_FIND_FASTPATH
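/**
* Return the offset of the first byte in `data` that is not equal to `value`,
* scanning from `offset` up to `size`. When all remaining bytes match, `size` is returned.
*/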
static size_t find_byte_not_equal_to(const uint8_t *data,
size_t offset,
const size_t size,
const uint8_t value)
{
BLI_assert(offset <= size);
#ifdef USE_FIND_FASTPATH
using fast_int = uintptr_t;
/* In the case of random data, exit early without entering the more involved steps. */
/* Calculate the minimum size which may use an optimized search. */
constexpr size_t min_size_for_fast_path = (
/* Pass 1: scans a fixed size. */
sizeof(size_t[2]) +
/* Pass 2: scans a fixed size but aligns to `fast_int`. */
sizeof(size_t) + sizeof(fast_int) +
/* Pass 3: trims the end of `data` by `fast_int`,
* added to ensure there is at least one item to read. */
sizeof(fast_int));
if (LIKELY(size - offset > min_size_for_fast_path)) {
/* Pass 1: Scan forward with a fixed size to check if an early exit
* is needed (this may exit on the first few bytes). */
const uint8_t *p = data + offset;
const uint8_t *p_end = p + sizeof(size_t[2]);
do {
if (LIKELY(*p != value)) {
return size_t(p - data);
}
p++;
} while (p < p_end);
/* `offset` is no longer valid and needs to be updated from `p` before use. */
/* Pass 2: Scan forward at least `sizeof(size_t)` bytes,
* aligned to the next `sizeof(fast_int)` aligned boundary. */
p_end = reinterpret_cast<const uint8_t *>(
((uintptr_t(p) + sizeof(size_t) + sizeof(fast_int)) & ~(sizeof(fast_int) - 1)));
do {
if (LIKELY(*p != value)) {
return size_t(p - data);
}
p++;
} while (p < p_end);
/* Pass 3: Scan forward the `fast_int` aligned chunks (the fast path).
* This block is responsible for scanning over large spans of contiguous bytes. */
/* At this point at least `sizeof(size_t[2])` bytes are all equal.
* Use `fast_int` aligned reads for a faster search. */
BLI_assert((uintptr_t(p) & (sizeof(fast_int) - 1)) == 0);
const fast_int *p_fast = reinterpret_cast<const fast_int *>(p);
/* Not aligned, but this doesn't matter as it's only used for comparison. */
const fast_int *p_fast_last = reinterpret_cast<const fast_int *>(data +
(size - sizeof(fast_int)));
BLI_assert(p_fast <= p_fast_last);
fast_int value_fast;
memset(&value_fast, value, sizeof(value_fast));
do {
/* Use unlikely given many of the previous bytes match. */
if (UNLIKELY(*p_fast != value_fast)) {
break;
}
p_fast++;
} while (p_fast <= p_fast_last);
offset = size_t(reinterpret_cast<const uint8_t *>(p_fast) - data);
/* Perform a byte level check on any trailing data. */
}
#endif /* USE_FIND_FASTPATH */
while ((offset < size) && (value == data[offset])) {
offset += 1;
}
return offset;
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name Private API
* \{ */
struct RLE_Head {
/**
* - When zero, this struct is interpreted as a #RLE_Literal.
* - When non-zero, this struct is interpreted as a #RLE_Span.
* The `value` is a `uint8_t` (to reduce the size of the struct).
*/
size_t span_size;
};
struct RLE_Literal {
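/** Overlaps #RLE_Head::span_size in the union (always zero for a literal run). */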
uint8_t _span_size_pad[sizeof(size_t)];
size_t value;
};
struct RLE_Span {
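/** Overlaps #RLE_Head::span_size in the union (the non-zero length of the span). */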
uint8_t _span_size_pad[sizeof(size_t)];
uint8_t value;
};
BLI_STATIC_ASSERT(sizeof(RLE_Span) == sizeof(size_t) + sizeof(uint8_t), "");
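/**
* Layout of the encoded buffer (written by #BLI_array_store_rle_encode,
* read back by #BLI_array_store_rle_decode):
*
* - A #RLE_Span stores a non-zero `size_t` span length followed by the single repeated byte,
* which the decoder expands with `memset`.
* - A #RLE_Literal stores a zero `size_t` followed by a `size_t` byte count,
* with that many raw (un-encoded) bytes copied directly after it.
* - The stream is terminated by a #RLE_Literal with both values set to zero.
*/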
struct RLE_Elem {
union {
RLE_Head head;
RLE_Span span;
RLE_Literal literal;
};
};
struct RLE_ElemChunk {
RLE_ElemChunk *next;
size_t links_num;
/** Size chunks so each allocation (including its overhead) fits within 4KB. */
RLE_Elem links[(4096 / sizeof(RLE_Elem)) -
(sizeof(RLE_ElemChunk *) + sizeof(size_t) + MEM_SIZE_OVERHEAD)];
};
BLI_STATIC_ASSERT(sizeof(RLE_ElemChunk) <= 4096 - MEM_SIZE_OVERHEAD, "");
struct RLE_ElemChunkIter {
RLE_ElemChunk *iter;
size_t link_curr;
};
static void rle_link_chunk_iter_new(RLE_ElemChunk *links_block, RLE_ElemChunkIter *link_block_iter)
{
link_block_iter->iter = links_block;
link_block_iter->link_curr = 0;
}
static RLE_Elem *rle_link_chunk_iter_step(RLE_ElemChunkIter *link_block_iter)
{
RLE_ElemChunk *link_block = link_block_iter->iter;
if (link_block_iter->link_curr < link_block->links_num) {
return &link_block->links[link_block_iter->link_curr++];
}
if (link_block->next) {
link_block = link_block_iter->iter = link_block->next;
link_block_iter->link_curr = 1;
return &link_block->links[0];
}
return nullptr;
}
static RLE_ElemChunk *rle_link_chunk_new()
{
RLE_ElemChunk *link_block = MEM_mallocN<RLE_ElemChunk>(__func__);
link_block->next = nullptr;
link_block->links_num = 0;
return link_block;
}
static void rle_link_chunk_free_all(RLE_ElemChunk *link_block)
{
while (RLE_ElemChunk *link_iter = link_block) {
link_block = link_iter->next;
MEM_freeN(link_iter);
}
}
static RLE_Elem *rle_link_chunk_elem_new(RLE_ElemChunk **link_block_p)
{
RLE_ElemChunk *link_block = *link_block_p;
if (UNLIKELY(link_block->links_num == ARRAY_SIZE(link_block->links))) {
RLE_ElemChunk *link_block_next = rle_link_chunk_new();
link_block->next = link_block_next;
link_block = link_block_next;
*link_block_p = link_block_next;
}
return &link_block->links[link_block->links_num++];
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name Public API
* \{ */
uint8_t *BLI_array_store_rle_encode(const uint8_t *data_dec,
const size_t data_dec_len,
const size_t data_enc_extra_size,
size_t *r_data_enc_len)
{
size_t data_enc_alloc_size = data_enc_extra_size +
sizeof(RLE_Literal); /* A single null terminator. */
/* Notes on the threshold used to choose between storing literal data and RLE encoding.
* From testing an array of ~4 million booleans.
*
* Regarding space efficiency:
*
* - For data with fewer changes: `sizeof(RLE_Literal)` (16 on a 64bit system) is optimal.
* The improvement varies between 5-20%.
* - For random data: `sizeof(RLE_Literal) + sizeof(size_t)` (24 on a 64bit system) is optimal.
* The improvement is only ~5% though.
*
* The encoding time is roughly the same with either threshold.
*/
constexpr size_t rle_skip_threshold = sizeof(RLE_Literal);
RLE_ElemChunk *link_blocks = rle_link_chunk_new();
RLE_ElemChunk *link_blocks_first = link_blocks;
/* Re-use results from scanning ahead (as needed). */
for (size_t ofs_dec = 0, span_skip_next = 1; ofs_dec < data_dec_len;) {
/* Scan ahead to detect the size of the non-RLE span. */
size_t ofs_dec_next = ofs_dec + span_skip_next;
span_skip_next = 1;
/* Detect and use the `span` if possible. */
uint8_t value_start = data_dec[ofs_dec];
ofs_dec_next = find_byte_not_equal_to(data_dec, ofs_dec_next, data_dec_len, value_start);
RLE_Elem *e = rle_link_chunk_elem_new(&link_blocks);
const size_t span = ofs_dec_next - ofs_dec;
if (span >= rle_skip_threshold) {
/* Catch off-by-one errors. */
BLI_assert(data_dec[ofs_dec] == data_dec[(ofs_dec + span) - 1]);
BLI_assert((ofs_dec + span == data_dec_len) ||
(data_dec[ofs_dec] != data_dec[(ofs_dec + span)]));
e->head.span_size = span;
e->span.value = value_start;
data_enc_alloc_size += sizeof(RLE_Span);
}
else {
/* A large enough span was not found;
* scan ahead to detect the size of the non-RLE span. */
/* Check the offset isn't at the very end of the array. */
size_t ofs_dec_test = ofs_dec_next + 1;
if (LIKELY(ofs_dec_test < data_dec_len)) {
/* The first value that changed, start searching here. */
size_t ofs_dec_test_start = ofs_dec_next;
value_start = data_dec[ofs_dec_test_start];
while (true) {
if (value_start == data_dec[ofs_dec_test]) {
ofs_dec_test += 1;
const size_t span_test = ofs_dec_test - ofs_dec_test_start;
BLI_assert(span_test <= rle_skip_threshold);
if (span_test == rle_skip_threshold) {
/* Write the span of non-RLE data,
* then measure the length of the RLE span at the start of the outer loop. */
span_skip_next = span_test;
ofs_dec_next = ofs_dec_test_start;
break;
}
}
else {
BLI_assert(ofs_dec_test - ofs_dec_test_start < rle_skip_threshold);
value_start = data_dec[ofs_dec_test];
ofs_dec_test_start = ofs_dec_test;
ofs_dec_test += 1;
}
if (UNLIKELY(ofs_dec_test == data_dec_len)) {
ofs_dec_next = data_dec_len;
break;
}
}
}
else {
ofs_dec_next = data_dec_len;
}
/* Interleave the #RLE_Literal. */
const size_t non_rle_span = ofs_dec_next - ofs_dec;
e->head.span_size = 0;
e->literal.value = non_rle_span;
data_enc_alloc_size += sizeof(RLE_Literal) + non_rle_span;
}
ofs_dec = ofs_dec_next;
}
/* Encode RLE and literal data into this flat buffer. */
uint8_t *data_enc = MEM_malloc_arrayN<uint8_t>(data_enc_alloc_size, __func__);
data_enc += data_enc_extra_size;
size_t ofs_enc = 0;
size_t ofs_dec = 0;
RLE_ElemChunkIter link_block_iter;
rle_link_chunk_iter_new(link_blocks_first, &link_block_iter);
while (RLE_Elem *e = rle_link_chunk_iter_step(&link_block_iter)) {
BLI_assert(ofs_dec <= data_dec_len);
if (e->head.span_size) {
memcpy(data_enc + ofs_enc, &e->span, sizeof(RLE_Span));
ofs_enc += sizeof(RLE_Span);
ofs_dec += e->head.span_size;
}
else {
memcpy(data_enc + ofs_enc, &e->literal, sizeof(RLE_Literal));
ofs_enc += sizeof(RLE_Literal);
BLI_assert(e->literal.value > 0);
const size_t non_rle_span = e->literal.value;
memcpy(data_enc + ofs_enc, data_dec + ofs_dec, non_rle_span);
ofs_enc += non_rle_span;
ofs_dec += non_rle_span;
}
}
rle_link_chunk_free_all(link_blocks_first);
BLI_assert(data_enc_extra_size + ofs_enc + sizeof(RLE_Literal) == data_enc_alloc_size);
BLI_assert(ofs_dec == data_dec_len);
/* Set the `RLE_Literal` span & value to 0 to terminate. */
memset(data_enc + ofs_enc, 0x0, sizeof(RLE_Literal));
*r_data_enc_len = data_enc_alloc_size - data_enc_extra_size;
data_enc -= data_enc_extra_size;
return data_enc;
}
void BLI_array_store_rle_decode(const uint8_t *data_enc,
const size_t data_enc_len,
void *data_dec_v,
const size_t data_dec_len)
{
/* NOTE: `data_enc_len` & `data_dec_len` could be omitted.
* They're just to ensure data isn't corrupt. */
uint8_t *data_dec = reinterpret_cast<uint8_t *>(data_dec_v);
size_t ofs_enc = 0;
size_t ofs_dec = 0;
while (true) {
/* Copy as this may not be aligned. */
RLE_Head e;
memcpy(&e, data_enc + ofs_enc, sizeof(RLE_Head));
ofs_enc += sizeof(RLE_Head);
if (e.span_size != 0) {
/* Read #RLE_Span::value directly from memory. */
const uint8_t value = *reinterpret_cast<const uint8_t *>(data_enc + ofs_enc);
memset(data_dec + ofs_dec, int(value), e.span_size);
ofs_enc += sizeof(uint8_t);
ofs_dec += e.span_size;
}
else {
/* Read #RLE_Literal::value directly from memory. */
size_t non_rle_span;
memcpy(&non_rle_span, data_enc + ofs_enc, sizeof(size_t));
ofs_enc += sizeof(size_t);
if (non_rle_span) {
memcpy(data_dec + ofs_dec, data_enc + ofs_enc, non_rle_span);
ofs_enc += non_rle_span;
ofs_dec += non_rle_span;
}
else {
/* Both are zero - an end-of-buffer signal. */
break;
}
}
}
BLI_assert(ofs_enc == data_enc_len);
BLI_assert(ofs_dec == data_dec_len);
UNUSED_VARS_NDEBUG(data_enc_len, data_dec_len);
}
/** \} */