BLI: optimize Vector move-construction for common cases
This optimizes the move-constructor for `blender::Vector` when all of the following are true:

* The source and destination vectors have exactly the same type.
* The stored type is trivial.
* The inline buffer is `<= 32` bytes large (this value is a heuristic that could be changed).

The basic idea of the optimization is that under these circumstances one can just copy the entire inline-buffer over instead of only copying it partially based on the vector size. While that can mean that more bytes have to be copied, the machine code that does the copying can be more efficient due to less branching and the hardcoded size.

The performance impact is quite measurable. Note that the speedup depends on how many elements are in the vector and thus how many elements of the inline buffer are used. The following table shows the move construction performance of a `Vector<void *, 4>`. Starting at 5 elements, the performance doesn't change much anymore, because the inline buffer is just ignored.

| Elements | Old  | New  |
|----------|------|------|
| 0        | 20.3 | 14.6 |
| 1        | 22.7 | 21.5 |
| 2        | 36.4 | 21.6 |
| 3        | 36.4 | 21.5 |
| 4        | 36.5 | 21.6 |
| 5        | 21.4 | 21.1 |
| 6        | 21.3 | 21.1 |
| 7        | 21.4 | 21.1 |
| 8        | 21.5 | 21.0 |
| 9        | 21.4 | 20.9 |
| 10       | 21.3 | 20.9 |

The binary size stays effectively unchanged (< 2kb change).

Pull Request: https://projects.blender.org/blender/blender/pulls/131841
This commit is contained in:
@@ -240,24 +240,47 @@ class Vector {
|
||||
is_nothrow_move_constructible())
|
||||
: Vector(NoExceptConstructor(), other.allocator_)
|
||||
{
|
||||
const int64_t size = other.size();
|
||||
|
||||
if (other.is_inline()) {
|
||||
/* This first check is not strictly necessary, but improves performance because it can be
|
||||
* done at compile time and makes the size check at run-time unnecessary. */
|
||||
if (OtherInlineBufferCapacity <= InlineBufferCapacity || size <= InlineBufferCapacity) {
|
||||
/* Copy between inline buffers. */
|
||||
uninitialized_relocate_n(other.begin_, size, begin_);
|
||||
end_ = begin_ + size;
|
||||
const int64_t size = other.size();
|
||||
|
||||
/* Optimize the case by copying the full inline buffer. */
|
||||
constexpr bool other_is_same_type = std::is_same_v<Vector, std::decay_t<decltype(other)>>;
|
||||
constexpr size_t max_full_copy_size = 32;
|
||||
if constexpr (other_is_same_type && std::is_trivial_v<T> &&
|
||||
sizeof(inline_buffer_) <= max_full_copy_size)
|
||||
{
|
||||
/* This check is technically optional. However, benchmarking shows that skipping work
|
||||
* for empty vectors (which is a common case) is worth the extra check even in the case
|
||||
* when the vector is not empty. */
|
||||
if (size > 0) {
|
||||
/* Copy the full inline buffer instead of only the used parts. This may copy
|
||||
* uninitialized values but allows producing more optimal code than when the copy size
|
||||
* would depend on a dynamic value. */
|
||||
memcpy(inline_buffer_, other.inline_buffer_, sizeof(inline_buffer_));
|
||||
this->increase_size_by_unchecked(size);
|
||||
/* Reset other vector. */
|
||||
other.end_ = other.inline_buffer_;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Copy from inline buffer to newly allocated buffer. */
|
||||
const int64_t capacity = size;
|
||||
begin_ = static_cast<T *>(
|
||||
allocator_.allocate(sizeof(T) * size_t(capacity), alignof(T), AT));
|
||||
capacity_end_ = begin_ + capacity;
|
||||
uninitialized_relocate_n(other.begin_, size, begin_);
|
||||
end_ = begin_ + size;
|
||||
/* This first check is not strictly necessary, but improves performance because it can be
|
||||
* done at compile time and makes the size check at run-time unnecessary. */
|
||||
if (OtherInlineBufferCapacity <= InlineBufferCapacity || size <= InlineBufferCapacity) {
|
||||
/* Copy between inline buffers. */
|
||||
uninitialized_relocate_n(other.begin_, size, begin_);
|
||||
end_ = begin_ + size;
|
||||
}
|
||||
else {
|
||||
/* Copy from inline buffer to newly allocated buffer. */
|
||||
const int64_t capacity = size;
|
||||
begin_ = static_cast<T *>(
|
||||
allocator_.allocate(sizeof(T) * size_t(capacity), alignof(T), AT));
|
||||
capacity_end_ = begin_ + capacity;
|
||||
uninitialized_relocate_n(other.begin_, size, begin_);
|
||||
end_ = begin_ + size;
|
||||
}
|
||||
/* Reset other vector. */
|
||||
other.end_ = other.inline_buffer_;
|
||||
}
|
||||
}
|
||||
else {
|
||||
@@ -265,11 +288,13 @@ class Vector {
|
||||
begin_ = other.begin_;
|
||||
end_ = other.end_;
|
||||
capacity_end_ = other.capacity_end_;
|
||||
|
||||
/* Reset other vector. */
|
||||
other.begin_ = other.inline_buffer_;
|
||||
other.end_ = other.inline_buffer_;
|
||||
other.capacity_end_ = other.inline_buffer_ + OtherInlineBufferCapacity;
|
||||
}
|
||||
|
||||
other.begin_ = other.inline_buffer_;
|
||||
other.end_ = other.begin_;
|
||||
other.capacity_end_ = other.begin_ + OtherInlineBufferCapacity;
|
||||
UPDATE_VECTOR_SIZE(this);
|
||||
UPDATE_VECTOR_SIZE(&other);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user