Geometry Nodes: deduplicate arrays in baked data
This adds hash-based data deduplication when baking in geometry nodes. All arrays that are written to `.blob` files are hashed. If an array is detected to have the same hash as a previously written array, it is not written again; instead, the same memory is reused.

We already have a similar optimization, but it only worked with data that was already implicitly shared. Deduplicating implicitly shared data has the benefit that the equality check is constant time. The hash-based approach implemented here requires time linear in the size of the array, but works on all kinds of data. Both optimizations work together, so the hashing is skipped when possible.

The hash-based deduplication primarily benefits cases where the data is regenerated on each frame, so the data is not shared between frames. One example used to require 2.9 GB of disk space; now it requires only 542 MB. Additionally, the duplicate arrays will now be implicitly shared between frames when reading the baked data later.

An extended version of this approach, which also detects partial duplicates, is implemented in #117749.

Pull Request: https://projects.blender.org/blender/blender/pulls/117768
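To illustrate the idea, here is a minimal, self-contained sketch of content-hash deduplication of arrays appended to a blob. It is not Blender's API: `DeduplicatingBlobWriter` and `Slice` are made up for this sketch, and `std::hash` stands in for the 128-bit MD5 hash used in the actual change; the real code additionally skips hashing entirely when the data is already known via implicit sharing.

// Minimal sketch of hash-based blob deduplication (illustrative only, not Blender's API).
#include <cstdint>
#include <functional>
#include <iostream>
#include <string_view>
#include <unordered_map>
#include <vector>

/* Where a piece of data ended up inside the blob. */
struct Slice {
  int64_t offset = 0;
  int64_t size = 0;
};

class DeduplicatingBlobWriter {
 public:
  Slice write(const void *data, const int64_t size_in_bytes)
  {
    /* Hash the raw bytes. The actual commit uses a 128-bit MD5 hash so that collisions are
     * practically impossible; std::hash is only a stand-in here. */
    const std::string_view bytes(static_cast<const char *>(data), size_t(size_in_bytes));
    const uint64_t content_hash = std::hash<std::string_view>{}(bytes);

    /* Same content as a previously written array: reuse its slice and write nothing. */
    if (const auto it = slice_by_content_hash_.find(content_hash);
        it != slice_by_content_hash_.end())
    {
      return it->second;
    }

    /* New content: append it to the blob and remember where it landed. */
    const Slice slice{int64_t(blob_.size()), size_in_bytes};
    blob_.insert(blob_.end(), bytes.begin(), bytes.end());
    slice_by_content_hash_.emplace(content_hash, slice);
    return slice;
  }

  int64_t blob_size() const
  {
    return int64_t(blob_.size());
  }

 private:
  std::vector<char> blob_;
  std::unordered_map<uint64_t, Slice> slice_by_content_hash_;
};

int main()
{
  DeduplicatingBlobWriter writer;

  /* Two frames regenerate the same positions, so the arrays are equal but not shared. */
  const std::vector<float> frame_1(1000, 1.0f);
  const std::vector<float> frame_2(1000, 1.0f);

  const Slice a = writer.write(frame_1.data(), int64_t(frame_1.size() * sizeof(float)));
  const Slice b = writer.write(frame_2.data(), int64_t(frame_2.size() * sizeof(float)));

  /* Both slices point at the same place and the bytes are stored only once. */
  std::cout << "offsets: " << a.offset << " and " << b.offset << "\n";
  std::cout << "blob size: " << writer.blob_size() << " bytes\n";
}

In the commit below, this role is played by `BlobWriteSharing::write_deduplicated()`, which keys a `Map<SliceHash, BlobSlice>` by an MD5 hash of the array and only calls `BlobWriter::write()` for hashes it has not seen before.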
@@ -72,6 +72,24 @@ class BlobWriteSharing : NonCopyable, NonMovable {
    */
   Map<const ImplicitSharingInfo *, StoredByRuntimeValue> stored_by_runtime_;
 
+  struct SliceHash {
+    uint64_t a;
+    uint64_t b;
+
+    BLI_STRUCT_EQUALITY_OPERATORS_2(SliceHash, a, b)
+
+    uint64_t hash() const
+    {
+      return get_default_hash(this->a, this->b);
+    }
+  };
+
+  /**
+   * Remembers where data was stored based on the hash of the data. This allows us to skip writing
+   * the same array again if it has the same hash.
+   */
+  Map<SliceHash, BlobSlice> slice_by_content_hash_;
+
  public:
   ~BlobWriteSharing();
@@ -84,6 +102,14 @@ class BlobWriteSharing : NonCopyable, NonMovable {
   [[nodiscard]] std::shared_ptr<io::serialize::DictionaryValue> write_implicitly_shared(
       const ImplicitSharingInfo *sharing_info,
       FunctionRef<std::shared_ptr<io::serialize::DictionaryValue>()> write_fn);
 
+  /**
+   * Checks if the given data was written before. If it was, it's not written again, but a
+   * reference to the previously written data is returned. If the data is new, it's written now.
+   * Its hash is remembered so that the same data won't be written again.
+   */
+  [[nodiscard]] std::shared_ptr<io::serialize::DictionaryValue> write_deduplicated(
+      BlobWriter &writer, const void *data, int64_t size_in_bytes);
 };
 
 /**
@@ -13,6 +13,7 @@
 
 #include "BLI_endian_defines.h"
 #include "BLI_endian_switch.h"
+#include "BLI_hash_md5.hh"
 #include "BLI_math_matrix_types.hh"
 #include "BLI_path_util.h"
@@ -131,6 +132,16 @@ DictionaryValuePtr BlobWriteSharing::write_implicitly_shared(
       });
 }
 
+std::shared_ptr<io::serialize::DictionaryValue> BlobWriteSharing::write_deduplicated(
+    BlobWriter &writer, const void *data, const int64_t size_in_bytes)
+{
+  SliceHash content_hash;
+  BLI_hash_md5_buffer(static_cast<const char *>(data), size_in_bytes, &content_hash);
+  const BlobSlice slice = slice_by_content_hash_.lookup_or_add_cb(
+      content_hash, [&]() { return writer.write(data, size_in_bytes); });
+  return slice.serialize();
+}
+
 std::optional<ImplicitSharingInfoAndData> BlobReadSharing::read_shared(
     const DictionaryValue &io_data,
     FunctionRef<std::optional<ImplicitSharingInfoAndData>()> read_fn) const
@@ -202,9 +213,12 @@ static std::optional<eCustomDataType> get_data_type_from_io_name(const StringRef
  * Write the data and remember which endianness the data had.
  */
 static std::shared_ptr<DictionaryValue> write_blob_raw_data_with_endian(
-    BlobWriter &blob_writer, const void *data, const int64_t size_in_bytes)
+    BlobWriter &blob_writer,
+    BlobWriteSharing &blob_sharing,
+    const void *data,
+    const int64_t size_in_bytes)
 {
-  auto io_data = blob_writer.write(data, size_in_bytes).serialize();
+  auto io_data = blob_sharing.write_deduplicated(blob_writer, data, size_in_bytes);
   if (ENDIAN_ORDER == B_ENDIAN) {
     io_data->append_str("endian", get_endian_io_name(ENDIAN_ORDER));
   }
@@ -255,10 +269,11 @@ static std::shared_ptr<DictionaryValue> write_blob_raw_data_with_endian(
 
 /** Write bytes ignoring endianness. */
 static std::shared_ptr<DictionaryValue> write_blob_raw_bytes(BlobWriter &blob_writer,
+                                                             BlobWriteSharing &blob_sharing,
                                                              const void *data,
                                                              const int64_t size_in_bytes)
 {
-  return blob_writer.write(data, size_in_bytes).serialize();
+  return blob_sharing.write_deduplicated(blob_writer, data, size_in_bytes);
 }
 
 /** Read bytes ignoring endianness. */
@@ -278,14 +293,16 @@ static std::shared_ptr<DictionaryValue> write_blob_raw_bytes(BlobWriter &blob_wr
 }
 
 static std::shared_ptr<DictionaryValue> write_blob_simple_gspan(BlobWriter &blob_writer,
+                                                                BlobWriteSharing &blob_sharing,
                                                                 const GSpan data)
 {
   const CPPType &type = data.type();
   BLI_assert(type.is_trivial());
   if (type.size() == 1 || type.is<ColorGeometry4b>()) {
-    return write_blob_raw_bytes(blob_writer, data.data(), data.size_in_bytes());
+    return write_blob_raw_bytes(blob_writer, blob_sharing, data.data(), data.size_in_bytes());
   }
-  return write_blob_raw_data_with_endian(blob_writer, data.data(), data.size_in_bytes());
+  return write_blob_raw_data_with_endian(
+      blob_writer, blob_sharing, data.data(), data.size_in_bytes());
 }
 
 [[nodiscard]] static bool read_blob_simple_gspan(const BlobReader &blob_reader,
@@ -327,7 +344,7 @@ static std::shared_ptr<DictionaryValue> write_blob_shared_simple_gspan(
     const ImplicitSharingInfo *sharing_info)
 {
   return blob_sharing.write_implicitly_shared(
-      sharing_info, [&]() { return write_blob_simple_gspan(blob_writer, data); });
+      sharing_info, [&]() { return write_blob_simple_gspan(blob_writer, blob_sharing, data); });
 }
 
 [[nodiscard]] static const void *read_blob_shared_simple_gspan(
@@ -814,10 +831,11 @@ static std::shared_ptr<DictionaryValue> serialize_geometry_set(const GeometrySet
         serialize_geometry_set(reference.geometry_set(), blob_writer, blob_sharing));
   }
 
-  io_instances->append("transforms",
-                       write_blob_simple_gspan(blob_writer, instances.transforms()));
-  io_instances->append("handles",
-                       write_blob_simple_gspan(blob_writer, instances.reference_handles()));
+  io_instances->append(
+      "transforms", write_blob_simple_gspan(blob_writer, blob_sharing, instances.transforms()));
+  io_instances->append(
+      "handles",
+      write_blob_simple_gspan(blob_writer, blob_sharing, instances.reference_handles()));
 
   auto io_attributes = serialize_attributes(
       instances.attributes(), blob_writer, blob_sharing, {"position"});
@@ -1037,7 +1055,8 @@ static void serialize_bake_item(const BakeItem &item,
       r_io_item.append_str("data", string_state_item->value());
     }
     else {
-      r_io_item.append("data", write_blob_raw_bytes(blob_writer, str.data(), str.size()));
+      r_io_item.append("data",
+                       write_blob_raw_bytes(blob_writer, blob_sharing, str.data(), str.size()));
     }
   }
   else if (const auto *primitive_state_item = dynamic_cast<const PrimitiveBakeItem *>(&item)) {