Geometry Nodes: deduplicate arrays in baked data

This adds hash-based data deduplication when baking in
geometry nodes. All arrays that are written to `.blob` files
are hashed. If an array has the same hash as a previously
written array, it is not written again. Instead, a reference
to the previously written data is stored.
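
To illustrate the idea, here is a minimal, self-contained sketch. All
names (Slice, DeduplicatingWriter, the in-memory blob) are hypothetical,
it uses a simple 64-bit FNV-1a hash instead of the MD5 hash the commit
uses, and it ignores hash collisions entirely.

#include <cstdint>
#include <unordered_map>
#include <vector>

/* Where previously written bytes live inside the blob. */
struct Slice {
  int64_t offset;
  int64_t size;
};

/* FNV-1a over the raw bytes; a stand-in for the MD5 hashing in the commit. */
static uint64_t hash_bytes(const void *data, const int64_t size)
{
  const auto *bytes = static_cast<const unsigned char *>(data);
  uint64_t h = 1469598103934665603ull;
  for (int64_t i = 0; i < size; i++) {
    h = (h ^ bytes[i]) * 1099511628211ull;
  }
  return h;
}

class DeduplicatingWriter {
 protected:
  std::vector<char> blob_;
  std::unordered_map<uint64_t, Slice> slice_by_hash_;

 public:
  Slice write_deduplicated(const void *data, const int64_t size)
  {
    const uint64_t key = hash_bytes(data, size);
    const auto it = slice_by_hash_.find(key);
    if (it != slice_by_hash_.end()) {
      /* The same bytes were written before: reuse the existing slice. */
      return it->second;
    }
    /* New content: append it to the blob and remember its hash. */
    const Slice slice{int64_t(blob_.size()), size};
    const char *begin = static_cast<const char *>(data);
    blob_.insert(blob_.end(), begin, begin + size);
    slice_by_hash_.emplace(key, slice);
    return slice;
  }
};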

We already have a similar optimization, but that only worked
with data that was already implicitly shared. Doing this kind
of deduplication with implicitly shared data has the benefit
that the equality check is constant time. The hash-based
approach implemented here requires time linear in the size
of the array, but works on all kinds of data. Both optimizations
work together, so the hashing is skipped when possible.
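
The interplay can be sketched like this, building on the hypothetical
DeduplicatingWriter above. In the actual code the constant-time path is
BlobWriteSharing::write_implicitly_shared, keyed on ImplicitSharingInfo
pointers; everything below is illustrative only.

/* Opaque handle identifying an implicitly shared buffer. */
struct ImplicitSharingInfo;

class SharingAwareWriter : public DeduplicatingWriter {
  std::unordered_map<const ImplicitSharingInfo *, Slice> slice_by_sharing_info_;

 public:
  Slice write(const ImplicitSharingInfo *sharing_info, const void *data, const int64_t size)
  {
    if (sharing_info != nullptr) {
      const auto it = slice_by_sharing_info_.find(sharing_info);
      if (it != slice_by_sharing_info_.end()) {
        /* Constant time: this exact shared array was written before. */
        return it->second;
      }
    }
    /* Linear time: hash the bytes to catch arrays that are equal but were
     * allocated independently, e.g. regenerated on every frame. */
    const Slice slice = this->write_deduplicated(data, size);
    if (sharing_info != nullptr) {
      slice_by_sharing_info_.emplace(sharing_info, slice);
    }
    return slice;
  }
};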

The hash-based deduplication primarily benefits cases where
the data is regenerated on each frame, so the data is not
shared between frames. One example used to require 2.9 GB
of disk space; now it requires only 542 MB. Additionally, the
duplicate arrays will now be implicitly shared between frames
when reading the baked data later.
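
The read-side sharing can be pictured with the following hypothetical
sketch (this is not Blender's BlobReadSharing interface). A slice is
identified by its blob name, offset and size; loading the same slice a
second time returns the buffer that is already in memory, so frames
that reference identical arrays end up sharing it.

#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

/* Blob file name, offset and size uniquely identify stored bytes. */
using SliceKey = std::tuple<std::string, int64_t, int64_t>;

class SliceCache {
  std::map<SliceKey, std::shared_ptr<std::vector<char>>> loaded_;

 public:
  std::shared_ptr<std::vector<char>> read(const SliceKey &key)
  {
    const auto it = loaded_.find(key);
    if (it != loaded_.end()) {
      /* This slice was loaded before: hand out the same buffer again. */
      return it->second;
    }
    auto buffer = std::make_shared<std::vector<char>>(this->load_bytes(key));
    loaded_.emplace(key, buffer);
    return buffer;
  }

 private:
  std::vector<char> load_bytes(const SliceKey & /*key*/)
  {
    /* Placeholder: a real implementation would read the byte range from disk. */
    return {};
  }
};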

An extended version of this approach which also detects partial
duplicates is implemented in #117749.

Pull Request: https://projects.blender.org/blender/blender/pulls/117768
Author: Jacques Lucke
Date: 2024-02-02 22:33:00 +01:00
Parent: 17f075222f
Commit: 586fadd6d2
2 changed files with 56 additions and 11 deletions

Changed file 1 of 2:

@@ -72,6 +72,24 @@ class BlobWriteSharing : NonCopyable, NonMovable {
*/
Map<const ImplicitSharingInfo *, StoredByRuntimeValue> stored_by_runtime_;
struct SliceHash {
uint64_t a;
uint64_t b;
BLI_STRUCT_EQUALITY_OPERATORS_2(SliceHash, a, b)
uint64_t hash() const
{
return get_default_hash(this->a, this->b);
}
};
/**
* Remembers where data was stored based on the hash of the data. This allows us to skip writing
* the same array again if it has the same hash.
*/
Map<SliceHash, BlobSlice> slice_by_content_hash_;
public:
~BlobWriteSharing();
@@ -84,6 +102,14 @@ class BlobWriteSharing : NonCopyable, NonMovable {
[[nodiscard]] std::shared_ptr<io::serialize::DictionaryValue> write_implicitly_shared(
const ImplicitSharingInfo *sharing_info,
FunctionRef<std::shared_ptr<io::serialize::DictionaryValue>()> write_fn);
/**
* Checks if the given data was written before. If it was, it's not written again, but a
* reference to the previously written data is returned. If the data is new, it's written now.
* Its hash is remembered so that the same data won't be written again.
*/
[[nodiscard]] std::shared_ptr<io::serialize::DictionaryValue> write_deduplicated(
BlobWriter &writer, const void *data, int64_t size_in_bytes);
};
/**

Changed file 2 of 2:

@@ -13,6 +13,7 @@
#include "BLI_endian_defines.h"
#include "BLI_endian_switch.h"
#include "BLI_hash_md5.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_path_util.h"
@@ -131,6 +132,16 @@ DictionaryValuePtr BlobWriteSharing::write_implicitly_shared(
});
}
std::shared_ptr<io::serialize::DictionaryValue> BlobWriteSharing::write_deduplicated(
BlobWriter &writer, const void *data, const int64_t size_in_bytes)
{
SliceHash content_hash;
BLI_hash_md5_buffer(static_cast<const char *>(data), size_in_bytes, &content_hash);
const BlobSlice slice = slice_by_content_hash_.lookup_or_add_cb(
content_hash, [&]() { return writer.write(data, size_in_bytes); });
return slice.serialize();
}
std::optional<ImplicitSharingInfoAndData> BlobReadSharing::read_shared(
const DictionaryValue &io_data,
FunctionRef<std::optional<ImplicitSharingInfoAndData>()> read_fn) const
@@ -202,9 +213,12 @@ static std::optional<eCustomDataType> get_data_type_from_io_name(const StringRef
* Write the data and remember which endianness the data had.
*/
static std::shared_ptr<DictionaryValue> write_blob_raw_data_with_endian(
BlobWriter &blob_writer, const void *data, const int64_t size_in_bytes)
BlobWriter &blob_writer,
BlobWriteSharing &blob_sharing,
const void *data,
const int64_t size_in_bytes)
{
auto io_data = blob_writer.write(data, size_in_bytes).serialize();
auto io_data = blob_sharing.write_deduplicated(blob_writer, data, size_in_bytes);
if (ENDIAN_ORDER == B_ENDIAN) {
io_data->append_str("endian", get_endian_io_name(ENDIAN_ORDER));
}
@@ -255,10 +269,11 @@ static std::shared_ptr<DictionaryValue> write_blob_raw_data_with_endian(
/** Write bytes ignoring endianness. */
static std::shared_ptr<DictionaryValue> write_blob_raw_bytes(BlobWriter &blob_writer,
BlobWriteSharing &blob_sharing,
const void *data,
const int64_t size_in_bytes)
{
return blob_writer.write(data, size_in_bytes).serialize();
return blob_sharing.write_deduplicated(blob_writer, data, size_in_bytes);
}
/** Read bytes ignoring endianness. */
@@ -278,14 +293,16 @@ static std::shared_ptr<DictionaryValue> write_blob_raw_bytes(BlobWriter &blob_wr
}
static std::shared_ptr<DictionaryValue> write_blob_simple_gspan(BlobWriter &blob_writer,
BlobWriteSharing &blob_sharing,
const GSpan data)
{
const CPPType &type = data.type();
BLI_assert(type.is_trivial());
if (type.size() == 1 || type.is<ColorGeometry4b>()) {
return write_blob_raw_bytes(blob_writer, data.data(), data.size_in_bytes());
return write_blob_raw_bytes(blob_writer, blob_sharing, data.data(), data.size_in_bytes());
}
return write_blob_raw_data_with_endian(blob_writer, data.data(), data.size_in_bytes());
return write_blob_raw_data_with_endian(
blob_writer, blob_sharing, data.data(), data.size_in_bytes());
}
[[nodiscard]] static bool read_blob_simple_gspan(const BlobReader &blob_reader,
@@ -327,7 +344,7 @@ static std::shared_ptr<DictionaryValue> write_blob_shared_simple_gspan(
const ImplicitSharingInfo *sharing_info)
{
return blob_sharing.write_implicitly_shared(
sharing_info, [&]() { return write_blob_simple_gspan(blob_writer, data); });
sharing_info, [&]() { return write_blob_simple_gspan(blob_writer, blob_sharing, data); });
}
[[nodiscard]] static const void *read_blob_shared_simple_gspan(
@@ -814,10 +831,11 @@ static std::shared_ptr<DictionaryValue> serialize_geometry_set(const GeometrySet
serialize_geometry_set(reference.geometry_set(), blob_writer, blob_sharing));
}
io_instances->append("transforms",
write_blob_simple_gspan(blob_writer, instances.transforms()));
io_instances->append("handles",
write_blob_simple_gspan(blob_writer, instances.reference_handles()));
io_instances->append(
"transforms", write_blob_simple_gspan(blob_writer, blob_sharing, instances.transforms()));
io_instances->append(
"handles",
write_blob_simple_gspan(blob_writer, blob_sharing, instances.reference_handles()));
auto io_attributes = serialize_attributes(
instances.attributes(), blob_writer, blob_sharing, {"position"});
@@ -1037,7 +1055,8 @@ static void serialize_bake_item(const BakeItem &item,
r_io_item.append_str("data", string_state_item->value());
}
else {
r_io_item.append("data", write_blob_raw_bytes(blob_writer, str.data(), str.size()));
r_io_item.append("data",
write_blob_raw_bytes(blob_writer, blob_sharing, str.data(), str.size()));
}
}
else if (const auto *primitive_state_item = dynamic_cast<const PrimitiveBakeItem *>(&item)) {