From 586fadd6d2747a313f0ec9c7f4e45bde10d08efe Mon Sep 17 00:00:00 2001 From: Jacques Lucke Date: Fri, 2 Feb 2024 22:33:00 +0100 Subject: [PATCH] Geometry Nodes: deduplicate arrays in baked data This adds hash-based data deduplication when baking in geometry nodes. All arrays that are written to `.blob` files are hashed. If an array is detected to have the same hash as a previously written array, it is not written again. Instead the same memory is reused. We already have a similar optimization, but that only worked with data that was already implicitly shared. Doing this kind of deduplication with implicitly shared data has the benefit that the equality check is constant time. The hash-based approach implemented here requires linear time in the size of the array, but works on all kinds of data. Both optimizations work together. So the hashing is skipped if possible. The hash-based deduplication primarily benefits cases where the data is regenerated on each frame, so the data between frames is not shared. One example used to require 2.9 GB disk space. Now it only requires 542 MB. Additionally, the duplicate arrays will now be implicitly shared between frames when reading the baked data later. An extended version of this approach which also detects partial duplicates is implemented in #117749. 
Pull Request: https://projects.blender.org/blender/blender/pulls/117768 --- .../blenkernel/BKE_bake_items_serialize.hh | 26 ++++++++++++ .../blenkernel/intern/bake_items_serialize.cc | 41 ++++++++++++++----- 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/source/blender/blenkernel/BKE_bake_items_serialize.hh b/source/blender/blenkernel/BKE_bake_items_serialize.hh index 4f73b20e9f8..349b8c41ce9 100644 --- a/source/blender/blenkernel/BKE_bake_items_serialize.hh +++ b/source/blender/blenkernel/BKE_bake_items_serialize.hh @@ -72,6 +72,24 @@ class BlobWriteSharing : NonCopyable, NonMovable { */ Map stored_by_runtime_; + struct SliceHash { + uint64_t a; + uint64_t b; + + BLI_STRUCT_EQUALITY_OPERATORS_2(SliceHash, a, b) + + uint64_t hash() const + { + return get_default_hash(this->a, this->b); + } + }; + + /** + * Remembers where data was stored based on the hash of the data. This allows us to skip writing + * the same array again if it has the same hash. + */ + Map slice_by_content_hash_; + public: ~BlobWriteSharing(); @@ -84,6 +102,14 @@ class BlobWriteSharing : NonCopyable, NonMovable { [[nodiscard]] std::shared_ptr write_implicitly_shared( const ImplicitSharingInfo *sharing_info, FunctionRef()> write_fn); + + /** + * Checks if the given data was written before. If it was, it's not written again, but a + * reference to the previously written data is returned. If the data is new, it's written now. + * Its hash is remembered so that the same data won't be written again. 
+ */ + [[nodiscard]] std::shared_ptr write_deduplicated( + BlobWriter &writer, const void *data, int64_t size_in_bytes); }; /** diff --git a/source/blender/blenkernel/intern/bake_items_serialize.cc b/source/blender/blenkernel/intern/bake_items_serialize.cc index 55c36c21c9d..938d3bcc46e 100644 --- a/source/blender/blenkernel/intern/bake_items_serialize.cc +++ b/source/blender/blenkernel/intern/bake_items_serialize.cc @@ -13,6 +13,7 @@ #include "BLI_endian_defines.h" #include "BLI_endian_switch.h" +#include "BLI_hash_md5.hh" #include "BLI_math_matrix_types.hh" #include "BLI_path_util.h" @@ -131,6 +132,16 @@ DictionaryValuePtr BlobWriteSharing::write_implicitly_shared( }); } +std::shared_ptr BlobWriteSharing::write_deduplicated( + BlobWriter &writer, const void *data, const int64_t size_in_bytes) +{ + SliceHash content_hash; + BLI_hash_md5_buffer(static_cast(data), size_in_bytes, &content_hash); + const BlobSlice slice = slice_by_content_hash_.lookup_or_add_cb( + content_hash, [&]() { return writer.write(data, size_in_bytes); }); + return slice.serialize(); +} + std::optional BlobReadSharing::read_shared( const DictionaryValue &io_data, FunctionRef()> read_fn) const @@ -202,9 +213,12 @@ static std::optional get_data_type_from_io_name(const StringRef * Write the data and remember which endianness the data had. */ static std::shared_ptr write_blob_raw_data_with_endian( - BlobWriter &blob_writer, const void *data, const int64_t size_in_bytes) + BlobWriter &blob_writer, + BlobWriteSharing &blob_sharing, + const void *data, + const int64_t size_in_bytes) { - auto io_data = blob_writer.write(data, size_in_bytes).serialize(); + auto io_data = blob_sharing.write_deduplicated(blob_writer, data, size_in_bytes); if (ENDIAN_ORDER == B_ENDIAN) { io_data->append_str("endian", get_endian_io_name(ENDIAN_ORDER)); } @@ -255,10 +269,11 @@ static std::shared_ptr write_blob_raw_data_with_endian( /** Write bytes ignoring endianness. 
*/ static std::shared_ptr write_blob_raw_bytes(BlobWriter &blob_writer, + BlobWriteSharing &blob_sharing, const void *data, const int64_t size_in_bytes) { - return blob_writer.write(data, size_in_bytes).serialize(); + return blob_sharing.write_deduplicated(blob_writer, data, size_in_bytes); } /** Read bytes ignoring endianness. */ @@ -278,14 +293,16 @@ static std::shared_ptr write_blob_raw_bytes(BlobWriter &blob_wr } static std::shared_ptr write_blob_simple_gspan(BlobWriter &blob_writer, + BlobWriteSharing &blob_sharing, const GSpan data) { const CPPType &type = data.type(); BLI_assert(type.is_trivial()); if (type.size() == 1 || type.is()) { - return write_blob_raw_bytes(blob_writer, data.data(), data.size_in_bytes()); + return write_blob_raw_bytes(blob_writer, blob_sharing, data.data(), data.size_in_bytes()); } - return write_blob_raw_data_with_endian(blob_writer, data.data(), data.size_in_bytes()); + return write_blob_raw_data_with_endian( + blob_writer, blob_sharing, data.data(), data.size_in_bytes()); } [[nodiscard]] static bool read_blob_simple_gspan(const BlobReader &blob_reader, @@ -327,7 +344,7 @@ static std::shared_ptr write_blob_shared_simple_gspan( const ImplicitSharingInfo *sharing_info) { return blob_sharing.write_implicitly_shared( - sharing_info, [&]() { return write_blob_simple_gspan(blob_writer, data); }); + sharing_info, [&]() { return write_blob_simple_gspan(blob_writer, blob_sharing, data); }); } [[nodiscard]] static const void *read_blob_shared_simple_gspan( @@ -814,10 +831,11 @@ static std::shared_ptr serialize_geometry_set(const GeometrySet serialize_geometry_set(reference.geometry_set(), blob_writer, blob_sharing)); } - io_instances->append("transforms", - write_blob_simple_gspan(blob_writer, instances.transforms())); - io_instances->append("handles", - write_blob_simple_gspan(blob_writer, instances.reference_handles())); + io_instances->append( + "transforms", write_blob_simple_gspan(blob_writer, blob_sharing, instances.transforms())); + 
io_instances->append( + "handles", + write_blob_simple_gspan(blob_writer, blob_sharing, instances.reference_handles())); auto io_attributes = serialize_attributes( instances.attributes(), blob_writer, blob_sharing, {"position"}); @@ -1037,7 +1055,8 @@ static void serialize_bake_item(const BakeItem &item, r_io_item.append_str("data", string_state_item->value()); } else { - r_io_item.append("data", write_blob_raw_bytes(blob_writer, str.data(), str.size())); + r_io_item.append("data", + write_blob_raw_bytes(blob_writer, blob_sharing, str.data(), str.size())); } } else if (const auto *primitive_state_item = dynamic_cast(&item)) {