Core: extract blendfile_header.py as common utility for parsing .blend files

This new file can parse the file header (first few bytes) as well as the block headers. Right now, it is used in two places:

* `blendfile.py`, which is used by `blend2json.py`
* `blend_render_info.py`

This new module is shipped with Blender because it's needed for `blend_render_info.py`, which is shipped with Blender too. This makes using it in `blendfile.py` (which is not shipped with Blender) a bit more annoying. However, the situation is already not ideal, because e.g. `blend2json` already has to extend `sys.path` to be able to import `blendfile.py`.

This new file could also be used by blender-asset-tracer (BAT).

The new `BlendFileHeader` and `BlockHeader` types may be subclassed by code using them when that code wants to store additional derived data (`blendfile.py` and BAT need this).

New tests have been added that check that the file and block headers are parsed correctly for different kinds of .blend files.

Pull Request: https://projects.blender.org/blender/blender/pulls/140341
2025-06-23 12:53:55 +02:00
parent a5399af388
commit f0c7e52ff2
10 changed files with 468 additions and 220 deletions
--- a/scripts/modules/blend_render_info.py
+++ b/scripts/modules/blend_render_info.py
@@ -4,20 +4,14 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 # This module can get render info without running from inside blender.
-#
-# This struct won't change according to Ton.
-# Note that the size differs on 32/64bit
-#
-# typedef struct BHead {
-#     int code, len;
-#     void *old;
-#     int SDNAnr, nr;
-# } BHead;
+

 __all__ = (
    "read_blend_rend_chunk",
 )

+import blendfile_header
+

 class RawBlendFileReader:
    """
@@ -64,75 +58,51 @@ class RawBlendFileReader:
        return False


+def get_render_info_structure(endian_str, size):
+    import struct
+    # The maximum size of the scene name changed over time, so create a different
+    # structure depending on the size of the entire block.
+    if size == 2 * 4 + 24:
+        return struct.Struct(endian_str + b'ii24s')
+    if size == 2 * 4 + 64:
+        return struct.Struct(endian_str + b'ii64s')
+    if size == 2 * 4 + 256:
+        return struct.Struct(endian_str + b'ii256s')
+    raise ValueError("Unknown REND chunk size: {:d}".format(size))
+
+
 def _read_blend_rend_chunk_from_file(blendfile, filepath):
    import struct
    import sys

    from os import SEEK_CUR

-    head = blendfile.read(7)
-    if head != b'BLENDER':
+    try:
+        blender_header = blendfile_header.BlendFileHeader(blendfile)
+    except blendfile_header.BlendHeaderError:
        sys.stderr.write("Not a blend file: {:s}\n".format(filepath))
        return []

-    is_64_bit = (blendfile.read(1) == b'-')
-
-    # true for PPC, false for X86
-    is_big_endian = (blendfile.read(1) == b'V')
-
-    # Now read the bhead chunk!
-    blendfile.seek(3, SEEK_CUR)  # Skip the version.
-
    scenes = []

-    sizeof_bhead = 24 if is_64_bit else 20
+    endian_str = b'<' if blender_header.is_little_endian else b'>'

-    # Should always be 4, but a malformed/corrupt file may be less.
-    while (bhead_id := blendfile.read(4)) != b'ENDB':
-
-        if len(bhead_id) != 4:
-            sys.stderr.write("Unable to read until ENDB block (corrupt file): {:s}\n".format(filepath))
+    block_header_struct = blender_header.create_block_header_struct()
+    while bhead := blendfile_header.BlockHeader(blendfile, block_header_struct):
+        if bhead.code == b'ENDB':
            break
-
-        sizeof_data_left = struct.unpack('>i' if is_big_endian else '<i', blendfile.read(4))[0]
-        if sizeof_data_left < 0:
-            # Very unlikely, but prevent other errors.
-            sys.stderr.write("Negative block size found (corrupt file): {:s}\n".format(filepath))
-            break
-
-        # 4 from the `head_id`, another 4 for the size of the BHEAD.
-        sizeof_bhead_left = sizeof_bhead - 8
-
-        # The remainder of the BHEAD struct is not used.
-        blendfile.seek(sizeof_bhead_left, SEEK_CUR)
-
-        if bhead_id == b'REND':
-            # Now we want the scene name, start and end frame. this is 32bits long.
-            start_frame, end_frame = struct.unpack('>2i' if is_big_endian else '<2i', blendfile.read(8))
-            sizeof_data_left -= 8
-
-            scene_name = blendfile.read(64)
-            sizeof_data_left -= 64
-            if b'\0' not in scene_name:
-                if sizeof_data_left >= 192:
-                    # Assume new, up to 256 bytes name.
-                    scene_name += blendfile.read(192)
-                    sizeof_data_left -= 192
-            if b'\0' not in scene_name:
-                scene_name = scene_name[:-1] + b'\0'
+        remaining_bytes = bhead.size
+        if bhead.code == b'REND':
+            rend_block_struct = get_render_info_structure(endian_str, bhead.size)
+            start_frame, end_frame, scene_name = rend_block_struct.unpack(blendfile.read(rend_block_struct.size))
+            remaining_bytes -= rend_block_struct.size

            scene_name = scene_name[:scene_name.index(b'\0')]
            # It's possible old blend files are not UTF8 compliant, use `surrogateescape`.
            scene_name = scene_name.decode("utf8", errors="surrogateescape")
-
            scenes.append((start_frame, end_frame, scene_name))

-        if sizeof_data_left > 0:
-            blendfile.seek(sizeof_data_left, SEEK_CUR)
-        elif sizeof_data_left < 0:
-            # Very unlikely, but prevent attempting to further parse corrupt data.
-            sys.stderr.write("Error calculating next block (corrupt file): {:s}\n".format(filepath))
-            break
+        blendfile.seek(remaining_bytes, SEEK_CUR)

    return scenes

--- a/scripts/modules/blendfile_header.py
+++ b/scripts/modules/blendfile_header.py
@@ -0,0 +1,234 @@
+# SPDX-FileCopyrightText: 2025 Blender Authors
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+'''
+This module contains utility classes for reading headers in .blend files.
+
+This is a pure Python implementation of the corresponding C++ code in Blender
+in BLO_core_blend_header.hh and BLO_core_bhead.hh.
+'''
+
+import os
+import struct
+import typing
+
+from dataclasses import dataclass
+
+
+class BlendHeaderError(Exception):
+    pass
+
+
+@dataclass
+class BHead4:
+    """
+    Block header values for files with file format version 0 and 4 byte pointers.
+
+    Instances are created positionally from the unpacked struct values, so the
+    field order has to match the format built in
+    ``BlendFileHeader.create_block_header_struct``.
+    """
+    # Block code, unpacked as `4s`.
+    code: bytes
+    # Unpacked as `i`, exposed by `BlockHeader` as `size`.
+    len: int
+    # Unpacked as `I`, exposed by `BlockHeader` as `addr_old`.
+    old: int
+    # Unpacked as `i`, exposed by `BlockHeader` as `sdna_index`.
+    SDNAnr: int
+    # Unpacked as `i`, exposed by `BlockHeader` as `count`.
+    nr: int
+
+
+@dataclass
+class SmallBHead8:
+    """
+    Block header values for files with file format version 0 and 8 byte pointers.
+
+    Instances are created positionally from the unpacked struct values, so the
+    field order has to match the format built in
+    ``BlendFileHeader.create_block_header_struct``.
+    """
+    # Block code, unpacked as `4s`.
+    code: bytes
+    # Unpacked as `i`, exposed by `BlockHeader` as `size`.
+    len: int
+    # Unpacked as `Q`, exposed by `BlockHeader` as `addr_old`.
+    old: int
+    # Unpacked as `i`, exposed by `BlockHeader` as `sdna_index`.
+    SDNAnr: int
+    # Unpacked as `i`, exposed by `BlockHeader` as `count`.
+    nr: int
+
+
+@dataclass
+class LargeBHead8:
+    """
+    Block header values for files with file format version 1.
+
+    Instances are created positionally from the unpacked struct values, so the
+    field order has to match the format built in
+    ``BlendFileHeader.create_block_header_struct``. Note that this order
+    differs from the other block header types.
+    """
+    # Block code, unpacked as `4s`.
+    code: bytes
+    # Unpacked as `i`, exposed by `BlockHeader` as `sdna_index`.
+    SDNAnr: int
+    # Unpacked as `Q`, exposed by `BlockHeader` as `addr_old`.
+    old: int
+    # Unpacked as `q`, exposed by `BlockHeader` as `size`.
+    len: int
+    # Unpacked as `q`, exposed by `BlockHeader` as `count`.
+    nr: int
+
+
+@dataclass
+class BlockHeaderStruct:
+    # Binary format of the encoded header.
+    struct: struct.Struct
+    # Corresponding Python type for retrieving block header values.
+    type: typing.Type[typing.Union[BHead4, SmallBHead8, LargeBHead8]]
+
+    @property
+    def size(self) -> int:
+        return self.struct.size
+
+    def parse(self, data: bytes) -> typing.Union[BHead4, SmallBHead8, LargeBHead8]:
+        return self.type(*self.struct.unpack(data))
+
+
+class BlendFileHeader:
+    """
+    BlendFileHeader represents the first 12-17 bytes of a blend file.
+
+    It contains information about the hardware architecture, which is relevant
+    to the structure of the rest of the file.
+    """
+
+    # Always 'BLENDER'.
+    magic: bytes
+    # Currently always 0 or 1.
+    file_format_version: int
+    # Either 4 or 8.
+    pointer_size: int
+    # Endianness of values stored in the file.
+    is_little_endian: bool
+    # Blender version the file has been written with.
+    # The last two digits are the minor version. So 280 is 2.80.
+    version: int
+
+    def __init__(self, file: typing.IO[bytes]) -> None:
+        file.seek(0, os.SEEK_SET)
+
+        bytes_0_6 = file.read(7)
+        if bytes_0_6 != b'BLENDER':
+            raise BlendHeaderError("invalid first bytes %r" % bytes_0_6)
+        self.magic = bytes_0_6
+
+        byte_7 = file.read(1)
+        is_legacy_header = byte_7 in (b'_', b'-')
+        if is_legacy_header:
+            self.file_format_version = 0
+            if byte_7 == b'_':
+                self.pointer_size = 4
+            elif byte_7 == b'-':
+                self.pointer_size = 8
+            else:
+                raise BlendHeaderError("invalid pointer size %r" % byte_7)
+            byte_8 = file.read(1)
+            if byte_8 == b'v':
+                self.is_little_endian = True
+            elif byte_8 == b'V':
+                self.is_little_endian = False
+            else:
+                raise BlendHeaderError("invalid endian indicator %r" % byte_8)
+            bytes_9_11 = file.read(3)
+            self.version = int(bytes_9_11)
+        else:
+            byte_8 = file.read(1)
+            header_size = int(byte_7 + byte_8)
+            if header_size != 17:
+                raise BlendHeaderError("unknown file header size %d" % header_size)
+            byte_9 = file.read(1)
+            if byte_9 != b'-':
+                raise BlendHeaderError("invalid file header")
+            self.pointer_size = 8
+            byte_10_11 = file.read(2)
+            self.file_format_version = int(byte_10_11)
+            if self.file_format_version != 1:
+                raise BlendHeaderError("unsupported file format version %r" % self.file_format_version)
+            byte_12 = file.read(1)
+            if byte_12 != b'v':
+                raise BlendHeaderError("invalid file header")
+            self.is_little_endian = True
+            byte_13_16 = file.read(4)
+            self.version = int(byte_13_16)
+
+    def create_block_header_struct(self) -> BlockHeaderStruct:
+        assert self.file_format_version in (0, 1)
+        endian_str = b'<' if self.is_little_endian else b'>'
+        if self.file_format_version == 1:
+            header_struct = struct.Struct(b''.join((
+                endian_str,
+                # LargeBHead8.code
+                b'4s',
+                # LargeBHead8.SDNAnr
+                b'i',
+                # LargeBHead8.old
+                b'Q',
+                # LargeBHead8.len
+                b'q',
+                # LargeBHead8.nr
+                b'q',
+            )))
+            return BlockHeaderStruct(header_struct, LargeBHead8)
+
+        if self.pointer_size == 4:
+            header_struct = struct.Struct(b''.join((
+                endian_str,
+                # BHead4.code
+                b'4s',
+                # BHead4.len
+                b'i',
+                # BHead4.old
+                b'I',
+                # BHead4.SDNAnr
+                b'i',
+                # BHead4.nr
+                b'i',
+            )))
+            return BlockHeaderStruct(header_struct, BHead4)
+
+        assert self.pointer_size == 8
+        header_struct = struct.Struct(b''.join((
+            endian_str,
+            # SmallBHead8.code
+            b'4s',
+            # SmallBHead8.len
+            b'i',
+            # SmallBHead8.old
+            b'Q',
+            # SmallBHead8.SDNAnr
+            b'i',
+            # SmallBHead8.nr
+            b'i',
+        )))
+        return BlockHeaderStruct(header_struct, SmallBHead8)
+
+
+class BlockHeader:
+    """
+    A .blend file consists of a sequence of blocks whereby each block has a header.
+    This class can parse a header block in a specific .blend file.
+
+    Note the binary representation of this header is different for different files.
+    This class provides a unified interface for these underlying representations.
+    """
+
+    __slots__ = (
+        "code",
+        "size",
+        "addr_old",
+        "sdna_index",
+        "count",
+    )
+
+    # Indicates the type of the block. See BLO_CODE_* in BLO_core_bhead.hh.
+    code: bytes
+    # Number of bytes in the block.
+    size: int
+    # Old pointer/identifier of the block.
+    addr_old: int
+    # DNA struct index of the data in the block.
+    sdna_index: int
+    # Number of DNA structures in the block.
+    count: int
+
+    def __init__(self, file: typing.IO[bytes], block_header_struct: BlockHeaderStruct) -> None:
+        data = file.read(block_header_struct.size)
+
+        if len(data) != block_header_struct.size:
+            if len(data) != 8:
+                raise BlendHeaderError("invalid block header size")
+            legacy_endb = struct.Struct(b'4sI')
+            endb_header = legacy_endb.unpack(data)
+            if endb_header[0] != b'ENDB':
+                raise BlendHeaderError("invalid block header")
+            self.code = b'ENDB'
+            self.size = 0
+            self.addr_old = 0
+            self.sdna_index = 0
+            self.count = 0
+            return
+
+        header = block_header_struct.parse(data)
+        self.code = header.code.partition(b'\0')[0]
+        self.size = header.len
+        self.addr_old = header.old
+        self.sdna_index = header.SDNAnr
+        self.count = header.nr