Tools: add a utility to validate array sizes

The script check_source/static_check_size_comments.py can be run directly
or called via the convenience target "make check_size_comments".

Add a utility module: `line_number_utils` which implements
a version of `re.finditer` that includes line numbers & ranges.
Campbell Barton
2025-05-23 04:01:10 +00:00
parent 84694bf635
commit 0265b13399
4 changed files with 374 additions and 0 deletions
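To make the convention concrete, a minimal self-contained sketch (not part of the commit): the pattern below mirrors REGEX_SIZE_COMMENT_IN_ARRAY from the new script, and `MAX_NAME` is a hypothetical define equal to 64.

import re

# Mirrors REGEX_SIZE_COMMENT_IN_ARRAY from the script added below.
SIZE_COMMENT = re.compile(r"\[\/\*([^\]]+)\*\/\s*(\d+)\]")

# `MAX_NAME` is hypothetical; the checker compares the literal 64 against the define's value.
m = SIZE_COMMENT.search("char name[/*MAX_NAME*/ 64];")
assert m is not None
print(m.group(1), m.group(2))  # -> MAX_NAME 64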


@@ -64,6 +64,7 @@ Static Source Code Checking
* check_clang_array: Run blender source through clang array checking script (C & C++).
* check_struct_comments: Check struct member comments are correct (C & C++).
* check_size_comments: Check array size comments match defines/enums (C & C++).
* check_deprecated: Check if there is any deprecated code to remove.
* check_descriptions: Check for duplicate/invalid descriptions.
* check_licenses: Check license headers follow the SPDX license specification,
@@ -502,6 +503,10 @@ check_struct_comments: .FORCE
"$(BLENDER_DIR)/tools/check_source/static_check_clang.py" \ "$(BLENDER_DIR)/tools/check_source/static_check_clang.py" \
--checks=struct_comments --match=".*" --jobs=$(NPROCS) --checks=struct_comments --match=".*" --jobs=$(NPROCS)
check_size_comments: .FORCE
$(PYTHON) \
"$(BLENDER_DIR)/tools/check_source/static_check_size_comments.py"
check_clang_array: .FORCE check_clang_array: .FORCE
@$(CMAKE_CONFIG) @$(CMAKE_CONFIG)
@cd "$(BUILD_DIR)" ; \ @cd "$(BUILD_DIR)" ; \


@@ -33,6 +33,7 @@ PATHS: tuple[tuple[str, tuple[Any, ...], dict[str, str]], ...] = (
("tools/check_docs/", (), {}), ("tools/check_docs/", (), {}),
("tools/check_source/", (), {'MYPYPATH': "modules"}), ("tools/check_source/", (), {'MYPYPATH': "modules"}),
("tools/check_source/check_unused_defines.py", (), {'MYPYPATH': "../utils_maintenance/modules"}), ("tools/check_source/check_unused_defines.py", (), {'MYPYPATH': "../utils_maintenance/modules"}),
("tools/check_source/static_check_size_comments.py", (), {'MYPYPATH': "../utils_maintenance/modules"}),
("tools/config/", (), {}), ("tools/config/", (), {}),
("tools/triage/", (), {}), ("tools/triage/", (), {}),
("tools/utils/", (), {}), ("tools/utils/", (), {}),


@@ -0,0 +1,280 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2023 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later
r"""
Validates sizes in C/C++ sources written as: ``type name[/*MAX_NAME*/ 64]``
where ``MAX_NAME`` is expected to be a define equal to 64, otherwise a warning is reported.
"""
__all__ = (
"main",
)
import os
import sys
import re
THIS_DIR = os.path.dirname(__file__)
BASE_DIR = os.path.normpath(os.path.abspath(os.path.normpath(os.path.join(THIS_DIR, "..", ".."))))
sys.path.append(os.path.join(THIS_DIR, "..", "utils_maintenance", "modules"))
from batch_edit_text import run
import line_number_utils
# -----------------------------------------------------------------------------
# Utilities
# -----------------------------------------------------------------------------
# Local Settings
# TODO, move to config file
SOURCE_DIRS = (
"source",
)
SOURCE_EXT = (
# C/C++
".c", ".h", ".cpp", ".hpp", ".cc", ".hh", ".cxx", ".hxx", ".inl",
# Objective C
".m", ".mm",
# GLSL
".glsl",
)
# Mainly useful for development to check extraction & validation are working.
SHOW_SUCCESS = True
# -----------------------------------------------------------------------------
# Globals
# Map defines to a list of (filename-split, value) pairs.
global_defines: dict[
# The define ID.
str,
# Value(s), in case it's defined in multiple files.
list[
tuple[
# The `BASE_DIR` relative path (split by `os.sep`).
tuple[str, ...],
# The value of the define,
# a literal string with comments stripped out.
str,
],
],
] = {}
REGEX_ID_LITERAL = "[A-Za-z_][A-Za-z_0-9]*"
# Detect:
# `[/*ID*/ 64]`.
# `[/*ID - 2*/ 62]`.
REGEX_SIZE_COMMENT_IN_ARRAY = re.compile("\\[\\/\\*([^\\]]+)\\*\\/\\s*(\\d+)\\]")
# Detect: `#define ID 64`
REGEX_DEFINE_C_LIKE = re.compile("^\\s*#\\s*define\\s+(" + REGEX_ID_LITERAL + ")[ \t]+([^\n]+)", re.MULTILINE)
# Detect:
# `ID = 64,`
# `ID = 64`
REGEX_ENUM_C_LIKE = re.compile("^\\s*(" + REGEX_ID_LITERAL + ")\\s=\\s([^,\n]+)", re.MULTILINE)
# Detect ID's.
REGEX_ID_OR_NUMBER_C_LIKE = re.compile("[A-Za-z0-9_]+")
def extract_defines(filepath: str, data_src: str) -> None:
filepath_rel = os.path.relpath(filepath, BASE_DIR)
for regex_matcher in (REGEX_DEFINE_C_LIKE, REGEX_ENUM_C_LIKE):
for m in regex_matcher.finditer(data_src):
value_id = m.group(1)
value_literal = m.group(2)
# Weak comment stripping.
# This is (arguably) acceptable since the intent is to extract numbers,
# if developers feel the need to write lines such as:
# `#define VALUE_MAX /* Lets make some trouble! */ 64`
# Then they can consider if that's actually needed (sigh!)...
# Otherwise, we could replace this with a full parser such as CLANG,
# however this is a bit of a hassle to setup.
if "//" in value_literal:
value_literal = value_literal.split("//", 1)[0]
if "/*" in value_literal:
value_literal = value_literal.split("/*", 1)[0]
try:
global_defines[value_id].append((tuple(filepath_rel.split(os.sep)), value_literal))
except KeyError:
global_defines[value_id] = [(tuple(filepath_rel.split(os.sep)), value_literal)]
# Returning None indicates the file is not edited.
def path_score_distance(a: tuple[str, ...], b: tuple[str, ...]) -> tuple[int, int]:
"""
Compare two paths, to find which paths are "closer" to each-other.
This is used as a tie breaker when defines are found in multiple headers.
"""
count_shared = 0
range_min = min(len(a), len(b))
range_max = max(len(a), len(b))
for i in range(range_min):
if a[i] != b[i]:
break
count_shared += 1
count_nested = range_max - count_shared
# Negate shared so smaller is better.
# Less path nesting also gets priority.
return (-count_shared, count_nested)
def eval_define(
value_literal: str,
*,
default: str,
filepath_ref_split: tuple[str, ...],
) -> tuple[str, list[str]]:
failed: list[str] = []
def re_replace_fn(match: re.Match[str]) -> str:
value = match.group()
if value.isdigit():
return value
other_values = global_defines.get(value)
if other_values is None:
failed.append(value)
return value
if len(other_values) == 1:
other_filepath_split, other_literal = other_values[0]
else:
# Find the "closest" on the file system.
# In practice favor paths which are co-located works fairly well,
# needed as it's now known which headers ID's in a head *could* reference.
other_literal_best = ""
other_score_best = (0, 0)
other_filepath_split_best: tuple[str, ...] = ("",)
for other_filepath_split_test, other_literal_test in other_values:
other_score_test = path_score_distance(filepath_ref_split, other_filepath_split_test)
if (
# First time.
(not other_literal_best) or
# A lower score has been found (smaller is better).
(other_score_test < other_score_best)
):
other_literal_best = other_literal_test
other_score_best = other_score_test
other_filepath_split_best = other_filepath_split_test
del other_score_test
other_literal = other_literal_best
other_filepath_split = other_filepath_split_best
del other_literal_best, other_score_best, other_filepath_split_best
other_literal_eval, other_failed = eval_define(
other_literal,
default="",
filepath_ref_split=other_filepath_split,
)
if other_literal_eval:
return other_literal_eval
# `failed.append(value)` is also valid, report the gestured failure as its more likely to give insights
# into what went wrong.
failed.extend(other_failed)
return value
# Use integer division.
value_literal = value_literal.replace(r"/", r"//")
# Populates `failed`.
value_literal_eval = REGEX_ID_OR_NUMBER_C_LIKE.sub(re_replace_fn, value_literal)
if failed:
# One or more ID could not be found.
return default, failed
# This could use exception handling, don't unless it's needed though.
# pylint: disable-next=eval-used
return str(eval(value_literal_eval)), failed
def validate_sizes(filepath: str, data_src: str) -> None:
# Nicer for printing.
filepath_rel = os.path.relpath(filepath, BASE_DIR)
filepath_rel_split = tuple(filepath_rel.split(os.sep))
for m, line, (beg, end) in line_number_utils.finditer_with_line_numbers_and_bounds(
REGEX_SIZE_COMMENT_IN_ARRAY,
data_src,
):
del end
value_id = m.group(1)
value_literal = m.group(2)
value_eval, lookups_failed = eval_define(
value_id,
default="",
filepath_ref_split=filepath_rel_split,
)
data_line_column = "{:s}:{:d}:{:d}:".format(
filepath_rel,
line + 1,
# Place the cursor after the `[`.
(m.start(0) + 1) - beg,
)
if len(value_id.strip()) != len(value_id):
print("WARN:", data_line_column, "comment includes white-space")
continue
if lookups_failed:
print("WARN:", data_line_column, "[{:s}]".format(", ".join(lookups_failed)), "unknown")
continue
if value_literal != value_eval:
print("WARN:", data_line_column, value_id, "mismatch", "({:s} != {:s})".format(value_literal, value_eval))
continue
if SHOW_SUCCESS:
print("OK: ", data_line_column, "{:s}={:s},".format(value_id, value_literal))
# Returning None indicates the file is not edited.
def main() -> int:
# Extract defines.
run(
directories=[os.path.join(BASE_DIR, d) for d in SOURCE_DIRS],
is_text=lambda filepath: filepath.endswith(SOURCE_EXT),
text_operation=extract_defines,
# Can't be used if we want to accumulate in a global variable.
use_multiprocess=False,
)
# For predictable lookups on tie breakers.
# In practice it should almost never matter.
for values in global_defines.values():
if len(values) > 1:
values.sort()
# Validate sizes.
run(
directories=[os.path.join(BASE_DIR, d) for d in SOURCE_DIRS],
is_text=lambda filepath: filepath.endswith(SOURCE_EXT),
text_operation=validate_sizes,
# Can't be used if we want to accumulate in a global variable.
use_multiprocess=False,
)
return 0
if __name__ == "__main__":
sys.exit(main())
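The tie-breaking in path_score_distance is the subtlest part of the script, so here is a small usage sketch (the paths are hypothetical, pre-split by os.sep as the script does):

a = ("source", "blender", "makesdna", "DNA_scene_types.h")
b = ("source", "blender", "makesdna", "DNA_ID.h")
c = ("source", "blender", "editors", "include", "ED_view3d.hh")

print(path_score_distance(a, b))  # -> (-3, 1): 3 shared components, 1 unshared.
print(path_score_distance(a, c))  # -> (-2, 3): fewer shared, more unshared.
# Sorted ascending, (-3, 1) < (-2, 3), so `b` is the closer candidate to `a`.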


@@ -0,0 +1,88 @@
# SPDX-FileCopyrightText: 2025 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later
"""
When writing text checking utilities, it's not always straightforward
to find line numbers and ranges from an offset within the text.
This module provides helpers to efficiently do this.
The main utility is ``finditer_with_line_numbers_and_bounds``,
an alternative to ``re.finditer`` which yields line numbers and offsets
for the line bounds - useful for scanning files and reporting errors that include the line contents.
"""
__all__ = (
"finditer_newline_cache_compute",
"finditer_with_line_numbers_and_bounds",
"line_to_offset_range",
)
from collections.abc import (
Iterator,
)
import re as _re
def finditer_newline_cache_compute(text: str) -> tuple[dict[int, int], list[int]]:
"""
Return a tuple containing:
Offset to
"""
# Offset to line lookup.
offset_to_line_cache: dict[int, int] = {}
# Line to offset lookup.
line_to_offset_cache: list[int] = [0]
for i, m in enumerate(_re.finditer("\\n", text), 1):
ofs = m.start()
offset_to_line_cache[ofs] = i
line_to_offset_cache.append(ofs)
return offset_to_line_cache, line_to_offset_cache
def finditer_with_line_numbers_and_bounds(
pattern: str,
text: str,
*,
offset_to_line_cache: dict[int, int] | None = None,
flags: int = 0,
) -> Iterator[tuple[_re.Match[str], int, tuple[int, int]]]:
"""
A version of ``re.finditer`` that returns ``(match, line_number, line_bounds)``.
Note that ``offset_to_line_cache`` is the first return value from
``finditer_newline_cache_compute``.
This should be passed in if the iterator is called multiple times
on the same buffer, to avoid calculating this every time.
"""
if offset_to_line_cache is None:
offset_to_line_cache, line_to_offset_cache = finditer_newline_cache_compute(text)
del line_to_offset_cache
text_len = len(text)
for m in _re.finditer(pattern, text, flags):
if (beg := text.rfind("\n", 0, m.start())) == -1:
beg = 0
line_number = 0
else:
line_number = offset_to_line_cache[beg]
if (end := text.find("\n", m.end(), text_len)) == -1:
end = text_len
yield m, line_number, (beg, end)
def line_to_offset_range(line: int, offset_limit: int, line_to_offset_cache: list[int]) -> tuple[int, int]:
"""
Given an offset, return line bounds.
"""
assert line >= 0
beg = line_to_offset_cache[line]
end = line_to_offset_cache[line + 1] if (line + 1 < len(line_to_offset_cache)) else offset_limit
return beg, end
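A short usage sketch for the iterator and the line lookup (the sample text is illustrative):

import line_number_utils

text = "alpha\nbeta gamma\nbeta delta\n"
for m, line, (beg, end) in line_number_utils.finditer_with_line_numbers_and_bounds(r"beta", text):
    # `line` is 0-based; `beg` is the offset of the newline preceding the
    # matched line (or 0 on the first line), `end` the newline that ends it.
    print("{:d}: {!r}".format(line + 1, text[beg:end].lstrip("\n")))
# -> 2: 'beta gamma'
# -> 3: 'beta delta'

_, line_to_offset = line_number_utils.finditer_newline_cache_compute(text)
print(line_number_utils.line_to_offset_range(1, len(text), line_to_offset))  # -> (5, 16)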