Tools: add a utility to validate array sizes
The script check_source/static_check_size_comments.py can be run directly or called via the convenience target "make check_size_comments". Add a utility module: `line_number_utils` which implements a version of `re.finditer` that includes line numbers & ranges.
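
For example, either of the following runs the check from a source checkout (illustrative invocations; the make target wraps the same script):

make check_size_comments
python3 tools/check_source/static_check_size_comments.py
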
@@ -64,6 +64,7 @@ Static Source Code Checking

* check_clang_array: Run blender source through clang array checking script (C & C++).
* check_struct_comments: Check struct member comments are correct (C & C++).
* check_size_comments: Check array size comments match defines/enums (C & C++).
* check_deprecated: Check if there is any deprecated code to remove.
* check_descriptions: Check for duplicate/invalid descriptions.
* check_licenses: Check license headers follow the SPDX license specification,
@@ -502,6 +503,10 @@ check_struct_comments: .FORCE
	"$(BLENDER_DIR)/tools/check_source/static_check_clang.py" \
	--checks=struct_comments --match=".*" --jobs=$(NPROCS)

check_size_comments: .FORCE
	$(PYTHON) \
		"$(BLENDER_DIR)/tools/check_source/static_check_size_comments.py"

check_clang_array: .FORCE
	@$(CMAKE_CONFIG)
	@cd "$(BUILD_DIR)" ; \
@@ -33,6 +33,7 @@ PATHS: tuple[tuple[str, tuple[Any, ...], dict[str, str]], ...] = (
    ("tools/check_docs/", (), {}),
    ("tools/check_source/", (), {'MYPYPATH': "modules"}),
    ("tools/check_source/check_unused_defines.py", (), {'MYPYPATH': "../utils_maintenance/modules"}),
    ("tools/check_source/static_check_size_comments.py", (), {'MYPYPATH': "../utils_maintenance/modules"}),
    ("tools/config/", (), {}),
    ("tools/triage/", (), {}),
    ("tools/utils/", (), {}),

280  tools/check_source/static_check_size_comments.py  Executable file
@@ -0,0 +1,280 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2023 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later

r"""
Validates sizes in C/C++ sources written as: ``type name[/*MAX_NAME*/ 64]``
where ``MAX_NAME`` is expected to be a define equal to 64, otherwise a warning is reported.
"""
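
# Illustrative note (not part of the committed file): with `#define MAX_NAME 64`
# in some header, a declaration written as `char name[/*MAX_NAME*/ 63];` is
# reported in the following form (the path, line & column here are made up):
#   WARN: source/some/file.c:12:10: MAX_NAME mismatch (63 != 64)
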
__all__ = (
    "main",
)

import os
import sys
import re

THIS_DIR = os.path.dirname(__file__)
BASE_DIR = os.path.normpath(os.path.abspath(os.path.normpath(os.path.join(THIS_DIR, "..", ".."))))
sys.path.append(os.path.join(THIS_DIR, "..", "utils_maintenance", "modules"))

from batch_edit_text import run
import line_number_utils


# -----------------------------------------------------------------------------
# Utilities


# -----------------------------------------------------------------------------
# Local Settings

# TODO, move to config file
SOURCE_DIRS = (
    "source",
)

SOURCE_EXT = (
    # C/C++
    ".c", ".h", ".cpp", ".hpp", ".cc", ".hh", ".cxx", ".hxx", ".inl",
    # Objective C
    ".m", ".mm",
    # GLSL
    ".glsl",
)

# Mainly useful for development to check extraction & validation are working.
SHOW_SUCCESS = True


# -----------------------------------------------------------------------------
# Globals


# Map defines to a list of (filename-split, value) pairs.
global_defines: dict[
    # The define ID.
    str,
    # Value(s), in case it's defined in multiple files.
    list[
        tuple[
            # The `BASE_DIR` relative path (split by `os.sep`).
            tuple[str, ...],
            # The value of the define,
            # a literal string with comments stripped out.
            str,
        ],
    ],
] = {}


REGEX_ID_LITERAL = "[A-Za-z_][A-Za-z_0-9]*"

# Detect:
# `[/*ID*/ 64]`.
# `[/*ID - 2*/ 62]`.
REGEX_SIZE_COMMENT_IN_ARRAY = re.compile("\\[\\/\\*([^\\]]+)\\*\\/\\s*(\\d+)\\]")
# Detect: `#define ID 64`
REGEX_DEFINE_C_LIKE = re.compile("^\\s*#\\s*define\\s+(" + REGEX_ID_LITERAL + ")[ \t]+([^\n]+)", re.MULTILINE)
# Detect:
# `ID = 64,`
# `ID = 64`
REGEX_ENUM_C_LIKE = re.compile("^\\s*(" + REGEX_ID_LITERAL + ")\\s=\\s([^,\n]+)", re.MULTILINE)
# Detect ID's.
REGEX_ID_OR_NUMBER_C_LIKE = re.compile("[A-Za-z0-9_]+")
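
# Illustrative example (not in the committed file) of what the size pattern
# extracts; the two groups are the comment text and the literal size:
#   >>> REGEX_SIZE_COMMENT_IN_ARRAY.findall("char name[/*MAX_NAME*/ 64];")
#   [('MAX_NAME', '64')]

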
def extract_defines(filepath: str, data_src: str) -> None:
    filepath_rel = os.path.relpath(filepath, BASE_DIR)
    for regex_matcher in (REGEX_DEFINE_C_LIKE, REGEX_ENUM_C_LIKE):
        for m in regex_matcher.finditer(data_src):
            value_id = m.group(1)
            value_literal = m.group(2)

            # Weak comment stripping.
            # This is (arguably) acceptable since the intent is to extract numbers,
            # if developers feel the need to write lines such as:
            # `#define VALUE_MAX /* Lets make some trouble! */ 64`
            # Then they can consider if that's actually needed (sigh!)...
            # Otherwise, we could replace this with a full parser such as CLANG,
            # however this is a bit of a hassle to setup.
            if "//" in value_literal:
                value_literal = value_literal.split("//", 1)[0]
            if "/*" in value_literal:
                value_literal = value_literal.split("/*", 1)[0]

            try:
                global_defines[value_id].append((tuple(filepath_rel.split(os.sep)), value_literal))
            except KeyError:
                global_defines[value_id] = [(tuple(filepath_rel.split(os.sep)), value_literal)]

    # Returning None indicates the file is not edited.


def path_score_distance(a: tuple[str, ...], b: tuple[str, ...]) -> tuple[int, int]:
    """
    Compare two paths, to find which paths are "closer" to each-other.
    This is used as a tie breaker when defines are found in multiple headers.
    """
    count_shared = 0
    range_min = min(len(a), len(b))
    range_max = max(len(a), len(b))
    for i in range(range_min):
        if a[i] != b[i]:
            break
        count_shared += 1

    count_nested = range_max - count_shared
    # Negate shared so smaller is better.
    # Less path nesting also gets priority.
    return (-count_shared, count_nested)
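

# Illustrative example (not in the committed file): scores are tuples where
# smaller sorts first, so a candidate sharing more leading path components wins:
#   path_score_distance(("source", "blender", "makesdna", "DNA_ID.h"),
#                       ("source", "blender", "makesdna", "DNA_scene_types.h"))
#   -> (-3, 1)  # 3 shared components, 1 not shared.
#   path_score_distance(("source", "blender", "makesdna", "DNA_ID.h"),
#                       ("source", "blender", "blenlib", "BLI_utildefines.h"))
#   -> (-2, 2)  # fewer shared components, so this sorts after (-3, 1).

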
def eval_define(
        value_literal: str,
        *,
        default: str,
        filepath_ref_split: tuple[str, ...],
) -> tuple[str, list[str]]:
    failed: list[str] = []

    def re_replace_fn(match: re.Match[str]) -> str:
        value = match.group()
        if value.isdigit():
            return value

        other_values = global_defines.get(value)
        if other_values is None:
            failed.append(value)
            return value

        if len(other_values) == 1:
            other_filepath_split, other_literal = other_values[0]
        else:
            # Find the "closest" on the file system.
            # In practice favoring paths which are co-located works fairly well,
            # needed as it's not known which headers the ID's in a header *could* reference.
            other_literal_best = ""
            other_score_best = (0, 0)
            other_filepath_split_best: tuple[str, ...] = ("",)

            for other_filepath_split_test, other_literal_test in other_values:
                other_score_test = path_score_distance(filepath_ref_split, other_filepath_split_test)
                if (
                        # First time.
                        (not other_literal_best) or
                        # A lower score has been found (smaller is better).
                        (other_score_test < other_score_best)
                ):
                    other_literal_best = other_literal_test
                    other_score_best = other_score_test
                    other_filepath_split_best = other_filepath_split_test
                del other_score_test
            other_literal = other_literal_best
            other_filepath_split = other_filepath_split_best
            del other_literal_best, other_score_best, other_filepath_split_best

        other_literal_eval, other_failed = eval_define(
            other_literal,
            default="",
            filepath_ref_split=other_filepath_split,
        )
        if other_literal_eval:
            return other_literal_eval

        # `failed.append(value)` is also valid, report the nested failure
        # as it's more likely to give insights into what went wrong.
        failed.extend(other_failed)
        return value

    # Use integer division.
    value_literal = value_literal.replace(r"/", r"//")

    # Populates `failed`.
    value_literal_eval = REGEX_ID_OR_NUMBER_C_LIKE.sub(re_replace_fn, value_literal)

    if failed:
        # One or more ID could not be found.
        return default, failed

    # This could use exception handling, don't add it unless it's needed though.
    # pylint: disable-next=eval-used
    return str(eval(value_literal_eval)), failed
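

# Illustrative example (not in the committed file): with `#define MAX_NAME 64`
# already extracted into `global_defines`, evaluating the comment text
# "MAX_NAME - 2" substitutes the define and evaluates "64 - 2":
#   eval_define("MAX_NAME - 2", default="", filepath_ref_split=("source", "f.c"))
#   -> ("62", [])
# while an unknown ID is reported back: -> (default, ["SOME_UNKNOWN_ID"]).

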
def validate_sizes(filepath: str, data_src: str) -> None:
    # Nicer for printing.
    filepath_rel = os.path.relpath(filepath, BASE_DIR)
    filepath_rel_split = tuple(filepath_rel.split(os.sep))

    for m, line, (beg, end) in line_number_utils.finditer_with_line_numbers_and_bounds(
            REGEX_SIZE_COMMENT_IN_ARRAY,
            data_src,
    ):
        del end
        value_id = m.group(1)
        value_literal = m.group(2)

        value_eval, lookups_failed = eval_define(
            value_id,
            default="",
            filepath_ref_split=filepath_rel_split,
        )

        data_line_column = "{:s}:{:d}:{:d}:".format(
            filepath_rel,
            line + 1,
            # Place the cursor after the `[`.
            (m.start(0) + 1) - beg,
        )

        if len(value_id.strip()) != len(value_id):
            print("WARN:", data_line_column, "comment includes white-space")
            continue

        if lookups_failed:
            print("WARN:", data_line_column, "[{:s}]".format(", ".join(lookups_failed)), "unknown")
            continue

        if value_literal != value_eval:
            print("WARN:", data_line_column, value_id, "mismatch", "({:s} != {:s})".format(value_literal, value_eval))
            continue

        if SHOW_SUCCESS:
            print("OK: ", data_line_column, "{:s}={:s},".format(value_id, value_literal))

    # Returning None indicates the file is not edited.


def main() -> int:

    # Extract defines.
    run(
        directories=[os.path.join(BASE_DIR, d) for d in SOURCE_DIRS],
        is_text=lambda filepath: filepath.endswith(SOURCE_EXT),
        text_operation=extract_defines,
        # Can't be used if we want to accumulate in a global variable.
        use_multiprocess=False,
    )

    # For predictable lookups on tie breakers.
    # In practice it should almost never matter.
    for values in global_defines.values():
        if len(values) > 1:
            values.sort()

    # Validate sizes.
    run(
        directories=[os.path.join(BASE_DIR, d) for d in SOURCE_DIRS],
        is_text=lambda filepath: filepath.endswith(SOURCE_EXT),
        text_operation=validate_sizes,
        # Can't be used if we want to accumulate in a global variable.
        use_multiprocess=False,
    )

    return 0


if __name__ == "__main__":
    sys.exit(main())

88  tools/utils_maintenance/modules/line_number_utils.py  Normal file
@@ -0,0 +1,88 @@
# SPDX-FileCopyrightText: 2025 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later

"""
When writing text checking utilities, it's not always straightforward
to find line numbers and ranges from an offset within the text.

This module provides helpers to efficiently do this.

The main utility is ``finditer_with_line_numbers_and_bounds``,
an alternative to ``re.finditer`` which yields line numbers and offsets
for the line bounds - useful for scanning files and reporting errors that include the line contents.
"""

__all__ = (
    "finditer_newline_cache_compute",
    "finditer_with_line_numbers_and_bounds",
    "line_to_offset_range",
)

from collections.abc import (
    Iterator,
)

import re as _re


def finditer_newline_cache_compute(text: str) -> tuple[dict[int, int], list[int]]:
    """
    Return a tuple containing:

    - A map from the offset of each newline to the (0 based) number of the line that follows it.
    - A list mapping each (0 based) line number to the offset of the newline that precedes it
      (0 for the first line).
    """
    # Offset to line lookup.
    offset_to_line_cache: dict[int, int] = {}
    # Line to offset lookup.
    line_to_offset_cache: list[int] = [0]

    for i, m in enumerate(_re.finditer("\\n", text), 1):
        ofs = m.start()
        offset_to_line_cache[ofs] = i
        line_to_offset_cache.append(ofs)

    return offset_to_line_cache, line_to_offset_cache
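

# Illustrative example (not in the committed file): for the text "a\nb\n",
# newlines sit at offsets 1 and 3, so finditer_newline_cache_compute returns:
#   ({1: 1, 3: 2}, [0, 1, 3])

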
def finditer_with_line_numbers_and_bounds(
        pattern: str,
        text: str,
        *,
        offset_to_line_cache: dict[int, int] | None = None,
        flags: int = 0,
) -> Iterator[tuple[_re.Match[str], int, tuple[int, int]]]:
    """
    A version of ``re.finditer`` that returns ``(match, line_number, line_bounds)``.

    Note that ``offset_to_line_cache`` is the first return value from
    ``finditer_newline_cache_compute``.
    This should be passed in if the iterator is called multiple times
    on the same buffer, to avoid calculating this every time.
    """
    if offset_to_line_cache is None:
        offset_to_line_cache, line_to_offset_cache = finditer_newline_cache_compute(text)
        del line_to_offset_cache

    text_len = len(text)
    for m in _re.finditer(pattern, text, flags):

        if (beg := text.rfind("\n", 0, m.start())) == -1:
            beg = 0
            line_number = 0
        else:
            line_number = offset_to_line_cache[beg]

        if (end := text.find("\n", m.end(), text_len)) == -1:
            end = text_len

        yield m, line_number, (beg, end)
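

# Illustrative usage (not in the committed file): for text = "a\nabba\n" the
# pattern "b+" matches on the second line, so the iterator yields the match
# with line_number 1 (0 based) and bounds (1, 6); note ``beg`` is the offset
# of the newline before the line (or 0 on the first line):
#   for m, line_number, (beg, end) in finditer_with_line_numbers_and_bounds("b+", text):
#       print(line_number, repr(text[beg:end]))  # -> 1 '\nabba'

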
def line_to_offset_range(line: int, offset_limit: int, line_to_offset_cache: list[int]) -> tuple[int, int]:
    """
    Given a (0 based) line number, return the offset range of that line.
    """
    assert line >= 0
    beg = line_to_offset_cache[line]
    end = line_to_offset_cache[line + 1] if (line + 1 < len(line_to_offset_cache)) else offset_limit
    return beg, end
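

# Illustrative example (not in the committed file): with the cache [0, 1, 3]
# computed above for "a\nb\n" and its length 4 as the offset limit,
# the second line (index 1) spans offsets 1..3:
#   line_to_offset_range(1, 4, [0, 1, 3])  # -> (1, 3)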