test2/tools/check_source/check_spelling.py
Sergey Sharybin 03806d0b67 Re-design of submodules used in blender.git
This commit implements the changes described in #104573.

The goal is to fix the confusion caused by changing submodule hashes, which is
not ideal for any of the supported git-module configurations (they are either
always visible, causing confusion, or silently staged and committed, also
causing confusion).

This commit replaces the submodules with a checkout of addons and addons_contrib,
covered by the .gitignore, while the locale files and developer tools are moved
into the main repository.

This also changes the paths:
- /release/scripts is moved to /scripts
- /source/tools is moved to /tools
- /release/datafiles/locale is moved to /locale

This is done to avoid conflicts when using bisect, and also allows the buildbot
to automatically "recover" when building older or newer branches/patches.

Running `make update` will initialize the local checkout to match the changed
repository configuration.

Another aspect of the change is that `make update` now supports the GitHub style
of remote organization (the origin remote pointing to the fork, the upstream
remote pointing to the upstream blender/blender.git).
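
For example, a checkout following this convention might look like
(illustrative; the fork URL is a placeholder):

    git remote add origin <your-fork-url>
    git remote add upstream https://projects.blender.org/blender/blender.git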

Pull Request #104755
2023-02-21 16:39:58 +01:00


#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-or-later
"""
Script for checking source code spelling.
python3 tools/check_source/check_spelling.py some_soure_file.py
- Pass in a path for it to be checked recursively.
- Pass in '--strings' to check strings instead of comments.
Currently only python source is checked.
"""
import os
import argparse

from typing import (
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Set,
    Tuple,
)

# Report: word, line, column.
Report = Tuple[str, int, int]
# Cache: {filepath: length, hash, reports}.
CacheData = Dict[str, Tuple[int, bytes, List[Report]]]
# Map word to suggestions.
SuggestMap = Dict[str, str]

ONLY_ONCE = True
USE_COLOR = True

# Ignore: `/*identifier*/` as these are used in C++ for unused arguments or to denote struct members.
# These identifiers can be ignored in most cases.
USE_SKIP_SINGLE_IDENTIFIER_COMMENTS = True

_words_visited = set()
_files_visited = set()

# Lowercase word -> suggestion list.
_suggest_map: SuggestMap = {}

VERBOSE_CACHE = False

if USE_COLOR:
    COLOR_WORD = "\033[92m"
    COLOR_ENDC = "\033[0m"
else:
    COLOR_WORD = ""
    COLOR_ENDC = ""

from check_spelling_c_config import (
    dict_custom,
    dict_ignore,
    dict_ignore_hyphenated_prefix,
    dict_ignore_hyphenated_suffix,
    files_ignore,
)

BASEDIR = os.path.abspath(os.path.dirname(__file__))
ROOTDIR = os.path.normpath(os.path.join(BASEDIR, "..", "..", ".."))

# Ensure native slashes.
files_ignore = {
    os.path.normpath(os.path.join(ROOTDIR, f.replace("/", os.sep)))
    for f in files_ignore
}


# -----------------------------------------------------------------------------
# Dictionary Utilities

def dictionary_create():  # type: ignore
    import enchant  # type: ignore
    dict_spelling = enchant.Dict("en_US")

    # Don't add `dict_ignore` words to the dictionary, as they would then show up in suggestions.
    for w in dict_custom:
        # Don't use `add(w)` here, as that would modify the user's personal dictionary.
        dict_spelling.add_to_session(w)
    return dict_spelling


def dictionary_check(w: str, code_words: Set[str]) -> bool:
    w_lower = w.lower()
    if w_lower in dict_ignore:
        return True

    is_correct: bool = _dict.check(w)

    # Split by hyphenation and check.
    if not is_correct:
        if "-" in w:
            is_correct = True
            # Allow: `un-word`, `re-word`.
            w_split = w.strip("-").split("-")
            if len(w_split) > 1:
                if w_split and w_split[0].lower() in dict_ignore_hyphenated_prefix:
                    del w_split[0]
            # Allow: `word-ish`, `word-ness`.
            if len(w_split) > 1:
                if w_split and w_split[-1].lower() in dict_ignore_hyphenated_suffix:
                    del w_split[-1]

            for w_sub in w_split:
                if w_sub:
                    if w_sub in code_words:
                        continue
                    w_sub_lower = w_sub.lower()
                    if w_sub_lower in dict_ignore:
                        continue
                    if not _dict.check(w_sub):
                        is_correct = False
                        break
    return is_correct
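
# For example `re-initialization` can pass `dictionary_check` when `re` is in
# `dict_ignore_hyphenated_prefix` and `initialization` is a known word, without
# the hyphenated form itself ever being added to the dictionary.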


def dictionary_suggest(w: str) -> List[str]:
    return _dict.suggest(w)  # type: ignore


_dict = dictionary_create()  # type: ignore


# -----------------------------------------------------------------------------
# General Utilities

def hash_of_file_and_len(fp: str) -> Tuple[bytes, int]:
    import hashlib
    with open(fp, 'rb') as fh:
        data = fh.read()
        m = hashlib.sha512()
        m.update(data)
        return m.digest(), len(data)
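
# NOTE: the length is returned along with the hash so the cache can reject a
# stale entry with a cheap size comparison before comparing SHA-512 digests,
# see `spell_check_file_with_cache_support`.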

import re

re_vars = re.compile("[A-Za-z]+")

# First remove this from comments, so we don't spell check example code, doxygen commands, etc.
re_ignore = re.compile(
    r'('
    # URL.
    r'\b(https?|ftp)://\S+|'
    # Email address: <me@email.com>, <someone@foo.bar-baz.com>.
    r"<\w+@[\w\.\-]+>|"
    # Convention for TODO/FIXME messages: TODO(my name), FIXME(name+name), XXX(some-name) or NOTE(name/other-name):
    r"\b(TODO|FIXME|XXX|NOTE|WARNING)\(@?[\w\s\+\-/]+\)|"
    # Doxygen style: <pre> ... </pre>
    r"<pre>.+</pre>|"
    # Doxygen style: \code ... \endcode
    r"\s+\\code\b.+\s\\endcode\b|"
    # Doxygen style #SOME_CODE.
    r'#\S+|'
    # Doxygen commands: \param foo
    r"\\(section|subsection|subsubsection|defgroup|ingroup|addtogroup|param|tparam|page|a|see)\s+\S+|"
    # Doxygen commands without any arguments after them: \command
    r"\\(retval|todo|name)\b|"
    # Doxygen 'param' syntax used rarely: \param foo[in,out]
    r"\\param\[[a-z,]+\]\S*|"
    # Words containing underscores: a_b
    r'\S*\w+_\S+|'
    # Words containing arrows: a->b
    r'\S*\w+\->\S+|'
    # Words containing dot notation: a.b (NOT ab... since this is used in English).
    r'\w+\.\w+\S*|'
    # Single and back-tick quotes (often used to reference code).
    # Allow white-space or any bracket prefix, e.g:
    # (`expr a+b`)
    r"[\s\(\[\{]\`[^\n`]+\`|"
    r"[\s\(\[\{]'[^\n']+'"
    r')',
    re.MULTILINE | re.DOTALL,
)

# Then extract words.
re_words = re.compile(
    r"\b("
    # Capital words, with optional '-' and "'".
    r"[A-Z]+[\-'A-Z]*[A-Z]|"
    # Lowercase words, with optional '-' and "'".
    r"[A-Za-z][\-'a-z]*[a-z]+"
    r")\b"
)
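
# For example `re_words` matches `word`, `Word`, `don't` & `hyphen-ated`,
# while single letters and all-numeric tokens are never matched.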

re_not_newline = re.compile("[^\n]")

if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
    re_single_word_c_comments = re.compile(r"\/\*[\s]*[a-zA-Z_]+[a-zA-Z0-9_]*[\s]*\*\/")


def words_from_text(text: str, check_type: str) -> List[Tuple[str, int]]:
    """
    Extract words to treat as English for spell checking.
    """
    # Replace non-newlines with white-space, so all alignment is kept.
    def replace_ignore(match: re.Match[str]) -> str:
        start, end = match.span()
        return re_not_newline.sub(" ", match.string[start:end])

    # Handy for checking what we ignore, in case we ignore too much and miss real errors.
    # for match in re_ignore.finditer(text):
    #     print(match.group(0))

    # Strip out URL's, code-blocks, etc.
    text = re_ignore.sub(replace_ignore, text)

    words = []
    if check_type == 'SPELLING':
        for match in re_words.finditer(text):
            words.append((match.group(0), match.start()))

        def word_ok(w: str) -> bool:
            # Ignore all uppercase words.
            if w.isupper():
                return False
            return True
        words[:] = [w for w in words if word_ok(w[0])]
    elif check_type == 'DUPLICATES':
        w_prev = ""
        w_prev_start = 0
        for match in re_words.finditer(text):
            w = match.group(0)
            w_start = match.start()
            w_lower = w.lower()
            if w_lower == w_prev:
                text_ws = text[w_prev_start + len(w_prev): w_start]
                if text_ws == " ":
                    words.append((w_lower, w_start))
            w_prev = w_lower
            w_prev_start = w_start
    else:
        assert False

    return words
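
# For example, in 'DUPLICATES' mode the text `the the word` reports the second
# `the`; repeated words must be separated by exactly one space to be reported.
# In 'SPELLING' mode every word is yielded with its offset for checking.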


class Comment:
    __slots__ = (
        "file",
        "text",
        "line",
        "type",
    )

    def __init__(self, file: str, text: str, line: int, type: str):
        self.file = file
        self.text = text
        self.line = line
        self.type = type

    def parse(self, check_type: str) -> List[Tuple[str, int]]:
        return words_from_text(self.text, check_type=check_type)

    def line_and_column_from_comment_offset(self, pos: int) -> Tuple[int, int]:
        text = self.text
        slineno = self.line + text.count("\n", 0, pos)
        # Allow for -1 to be not found.
        scol = text.rfind("\n", 0, pos) + 1
        if scol == 0:
            # Not found.
            scol = pos
        else:
            scol = pos - scol
        return slineno, scol


def extract_code_strings(filepath: str) -> Tuple[List[Comment], Set[str]]:
    import pygments
    from pygments import lexers
    from pygments.token import Token

    comments = []
    code_words = set()

    # lex = lexers.find_lexer_class_for_filename(filepath)
    # if lex is None:
    #     return comments, code_words
    if filepath.endswith(".py"):
        lex = lexers.get_lexer_by_name("python")
    else:
        lex = lexers.get_lexer_by_name("c")

    with open(filepath, encoding='utf-8') as fh:
        source = fh.read()

    slineno = 0
    for ty, ttext in lex.get_tokens(source):
        if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
            comments.append(Comment(filepath, ttext, slineno, 'STRING'))
        else:
            for match in re_vars.finditer(ttext):
                code_words.add(match.group(0))
        # Ugh - not nice or fast.
        slineno += ttext.count("\n")

    return comments, code_words


def extract_py_comments(filepath: str) -> Tuple[List[Comment], Set[str]]:
    import token
    import tokenize

    comments = []
    code_words = set()

    prev_toktype = token.INDENT

    with open(filepath, encoding='utf-8') as source:
        tokgen = tokenize.generate_tokens(source.readline)
        for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
            if toktype == token.STRING:
                if prev_toktype == token.INDENT:
                    comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
            elif toktype == tokenize.COMMENT:
                # Non-standard hint for commented CODE that we can ignore.
                if not ttext.startswith("#~"):
                    comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
            else:
                for match in re_vars.finditer(ttext):
                    code_words.add(match.group(0))
            prev_toktype = toktype

    return comments, code_words
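
# NOTE: `prev_toktype` starts out as INDENT, so the module level doc-string
# (typically the first STRING token) is collected, as are doc-strings directly
# following the INDENT of a function or class body.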


def extract_c_comments(filepath: str) -> Tuple[List[Comment], Set[str]]:
    """
    Extracts comments like this:

        /*
         * This is a multi-line comment, notice the '*'s are aligned.
         */
    """
    with open(filepath, encoding='utf-8') as fh:
        text = fh.read()

    BEGIN = "/*"
    END = "*/"
    TABSIZE = 4
    SINGLE_LINE = False

    # Reverse these flags to find blocks we won't parse.
    PRINT_NON_ALIGNED = False
    PRINT_SPELLING = True

    comment_ranges = []

    if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
        comment_ignore_offsets = set()
        for match in re_single_word_c_comments.finditer(text):
            comment_ignore_offsets.add(match.start(0))

    i = 0
    while i != -1:
        i = text.find(BEGIN, i)
        if i != -1:
            i_next = text.find(END, i)
            if i_next != -1:
                do_comment_add = True
                if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
                    if i in comment_ignore_offsets:
                        do_comment_add = False

                # Not essential but seek back to find the beginning of the line.
                while i > 0 and text[i - 1] in {"\t", " "}:
                    i -= 1
                i_next += len(END)
                if do_comment_add:
                    comment_ranges.append((i, i_next))
            i = i_next
        else:
            pass

    if PRINT_NON_ALIGNED:
        for i, i_next in comment_ranges:
            # Seek `i` back to the line start.
            i_bol = text.rfind("\n", 0, i) + 1
            l_ofs_first = i - i_bol
            star_offsets = set()
            block = text[i_bol:i_next]
            for line_index, l in enumerate(block.split("\n")):
                star_offsets.add(l.find("*", l_ofs_first))
                l_ofs_first = 0
                if len(star_offsets) > 1:
                    print("%s:%d" % (filepath, line_index + text.count("\n", 0, i)))
                    break

    if not PRINT_SPELLING:
        return [], set()

    # Collect variables from code, so we can reference variables from code blocks
    # without this generating noise from the spell checker.
    code_ranges = []
    if not comment_ranges:
        code_ranges.append((0, len(text)))
    else:
        for index in range(len(comment_ranges) + 1):
            if index == 0:
                i_prev = 0
            else:
                i_prev = comment_ranges[index - 1][1]

            if index == len(comment_ranges):
                i_next = len(text)
            else:
                i_next = comment_ranges[index][0]
            code_ranges.append((i_prev, i_next))

    code_words = set()
    for i, i_next in code_ranges:
        for match in re_vars.finditer(text[i:i_next]):
            w = match.group(0)
            code_words.add(w)
            # Allow plurals of these variables too.
            code_words.add(w + "'s")

    comments = []
    slineno = 0
    i_prev = 0
    for i, i_next in comment_ranges:
        block = text[i:i_next]

        # Add white-space in front of the block (for the alignment test),
        # allow for -1 being not found, which results in zero.
        j = text.rfind("\n", 0, i) + 1
        block = (" " * (i - j)) + block

        slineno += text.count("\n", i_prev, i)

        comments.append(Comment(filepath, block, slineno, 'COMMENT'))
        i_prev = i

    return comments, code_words


def spell_check_report(filepath: str, check_type: str, report: Report) -> None:
    w, slineno, scol = report
    if check_type == 'SPELLING':
        w_lower = w.lower()

        if ONLY_ONCE:
            if w_lower in _words_visited:
                return
            else:
                _words_visited.add(w_lower)

        suggest = _suggest_map.get(w_lower)
        if suggest is None:
            _suggest_map[w_lower] = suggest = " ".join(dictionary_suggest(w))

        print("%s:%d:%d: %s%s%s, suggest (%s)" % (
            filepath,
            slineno + 1,
            scol + 1,
            COLOR_WORD,
            w,
            COLOR_ENDC,
            suggest,
        ))
    elif check_type == 'DUPLICATES':
        print("%s:%d:%d: %s%s%s, duplicate" % (
            filepath,
            slineno + 1,
            scol + 1,
            COLOR_WORD,
            w,
            COLOR_ENDC,
        ))
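
# NOTE: with `ONLY_ONCE` enabled each misspelled word is reported a single time
# per run, even when it occurs in multiple files, since `_words_visited` is
# shared globally.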


# -----------------------------------------------------------------------------
# Extract Bad Spelling from a Source File

def spell_check_file(
        filepath: str,
        check_type: str,
        extract_type: str = 'COMMENTS',
) -> Generator[Report, None, None]:
    if extract_type == 'COMMENTS':
        if filepath.endswith(".py"):
            comment_list, code_words = extract_py_comments(filepath)
        else:
            comment_list, code_words = extract_c_comments(filepath)
    elif extract_type == 'STRINGS':
        comment_list, code_words = extract_code_strings(filepath)

    if check_type == 'SPELLING':
        for comment in comment_list:
            words = comment.parse(check_type='SPELLING')
            for w, pos in words:
                w_lower = w.lower()
                if w_lower in dict_ignore:
                    continue

                is_good_spelling = dictionary_check(w, code_words)
                if not is_good_spelling:
                    # Ignore literals that show up in code,
                    # gets rid of a lot of noise from comments that reference variables.
                    if w in code_words:
                        # print("Skipping", w)
                        continue

                    slineno, scol = comment.line_and_column_from_comment_offset(pos)
                    yield (w, slineno, scol)
    elif check_type == 'DUPLICATES':
        for comment in comment_list:
            words = comment.parse(check_type='DUPLICATES')
            for w, pos in words:
                slineno, scol = comment.line_and_column_from_comment_offset(pos)
                # print(filepath + ":" + str(slineno + 1) + ":" + str(scol), w, "(duplicates)")
                yield (w, slineno, scol)
    else:
        assert False


def spell_check_file_recursive(
        dirpath: str,
        check_type: str,
        extract_type: str = 'COMMENTS',
        cache_data: Optional[CacheData] = None,
) -> None:
    import os
    from os.path import join, splitext

    def source_list(
            path: str,
            filename_check: Optional[Callable[[str], bool]] = None,
    ) -> Generator[str, None, None]:
        for dirpath, dirnames, filenames in os.walk(path):
            # Only needed so this can be matched against the ignore paths.
            dirpath = os.path.abspath(dirpath)
            # Skip '.git' and other dot directories.
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
            for filename in filenames:
                if filename.startswith("."):
                    continue
                filepath = join(dirpath, filename)
                if not (filename_check is None or filename_check(filepath)):
                    continue
                if filepath in files_ignore:
                    continue
                yield filepath

    def is_source(filename: str) -> bool:
        ext = splitext(filename)[1]
        return (ext in {
            ".c",
            ".cc",
            ".inl",
            ".cpp",
            ".cxx",
            ".hpp",
            ".hxx",
            ".h",
            ".hh",
            ".m",
            ".mm",
            ".osl",
            ".py",
        })

    for filepath in source_list(dirpath, is_source):
        for report in spell_check_file_with_cache_support(
                filepath, check_type, extract_type=extract_type, cache_data=cache_data,
        ):
            spell_check_report(filepath, check_type, report)


# -----------------------------------------------------------------------------
# Cache File Support
#
# Cache is formatted as follows:
# (
#     # Store all misspelled words.
#     {filepath: (size, sha512, [report, ...])},
#
#     # Store suggestions, as these are slow to re-calculate.
#     {lowercase_word: suggestions},
# )
#
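# For example, a populated cache might look like (illustrative values,
# the suggestion string is whatever `dictionary_suggest` returned, joined by spaces):
#
# (
#     {"/abs/path/foo.c": (1024, b'<sha512 digest>', [("teh", 10, 4)])},
#     {"teh": "the eh tech"},
# )
#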
def spell_cache_read(cache_filepath: str) -> Tuple[CacheData, SuggestMap]:
    import pickle
    cache_store: Tuple[CacheData, SuggestMap] = {}, {}
    if os.path.exists(cache_filepath):
        with open(cache_filepath, 'rb') as fh:
            cache_store = pickle.load(fh)
    return cache_store


def spell_cache_write(cache_filepath: str, cache_store: Tuple[CacheData, SuggestMap]) -> None:
    import pickle
    with open(cache_filepath, 'wb') as fh:
        pickle.dump(cache_store, fh)
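
# NOTE: the cache is read with `pickle`, which can execute arbitrary code when
# loading, so only pass trusted paths to `--cache-file`.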


def spell_check_file_with_cache_support(
        filepath: str,
        check_type: str,
        extract_type: str = 'COMMENTS',
        cache_data: Optional[CacheData] = None,
) -> Generator[Report, None, None]:
    """
    Iterator where each item is a report: (word, line_number, column_number).
    """
    _files_visited.add(filepath)

    if cache_data is None:
        yield from spell_check_file(filepath, check_type, extract_type=extract_type)
        return

    cache_data_for_file = cache_data.get(filepath)
    if cache_data_for_file and len(cache_data_for_file) != 3:
        cache_data_for_file = None

    cache_hash_test, cache_len_test = hash_of_file_and_len(filepath)
    if cache_data_for_file is not None:
        cache_len, cache_hash, cache_reports = cache_data_for_file
        if cache_len_test == cache_len:
            if cache_hash_test == cache_hash:
                if VERBOSE_CACHE:
                    print("Using cache for:", filepath)
                yield from cache_reports
                return

    cache_reports = []
    for report in spell_check_file(filepath, check_type, extract_type=extract_type):
        cache_reports.append(report)
    cache_data[filepath] = (cache_len_test, cache_hash_test, cache_reports)

    yield from cache_reports


# -----------------------------------------------------------------------------
# Main & Argument Parsing

def argparse_create() -> argparse.ArgumentParser:
    # When `--help` or no arguments are given, print this help.
    description = __doc__
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '--extract',
        dest='extract',
        choices=('COMMENTS', 'STRINGS'),
        default='COMMENTS',
        required=False,
        metavar='WHAT',
        help=(
            'Text to extract for checking.\n'
            '\n'
            '- ``COMMENTS`` extracts comments from source code.\n'
            '- ``STRINGS`` extracts text.'
        ),
    )
    parser.add_argument(
        '--check',
        dest='check_type',
        choices=('SPELLING', 'DUPLICATES'),
        default='SPELLING',
        required=False,
        metavar='CHECK_TYPE',
        help=(
            'The check to perform.\n'
            '\n'
            '- ``SPELLING`` checks the spelling of extracted words.\n'
            '- ``DUPLICATES`` checks for directly repeated words.'
        ),
    )
    parser.add_argument(
        "--cache-file",
        dest="cache_file",
        help=(
            "Optional cache, for fast re-execution, "
            "avoiding re-extracting spelling when files have not been modified."
        ),
        required=False,
    )
    parser.add_argument(
        "paths",
        nargs='+',
        help="Files or directories to walk recursively.",
    )
    return parser


def main() -> None:
    global _suggest_map

    import os

    args = argparse_create().parse_args()
    extract_type = args.extract
    cache_filepath = args.cache_file
    check_type = args.check_type
    cache_data: Optional[CacheData] = None
    if cache_filepath:
        cache_data, _suggest_map = spell_cache_read(cache_filepath)
    clear_stale_cache = True

    # print(extract_type)
    try:
        for filepath in args.paths:
            if os.path.isdir(filepath):
                # Recursive search.
                spell_check_file_recursive(filepath, check_type, extract_type=extract_type, cache_data=cache_data)
            else:
                # Single file.
                for report in spell_check_file_with_cache_support(
                        filepath, check_type, extract_type=extract_type, cache_data=cache_data,
                ):
                    spell_check_report(filepath, check_type, report)
    except KeyboardInterrupt:
        clear_stale_cache = False

    if cache_filepath:
        assert cache_data is not None
        if VERBOSE_CACHE:
            print("Writing cache:", len(cache_data))
        if clear_stale_cache:
            # Don't keep suggestions for old misspellings.
            _suggest_map = {w_lower: _suggest_map[w_lower] for w_lower in _words_visited}
            for filepath in list(cache_data.keys()):
                if filepath not in _files_visited:
                    del cache_data[filepath]
        spell_cache_write(cache_filepath, (cache_data, _suggest_map))


if __name__ == "__main__":
    main()