diff --git a/tools/check_source/check_spelling.py b/tools/check_source/check_spelling.py
index 5fc9b3c6872..565e02d7c71 100755
--- a/tools/check_source/check_spelling.py
+++ b/tools/check_source/check_spelling.py
@@ -8,8 +8,8 @@ Script for checking source code spelling.
 
    python3 tools/check_source/check_spelling.py some_source_file.py
 
-- Pass in a path for it to be checked recursively.
-- Pass in '--strings' to check strings instead of comments.
+- Pass in a directory for it to be checked recursively.
+- Pass in '--extract=STRINGS' to check strings instead of comments.
 
 Currently only python source is checked.
 """
@@ -17,10 +17,13 @@ __all__ = (
     "main",
 )
 
-import os
 import argparse
+import os
+import re
 import sys
 
+from enum import Enum
+
 from collections.abc import (
     Callable,
     Iterator,
@@ -86,6 +89,22 @@ SOURCE_EXT = (
     "cmake",
 )
 
+
+class TokenType(Enum):
+    COMMENT = 0
+    STRING = 1
+    DOCSTRING = 2
+
+
+class LangType(Enum):
+    C = 0
+    CMAKE = 1
+    PYTHON = 2
+
+
+LangTokenType = tuple[LangType, TokenType]
+
+
 BASEDIR = os.path.abspath(os.path.dirname(__file__))
 ROOTDIR = os.path.normpath(os.path.join(BASEDIR, "..", ".."))
 ROOTDIR_WITH_SLASH = ROOTDIR + os.sep
@@ -169,51 +188,124 @@ def hash_of_file_and_len(fp: str) -> tuple[bytes, int]:
     return m.digest(), len(data)
 
 
-import re
 re_vars = re.compile("[A-Za-z]+")
 
 
-# First remove this from comments, so we don't spell check example code, DOXYGEN commands, etc.
-re_ignore = re.compile(
-    r'('
+def re_compile_from_sequence(ls: tuple[str, ...]) -> re.Pattern[str]:
+    return re.compile(
+        "({:s})".format("|".join(ls)), re.MULTILINE | re.DOTALL,
+    )
+
+
+# First remove this from comments, so we don't spell check example code, DOXYGEN commands, etc.
+re_ignore_elems_generic_url_email_tags: tuple[str, ...] = (
     # URL.
-    r'\b(https?|ftp)://\S+|'
+    r'\b(https?|ftp)://\S+',
 
     # Email address:
     # <me@email.com>
-    r"<\w+@[\w\.\-]+>|"
+    r"<\w+@[\w\.\-]+>",
 
     # Convention for TODO/FIXME messages: TODO(my name) OR FIXME(name+name) OR XXX(some-name) OR NOTE(name/other-name):
-    r"\b(TODO|FIXME|XXX|NOTE|WARNING|WORKAROUND)\(@?[\w\s\+\-/]+\)|"
-
-    # DOXYGEN style: <pre> ... </pre>
-    r"<pre>.+</pre>|"
-    # DOXYGEN style: \code ... \endcode
-    r"\s+\\code\b.+\s\\endcode\b|"
-    # DOXYGEN style #SOME_CODE.
-    r'#\S+|'
-    # DOXYGEN commands: \param foo
-    r"\\(section|subsection|subsubsection|defgroup|ingroup|addtogroup|param|tparam|page|a|see)\s+\S+|"
-    # DOXYGEN commands without any arguments after them: \command
-    r"\\(retval|todo|name)\b|"
-    # DOXYGEN 'param' syntax used rarely: \param foo[in,out]
-    r"\\param\[[a-z,]+\]\S*|"
+    r"\b(TODO|FIXME|XXX|NOTE|WARNING|WORKAROUND)\(@?[\w\s\+\-/]+\)",
+)
+re_ignore_elems_generic_expressions: tuple[str, ...] = (
     # Words containing underscores: a_b
-    r'\S*\w+_\S+|'
+    r'\S*\w+_\S+',
     # Words containing arrows: a->b
-    r'\S*\w+\->\S+'
+    r'\S*\w+\->\S+',
    # Words containing dot notation: a.b (NOT ab... since this is used in English).
-    r'\w+\.\w+\S*|'
+    r'\w+\.\w+\S*',
+)
+re_ignore_elems_generic_single_backtick: tuple[str, ...] = (
     # Single and back-tick quotes (often used to reference code).
     # Allow white-space or any bracket prefix, e.g:
     # (`expr a+b`)
-    r"[\s\(\[\{]\`[^\n`]+\`|"
-    r"[\s\(\[\{]'[^\n']+'"
-
-    r')',
-    re.MULTILINE | re.DOTALL,
+    r"[\s\(\[\{]\`[^\n`]+\`",
 )
+
+re_ignore_elems_generic_double_backtick: tuple[str, ...] = (
+    # Double back-ticks are used for doc-strings for literals:
+    # (``expr a+b``)
+    r"[\s\(\[\{]\`\`[^\n`]+\`\`",
+)
+
+
+re_ignore_elems_generic_single_quote: tuple[str, ...] = (
+    # Single quotes.
+    # Allow white-space or any bracket prefix, e.g:
+    # ('reference')
+    r"[\s\(\[\{]'[^\n']+'",
+)
+
+re_ignore_elems_lang_c_doxygen: tuple[str, ...] = (
+    # DOXYGEN style: `<pre> ... </pre>`
+    r"<pre>.+</pre>",
+    # DOXYGEN style: `\code ... \endcode`
+    r"\s+\\code\b.+\s\\endcode\b",
+    # DOXYGEN style `#SOME_CODE`.
+    r'#\S+',
+    # DOXYGEN commands: `\param foo`
+    r"\\(section|subsection|subsubsection|defgroup|ingroup|addtogroup|param|tparam|page|a|see)\s+\S+",
+    # DOXYGEN commands without any arguments after them: \command
+    r"\\(retval|todo|name)\b",
+    # DOXYGEN 'param' syntax used rarely: `\param foo[in,out]`
+    r"\\param\[[a-z,]+\]\S*",
+
+)
+
+re_ignore_map: dict[tuple[LangType, TokenType], re.Pattern[str]] = {
+    (LangType.C, TokenType.COMMENT): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_lang_c_doxygen,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_single_backtick,
+        *re_ignore_elems_generic_single_quote,
+    )),
+    (LangType.C, TokenType.STRING): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_single_backtick,
+        *re_ignore_elems_generic_single_quote,
+    )),
+
+    (LangType.PYTHON, TokenType.COMMENT): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_single_backtick,
+    )),
+    (LangType.PYTHON, TokenType.STRING): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_single_backtick,
+    )),
+    # Only Python uses the doc-string type.
+    (LangType.PYTHON, TokenType.DOCSTRING): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_double_backtick,
+    )),
+
+    (LangType.CMAKE, TokenType.COMMENT): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_single_backtick,
+    )),
+    (LangType.CMAKE, TokenType.STRING): re_compile_from_sequence((
+        *re_ignore_elems_generic_url_email_tags,
+        *re_ignore_elems_generic_expressions,
+        *re_ignore_elems_generic_single_backtick,
+    )),
+}
+
+del re_ignore_elems_generic_url_email_tags
+del re_ignore_elems_generic_expressions
+del re_ignore_elems_generic_single_quote
+del re_ignore_elems_generic_double_backtick
+del re_ignore_elems_lang_c_doxygen
+
+
 
 # Then extract words.
 re_words = re.compile(
     r"\b("
@@ -230,7 +322,12 @@ if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
     re_single_word_c_comments = re.compile(r"\/\*[\s]*[a-zA-Z_]+[a-zA-Z0-9_]*[\s]*\*\/")
 
 
-def words_from_text(text: str, check_type: str) -> list[tuple[str, int]]:
+def words_from_text(
+        text: str,
+        lang: LangType,
+        type: TokenType,
+        check_type: str,
+) -> list[tuple[str, int]]:
     """ Extract words to treat as English for spell checking. """
     # Replace non-newlines with white-space, so all alignment is kept.
 
@@ -243,6 +340,8 @@ def words_from_text(
     #     print(match.group(0))
 
     # Strip out URL's, code-blocks, etc.
+    re_ignore = re_ignore_map[(lang, type)]
+
     text = re_ignore.sub(replace_ignore, text)
 
     words = []
@@ -282,17 +381,19 @@ class Comment:
         "file",
         "text",
         "line",
+        "lang",
         "type",
     )
 
-    def __init__(self, file: str, text: str, line: int, type: str):
+    def __init__(self, file: str, text: str, line: int, lang: LangType, type: TokenType):
         self.file = file
         self.text = text
         self.line = line
+        self.lang = lang
         self.type = type
 
     def parse(self, check_type: str) -> list[tuple[str, int]]:
-        return words_from_text(self.text, check_type=check_type)
+        return words_from_text(self.text, self.lang, self.type, check_type=check_type)
 
     def line_and_column_from_comment_offset(self, pos: int) -> tuple[int, int]:
         text = self.text
@@ -319,18 +420,25 @@ def extract_code_strings(filepath: str) -> tuple[list[Comment], set[str]]:
     #     return comments, code_words
     if filepath.endswith(".py"):
         lex = lexers.get_lexer_by_name("python")
+        lang_type = LangType.PYTHON
     elif filepath.endswith((".cmake", ".txt")):
         lex = lexers.get_lexer_by_name("cmake")
+        lang_type = LangType.CMAKE
     else:
         lex = lexers.get_lexer_by_name("c")
+        lang_type = LangType.C
 
     slineno = 0
     with open(filepath, encoding='utf-8') as fh:
         source = fh.read()
 
     for ty, ttext in lex.get_tokens(source):
-        if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
-            comments.append(Comment(filepath, ttext, slineno, 'STRING'))
+        if ty in {
+                Token.Literal.String,
+                Token.Literal.String.Double,
+                Token.Literal.String.Single,
+        }:
+            comments.append(Comment(filepath, ttext, slineno, lang_type, TokenType.STRING))
         else:
             for match in re_vars.finditer(ttext):
                 code_words.add(match.group(0))
@@ -356,11 +464,11 @@ def extract_py_comments(filepath: str) -> tuple[list[Comment], set[str]]:
     for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
         if toktype == token.STRING:
             if prev_toktype == token.INDENT:
-                comments.append(Comment(filepath, ttext, slineno - 1, 'DOCSTRING'))
+                comments.append(Comment(filepath, ttext, slineno - 1, LangType.PYTHON, TokenType.DOCSTRING))
        elif toktype == tokenize.COMMENT:
            # non standard hint for commented CODE that we can ignore
            if not ttext.startswith("#~"):
-                comments.append(Comment(filepath, ttext, slineno - 1, 'COMMENT'))
+                comments.append(Comment(filepath, ttext, slineno - 1, LangType.PYTHON, TokenType.COMMENT))
         else:
             for match in re_vars.finditer(ttext):
                 code_words.add(match.group(0))
@@ -386,9 +494,9 @@ def extract_cmake_comments(filepath: str) -> tuple[list[Comment], set[str]]:
         if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
             # Disable because most CMake strings are references to paths/code."
             if False:
-                comments.append(Comment(filepath, ttext, slineno, 'STRING'))
+                comments.append(Comment(filepath, ttext, slineno, LangType.CMAKE, TokenType.STRING))
         elif ty in {Token.Comment, Token.Comment.Single}:
-            comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
+            comments.append(Comment(filepath, ttext, slineno, LangType.CMAKE, TokenType.COMMENT))
         else:
             for match in re_vars.finditer(ttext):
                 code_words.add(match.group(0))
@@ -503,7 +611,7 @@ def extract_c_comments(filepath: str) -> tuple[list[Comment], set[str]]:
             block = (" " * (i - j)) + block
 
         slineno += text.count("\n", i_prev, i)
-        comments.append(Comment(filepath, block, slineno, 'COMMENT'))
+        comments.append(Comment(filepath, block, slineno, LangType.C, TokenType.COMMENT))
         i_prev = i
 
     return comments, code_words
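
Note: the heart of this change is the table of pre-compiled ignore patterns keyed by
(language, token-type). Below is a standalone, trimmed-down sketch of that technique,
not part of the patch itself: the two pattern fragments, the sample text, and the
whitespace-preserving replace_ignore() callback are illustrative assumptions (the
patch's own callback is not visible in these hunks, though the comment "Replace
non-newlines with white-space, so all alignment is kept" implies the same behavior).

    import re
    from enum import Enum


    class LangType(Enum):
        C = 0
        PYTHON = 1


    class TokenType(Enum):
        COMMENT = 0
        STRING = 1


    def re_compile_from_sequence(ls: tuple[str, ...]) -> re.Pattern[str]:
        # One alternation per (language, token-type) pair, compiled once at import.
        return re.compile("({:s})".format("|".join(ls)), re.MULTILINE | re.DOTALL)


    # Shared fragments which languages mix and match (hypothetical subset).
    url = (r'\b(https?|ftp)://\S+',)
    doxygen = (r"\\(retval|todo|name)\b",)

    re_ignore_map = {
        (LangType.C, TokenType.COMMENT): re_compile_from_sequence(url + doxygen),
        (LangType.PYTHON, TokenType.COMMENT): re_compile_from_sequence(url),
    }


    def replace_ignore(match: re.Match[str]) -> str:
        # Blank the ignored span with spaces so that word offsets still map
        # back to the original line and column.
        start, end = match.span()
        return " " * (end - start)


    text = "See https://example.org and \\todo, but check speling."
    pattern = re_ignore_map[(LangType.C, TokenType.COMMENT)]
    print(pattern.sub(replace_ignore, text))
    # The URL and "\todo" come out as runs of spaces; "speling" survives
    # for the spell checker, at its original column.

Compiling one alternation per pair keeps the per-token cost to a single re.sub() call,
instead of re-joining the fragment tuples for every comment.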
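
A detail worth noting in TokenType: members of a Python Enum that share a value become
aliases of the same member, so giving DOCSTRING the same value as STRING would silently
collapse two re_ignore_map keys into one (the later dict entry would overwrite the
earlier). A short demonstration of that standard Enum behavior:

    from enum import Enum


    class Aliased(Enum):
        STRING = 1
        DOCSTRING = 1  # An alias of STRING, not a distinct member.


    assert Aliased.DOCSTRING is Aliased.STRING
    # Both keys hash identically, so the dict keeps a single entry ("b" wins).
    assert {Aliased.STRING: "a", Aliased.DOCSTRING: "b"} == {Aliased.STRING: "b"}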
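
The string-extraction path leans on pygments' token classification. A rough usage
sketch, assuming pygments is installed; the sample source is made up, and exactly how
a literal is split into tokens varies by lexer:

    from pygments import lexers
    from pygments.token import Token

    lex = lexers.get_lexer_by_name("c")
    source = 'static const char *msg = "helo wrld"; /* see docs */'
    for ty, ttext in lex.get_tokens(source):
        # In the patch, tokens in this set become Comment objects tagged
        # TokenType.STRING; here we simply print them.
        if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
            print("string token:", repr(ttext))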