[PATCH v2 11/28] docs: kdoc: create a CMatch to match nested C blocks

From: Mauro Carvalho Chehab

Date: Thu Mar 12 2026 - 11:01:15 EST


The NestedMatch code is complex, and will become even more complex
if we add support for arguments there.

Now that we have a tokenizer, we can use a better solution that is
easier to understand.

Yet, to improve performance, it is better to make it use previously
tokenized code, which changes its API.

So, reimplement the NestedMatch logic on top of the CTokenizer class.
Once that is done, we can drop NestedMatch.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@xxxxxxxxxx>
---
tools/lib/python/kdoc/c_lex.py | 222 +++++++++++++++++++++++++++---
tools/unittests/test_tokenizer.py | 3 +-
2 files changed, 203 insertions(+), 22 deletions(-)

diff --git a/tools/lib/python/kdoc/c_lex.py b/tools/lib/python/kdoc/c_lex.py
index 38f70e836eb8..e986a4ad73e3 100644
--- a/tools/lib/python/kdoc/c_lex.py
+++ b/tools/lib/python/kdoc/c_lex.py
@@ -58,14 +58,13 @@ class CToken():

return CToken.MISMATCH

+
def __init__(self, kind, value=None, pos=0,
brace_level=0, paren_level=0, bracket_level=0):
self.kind = kind
self.value = value
self.pos = pos
- self.brace_level = brace_level
- self.paren_level = paren_level
- self.bracket_level = bracket_level
+ self.level = (bracket_level, paren_level, brace_level)

def __repr__(self):
name = self.to_name(self.kind)
@@ -74,8 +73,7 @@ class CToken():
else:
value = self.value

- return f"CToken({name}, {value}, {self.pos}, " \
- f"{self.brace_level}, {self.paren_level}, {self.bracket_level})"
+ return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

#: Tokens to parse C code.
TOKEN_LIST = [
@@ -105,20 +103,30 @@ TOKEN_LIST = [
(CToken.ENUM, r"\benum\b"),
(CToken.TYPEDEF, r"\bkinddef\b"),

- (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),
+ (CToken.NAME, r"[A-Za-z_][A-Za-z0-9_]*"),

(CToken.SPACE, r"[\s]+"),

(CToken.MISMATCH,r"."),
]

+def fill_re_scanner(token_list):
+ """Ancillary routine to convert TOKEN_LIST into a finditer regex"""
+ re_tokens = []
+
+ for kind, pattern in token_list:
+ name = CToken.to_name(kind)
+ re_tokens.append(f"(?P<{name}>{pattern})")
+
+ return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)
+
#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: tokenizer regex. Will be filled at the first CTokenizer usage.
-re_scanner = None
+RE_SCANNER = fill_re_scanner(TOKEN_LIST)

class CTokenizer():
"""
@@ -149,7 +157,7 @@ class CTokenizer():
paren_level = 0
bracket_level = 0

- for match in re_scanner.finditer(source):
+ for match in RE_SCANNER.finditer(source):
kind = CToken.from_name(match.lastgroup)
pos = match.start()
value = match.group()
@@ -175,7 +183,7 @@ class CTokenizer():
yield CToken(kind, value, pos,
brace_level, paren_level, bracket_level)

- def __init__(self, source):
+ def __init__(self, source=None):
"""
Create a regular expression to handle TOKEN_LIST.

@@ -183,20 +191,18 @@ class CTokenizer():
(?P<name>...)

in this particular case, it makes sense, as we can pick the name
- when matching a code via re_scanner().
+ when matching a code via RE_SCANNER.
"""
- global re_scanner
-
- if not re_scanner:
- re_tokens = []
-
- for kind, pattern in TOKEN_LIST:
- name = CToken.to_name(kind)
- re_tokens.append(f"(?P<{name}>{pattern})")
-
- re_scanner = KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

self.tokens = []
+
+ if not source:
+ return
+
+ if isinstance(source, list):
+ self.tokens = source
+ return
+
for tok in self._tokenize(source):
self.tokens.append(tok)

@@ -237,3 +243,179 @@ class CTokenizer():
out += str(tok.value)

return out
+
+
+class CMatch:
+ """
+ Finding nested delimiters is hard with regular expressions. It is
+ even harder on Python with its normal re module, as there are several
+ advanced regular expressions that are missing.
+
+ This is the case of this pattern::
+
+ '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
+
+ which is used to properly match the open/close parentheses when
+ searching for STRUCT_GROUP().
+
+ Add a class that counts pairs of delimiters, using it to match and
+ replace nested expressions.
+
+ The original approach was suggested by:
+
+ https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
+
+ Although I re-implemented it to make it more generic and match 3 types
+ of delimiters. The logic checks if delimiters are paired. If not, it
+ will ignore the search string.
+ """
+
+ # TODO: make CMatch handle multiple match groups
+ #
+ # Right now, regular expressions to match it are defined only up to
+ # the start delimiter, e.g.:
+ #
+ # \bSTRUCT_GROUP\(
+ #
+ # is similar to: STRUCT_GROUP\((.*)\)
+ # except that the content inside the match group is delimiter-aligned.
+ #
+ # The content inside parentheses is converted into a single replace
+ # group (e.g. r'\0').
+ #
+ # It would be nice to change such definition to support multiple
+ # match groups, allowing a regex equivalent to:
+ #
+ # FOO\((.*), (.*), (.*)\)
+ #
+ # it is probably easier to define it not as a regular expression, but
+ # with some lexical definition like:
+ #
+ # FOO(arg1, arg2, arg3)
+
+ def __init__(self, regex):
+ self.regex = KernRe(regex)
+
+ def _search(self, tokenizer):
+ """
+ Finds paired blocks for a regex that ends with a delimiter.
+
+ The suggestion of using finditer to match pairs came from:
+ https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
+ but I ended up using a different implementation to align all three types
+ of delimiters and seek for an initial regular expression.
+
+ The algorithm seeks for open/close paired delimiters and places them
+ into a stack, yielding a start/stop position of each match when the
+ stack is zeroed.
+
+ The algorithm should work fine for properly paired lines, but will
+ silently ignore end delimiters that precede a start delimiter.
+ This should be OK for kernel-doc parser, as unaligned delimiters
+ would cause compilation errors. So, we don't need to raise exceptions
+ to cover such issues.
+ """
+
+ start = None
+ offset = -1
+ started = False
+
+ import sys
+
+ stack = []
+
+ for i, tok in enumerate(tokenizer.tokens):
+ if start is None:
+ if tok.kind == CToken.NAME and self.regex.match(tok.value):
+ start = i
+ stack.append((start, tok.level))
+ started = False
+
+ continue
+
+ if not started and tok.kind == CToken.BEGIN:
+ started = True
+ continue
+
+ if tok.kind == CToken.END and tok.level == stack[-1][1]:
+ start, level = stack.pop()
+ offset = i
+
+ yield CTokenizer(tokenizer.tokens[start:offset + 1])
+ start = None
+
+ #
+ # If an END zeroing levels is not there, return remaining stuff
+ # This is meant to solve cases where the caller logic might be
+ # picking an incomplete block.
+ #
+ if start and offset < 0:
+ print("WARNING: can't find an end", file=sys.stderr)
+ yield CTokenizer(tokenizer.tokens[start:])
+
+ def search(self, source):
+ """
+ This is similar to re.search:
+
+ It matches a regex that is followed by a delimiter,
+ returning occurrences only if all delimiters are paired.
+ """
+
+ if isinstance(source, CTokenizer):
+ tokenizer = source
+ is_token = True
+ else:
+ tokenizer = CTokenizer(source)
+ is_token = False
+
+ for new_tokenizer in self._search(tokenizer):
+ if is_token:
+ yield new_tokenizer
+ else:
+ yield str(new_tokenizer)
+
+ def sub(self, sub, line, count=0):
+ """
+ This is similar to re.sub:
+
+ It matches a regex that is followed by a delimiter,
+ replacing occurrences only if all delimiters are paired.
+
+ if the sub argument contains::
+
+ r'\0'
+
+ it will work just like re: it places there the matched paired data
+ with the delimiter stripped.
+
+ If count is different from zero, it will replace at most count
+ items.
+ """
+ if isinstance(source, CTokenizer):
+ is_token = True
+ tokenizer = source
+ else:
+ is_token = False
+ tokenizer = CTokenizer(source)
+
+ new_tokenizer = CTokenizer()
+ cur_pos = 0
+ for start, end in self._search(tokenizer):
+ new_tokenizer.tokens += tokenizer.tokens[cur_pos:start]
+# new_tokenizer.tokens += [sub_str]
+
+ cur_pos = end + 1
+
+ if cur_pos:
+ new_tokenizer.tokens += tokenizer.tokens[cur_pos:]
+
+ print(new_tokenizer.tokens)
+
+ return str(new_tokenizer)
+
+ def __repr__(self):
+ """
+ Returns a displayable version of the class init.
+ """
+
+ return f'CMatch("{self.regex.regex.pattern}")'
diff --git a/tools/unittests/test_tokenizer.py b/tools/unittests/test_tokenizer.py
index efb1d1687811..3081f27a7786 100755
--- a/tools/unittests/test_tokenizer.py
+++ b/tools/unittests/test_tokenizer.py
@@ -30,8 +30,7 @@ def tokens_to_list(tokens):
if tok.kind == CToken.SPACE:
continue

- tuples += [(tok.kind, tok.value,
- tok.brace_level, tok.paren_level, tok.bracket_level)]
+ tuples += [(tok.kind, tok.value, tok.level)]

return tuples

--
2.52.0