From 931e3e681b51e26be2686838fb3a7ba6911be501 Mon Sep 17 00:00:00 2001 From: tonghuaroot Date: Fri, 3 Jul 2026 15:33:43 +0800 Subject: [PATCH 1/2] gh-152930: Make tomllib parse each table section in linear time key_value_rule re-walked the whole table header from the document root for every key under it, so a deep header followed by many keys parsed in time proportional to depth times keys. Resolve the section's flags and nest node once per header and walk only the key relative to it, making each section linear in depth plus keys. --- Lib/test/test_tomllib/test_misc.py | 41 +++++++++++ Lib/tomllib/_parser.py | 68 +++++++++++++++---- ...-07-03-07-33-24.gh-issue-152930.pFhzpO.rst | 3 + 3 files changed, 99 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index af7ab91bba7cd1f..fb1360bab1ad4bb 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -10,6 +10,7 @@ import sys import tempfile import textwrap +import time import unittest from test import support from test.support.script_helper import assert_python_ok @@ -18,6 +19,46 @@ class TestMiscellaneous(unittest.TestCase): + def test_many_keys_under_deep_header(self): + # gh-152930: many keys under one deep table header parse correctly. + depth = 200 + num_keys = 2000 + header = "[" + ".".join(f"h{i}" for i in range(depth)) + "]\n" + body = "".join(f"k{i} = {i}\n" for i in range(num_keys)) + nest = tomllib.loads(header + body) + for i in range(depth): + nest = nest[f"h{i}"] + self.assertEqual(len(nest), num_keys) + for i in (0, num_keys // 2, num_keys - 1): + with self.subTest(key=i): + self.assertEqual(nest[f"k{i}"], i) + + @support.requires_resource('cpu') + def test_deep_header_does_not_scale_with_depth(self): + # gh-152930: a deep table header must not be re-walked for every key + # under it. 
With the key count fixed, doubling the header depth should + # leave the parse time about the same (linear in depth + keys), not + # double it (proportional to depth * keys). The ratio is machine + # independent, so a generous bound tolerates load and slow platforms. + num_keys = 10000 + body = "".join(f"k{i} = 1\n" for i in range(num_keys)) + + def parse_time(depth): + doc = "[" + ".".join(f"h{i}" for i in range(depth)) + "]\n" + body + tomllib.loads(doc) # warm up + best = None + for _ in range(3): + start = time.perf_counter() + tomllib.loads(doc) + elapsed = time.perf_counter() - start + best = elapsed if best is None else min(best, elapsed) + return best + + max_depth = tomllib._parser.MAX_KEY_PARTS - 2 + shallow = parse_time(max_depth // 2) + deep = parse_time(max_depth) + self.assertLess(deep, shallow * 1.5) + def test_load(self): content = "one=1 \n two='two' \n arr=[]" expected = {"one": 1, "two": "two", "arr": []} diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index b89934808008efc..5795601e241c117 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -162,6 +162,9 @@ def loads(s: str, /, *, parse_float: ParseFloat = float) -> dict[str, Any]: pos = 0 out = Output() header: Key = () + # Resolve the current section once so per-key work does not re-walk `header`. 
+ header_flags = out.flags.resolve(header) + header_nest = out.data.get_or_create_nest(header) parse_float = make_safe_parse_float(parse_float) # Parse one statement at a time @@ -186,7 +189,9 @@ def loads(s: str, /, *, parse_float: ParseFloat = float) -> dict[str, Any]: pos += 1 continue if char in KEY_INITIAL_CHARS: - pos = key_value_rule(src, pos, out, header, parse_float) + pos = key_value_rule( + src, pos, out, header, header_flags, header_nest, parse_float + ) pos = skip_chars(src, pos, TOML_WS) elif char == "[": try: @@ -198,6 +203,8 @@ def loads(s: str, /, *, parse_float: ParseFloat = float) -> dict[str, Any]: pos, header = create_list_rule(src, pos, out) else: pos, header = create_dict_rule(src, pos, out) + header_flags = out.flags.resolve(header) + header_nest = out.data.get_or_create_nest(header) pos = skip_chars(src, pos, TOML_WS) elif char != "#": raise TOMLDecodeError("Invalid statement", src, pos) @@ -259,10 +266,25 @@ def set(self, key: Key, flag: int, *, recursive: bool) -> None: # noqa: A003 cont[key_stem] = {"flags": set(), "recursive_flags": set(), "nested": {}} cont[key_stem]["recursive_flags" if recursive else "flags"].add(flag) - def is_(self, key: Key, flag: int) -> bool: + def is_( + self, + key: Key, + flag: int, + *, + start: tuple[dict[Any, Any], dict[Any, Any] | None] | None = None, + ) -> bool: + # `start` is a section anchor from resolve(); `key` is then relative to + # it. create_dict_rule/create_list_rule reject a frozen section before + # resolve() runs, so an anchored walk sees the same flags as from root. 
+ if start is None: + cont: dict[Any, Any] = self._flags + inner: dict[Any, Any] | None = None + else: + cont, inner = start if not key: - return False # document root has no flags - cont = self._flags + if inner is None: + return False # document root has no flags + return flag in inner["flags"] or flag in inner["recursive_flags"] for k in key[:-1]: if k not in cont: return False @@ -276,6 +298,16 @@ def is_(self, key: Key, flag: int) -> bool: return flag in inner_cont["flags"] or flag in inner_cont["recursive_flags"] return False + def resolve( + self, header: Key + ) -> tuple[dict[Any, Any], dict[Any, Any] | None]: + inner: dict[Any, Any] | None = None + cont = self._flags + for k in header: + inner = cont[k] + cont = inner["nested"] + return cont, inner + class NestedDict: def __init__(self) -> None: @@ -287,8 +319,11 @@ def get_or_create_nest( key: Key, *, access_lists: bool = True, + start: dict[str, Any] | None = None, ) -> dict[str, Any]: - cont: Any = self.dict + # `start` anchors the walk at the current section's nest so `key` can be + # the part relative to the header instead of the absolute path. 
+ cont: Any = self.dict if start is None else start for k in key: if k not in cont: cont[k] = {} @@ -413,28 +448,35 @@ def create_list_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]: def key_value_rule( - src: str, pos: Pos, out: Output, header: Key, parse_float: ParseFloat + src: str, + pos: Pos, + out: Output, + header: Key, + header_flags: tuple[dict[Any, Any], dict[Any, Any] | None], + header_nest: dict[str, Any], + parse_float: ParseFloat, ) -> Pos: pos, key, value = parse_key_value_pair(src, pos, parse_float) key_parent, key_stem = key[:-1], key[-1] - abs_key_parent = header + key_parent - relative_path_cont_keys = (header + key[:i] for i in range(1, len(key))) - for cont_key in relative_path_cont_keys: + for i in range(1, len(key)): + rel = key[:i] # Check that dotted key syntax does not redefine an existing table - if out.flags.is_(cont_key, Flags.EXPLICIT_NEST): + if out.flags.is_(rel, Flags.EXPLICIT_NEST, start=header_flags): + cont_key = header + rel raise TOMLDecodeError(f"Cannot redefine namespace {cont_key}", src, pos) # Containers in the relative path can't be opened with the table syntax or # dotted key/value syntax in following table sections. 
- out.flags.add_pending(cont_key, Flags.EXPLICIT_NEST) + out.flags.add_pending(header + rel, Flags.EXPLICIT_NEST) - if out.flags.is_(abs_key_parent, Flags.FROZEN): + if out.flags.is_(key_parent, Flags.FROZEN, start=header_flags): + abs_key_parent = header + key_parent raise TOMLDecodeError( f"Cannot mutate immutable namespace {abs_key_parent}", src, pos ) try: - nest = out.data.get_or_create_nest(abs_key_parent) + nest = out.data.get_or_create_nest(key_parent, start=header_nest) except KeyError: raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None if key_stem in nest: diff --git a/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst b/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst new file mode 100644 index 000000000000000..8046961171101c3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst @@ -0,0 +1,3 @@ +:mod:`tomllib` no longer parses a deep table header in time proportional to +its depth for every key in the section, avoiding quadratic parse time on +documents with a deep header followed by many keys. Patch by tonghuaroot. From 84dc0708a43b2d7bd0aae59a0566f4cadd6f393c Mon Sep 17 00:00:00 2001 From: tonghuaroot Date: Fri, 3 Jul 2026 15:41:09 +0800 Subject: [PATCH 2/2] gh-152930: Trim comments and NEWS entry --- Lib/tomllib/_parser.py | 10 ++++------ .../2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst | 5 ++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index 5795601e241c117..376cd883587cb5d 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -162,7 +162,7 @@ def loads(s: str, /, *, parse_float: ParseFloat = float) -> dict[str, Any]: pos = 0 out = Output() header: Key = () - # Resolve the current section once so per-key work does not re-walk `header`. + # Resolve the section once; per-key work is then relative to it. 
header_flags = out.flags.resolve(header) header_nest = out.data.get_or_create_nest(header) parse_float = make_safe_parse_float(parse_float) @@ -273,9 +273,8 @@ def is_( *, start: tuple[dict[Any, Any], dict[Any, Any] | None] | None = None, ) -> bool: - # `start` is a section anchor from resolve(); `key` is then relative to - # it. create_dict_rule/create_list_rule reject a frozen section before - # resolve() runs, so an anchored walk sees the same flags as from root. + # `key` is relative to the section anchor `start`; the section is never + # frozen when resolve() runs, so this matches a walk from the root. if start is None: cont: dict[Any, Any] = self._flags inner: dict[Any, Any] | None = None @@ -321,8 +320,7 @@ def get_or_create_nest( access_lists: bool = True, start: dict[str, Any] | None = None, ) -> dict[str, Any]: - # `start` anchors the walk at the current section's nest so `key` can be - # the part relative to the header instead of the absolute path. + # `start` anchors the walk so `key` is relative to the section's nest. cont: Any = self.dict if start is None else start for k in key: if k not in cont: diff --git a/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst b/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst index 8046961171101c3..c2f1d903a71ee78 100644 --- a/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst +++ b/Misc/NEWS.d/next/Library/2026-07-03-07-33-24.gh-issue-152930.pFhzpO.rst @@ -1,3 +1,2 @@ -:mod:`tomllib` no longer parses a deep table header in time proportional to -its depth for every key in the section, avoiding quadratic parse time on -documents with a deep header followed by many keys. Patch by tonghuaroot. +:mod:`tomllib` now parses each table section in linear time instead of +re-walking the section header for every key. Patch by tonghuaroot.