From 1245c9828ef2d83e67f563fb853b9f5d9311405f Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 31 May 2026 13:14:42 -0400 Subject: [PATCH 001/152] Better parameterized test support. This is going to be pretty much a complete rewrite of the header value parser. As such, we want to do our best to ensure that we don't break anything. That means more tests. Writing tests for the email package is a bit enervating, given the number of different variations of input data and edge cases that really should be tested. I wrote the 'parameterize' helper to mitigate some of the pain of that test writing, but it was a pretty primitive tool and didn't really help all that much. To make it palatable to write more tests, I'm replacing that original, primitive decorator with a new parameterization framework informed by the years of test writing experience I've had since then. I'm pretty pleased with the result, which is conceptually based on a package I wrote a few years ago but never published outside of github. Its test suite gives a lot of usage examples, and it is much more intuitive than parameterize. As I move along in this refactor, there will be *many* more usage examples added to test__header_value_parser. 
--- Lib/test/test_email/params.py | 384 +++++++++ Lib/test/test_email/test_params.py | 1237 ++++++++++++++++++++++++++++ 2 files changed, 1621 insertions(+) create mode 100644 Lib/test/test_email/params.py create mode 100644 Lib/test/test_email/test_params.py diff --git a/Lib/test/test_email/params.py b/Lib/test/test_email/params.py new file mode 100644 index 000000000000000..6c3723850574b4f --- /dev/null +++ b/Lib/test/test_email/params.py @@ -0,0 +1,384 @@ +"""Support for Parameterized Tests""" + +import collections +from functools import wraps +from string import Formatter + + +class SafeFormatter(Formatter): + + def format(self, format_string, *args, **kw): + self.args = args + self.kw = kw + return super().format(format_string, *args, **kw) + + def parse(self, format_string): + for text, varname, spec, conv in super().parse(format_string): + if varname and varname not in self.kw: + spec = ':' + spec if spec else '' + conv = '!' + conv if conv else '' + text = text + '{' + varname + spec + conv + '}' + varname, spec, conv = None, None, None + yield text, varname, spec, conv + +safe_format = SafeFormatter().format + +def _fmt(fmtfunc, obj, subs): + if hasattr(obj, 'format'): + return safe_format(obj, **subs) + try: + i = iter(obj) + except TypeError: + return obj + if hasattr(obj, 'items'): + return type(obj)({k: fmtfunc(v, subs) for k, v in obj.items()}) + return type(obj)(fmtfunc(x, subs) for x in i) + +def fmt(obj, subs): + return _fmt( + lambda obj, subs: + safe_format(obj, **subs) if hasattr(obj, 'format') else obj, + obj, + subs, + ) + +def fmtall(obj, subs): + return _fmt(fmtall, obj, subs) + + +class C: + + """Call specification""" + + def __init__(self, *args, **kw): + """Return object holding a concrete set of arguments for a callable. + + Store any positional arguments as a tuple in self.args, and any + keyword arguments in as a dict in self.kw. 
+ + """ + self.args = args + self.kw = kw + + def __call__(self, func): + """Call func using the concrete arguments from self.args and self.kw""" + return func(*self.args, **self.kw) + + def __eq__(self, other): + try: + return self.args == other.args and self.kw == other.kw + except AttributeError: + return False + + def _repr(self, fname): + args = ', '.join(repr(arg) for arg in self.args) + kw = ', '.join(f'{k}={repr(v)}' for k, v in self.kw.items()) + return f"{fname}({', '.join(filter(None, (args, kw)))})" + + def __repr__(self): + return self._repr(type(self).__name__) + + def repr_call(self, func): + return self._repr(func.__name__) + + def fmt(self, **subs): + return C(*fmt(self.args, subs), **fmt(self.kw, subs)) + + def fmtall(self, **subs): + return C(*fmtall(self.args, subs), **fmtall(self.kw, subs)) + + +class Params(collections.UserDict): + + def __init__(self, *args, **kw): + super().__init__() + self.update(*args, **kw) + + def __repr__(self): + items = ', '.join(f'{k}={repr(v)}' for k, v in self.items()) + return f'{type(self).__name__}({items})' + + def update(self, *args, **kw): + for arg in args: + if not isinstance(arg, self.__class__): + raise TypeError( + f"Invalid argument {arg!r}, arguments" + f" must be of type {type(self).__name__}," + f" not {type(arg).__name__!r}" + ) + super().update(arg) + super().update(kw) + + def __setitem__(self, name, value): + if not name.isidentifier(): + raise ValueError( + f"parameter names must be identifiers, {name!r} is invalid", + ) + if name in self: + raise ValueError( + f"cannot add {name}={value!r}, a callspec with that name" + f" already exists" + ) + if not isinstance(value, C): + value = C(value) + super().__setitem__(name, value) + + +debug = False +_tupify = lambda x: x.items() if isinstance(x, Params) else [('', x)] +def _params_map(func, *, with_name=False, with_namelist=False, debug=False): + if not callable(func): + raise TypeError( + f"argument must be callable, not {type(func).__name__!r}" + 
) + if with_name and with_namelist: + raise ValueError("with_name and with_namelist cannot both be True") + @wraps(func) + def params_mapper(*args, **kw): + if __debug__ and debug: print(f'flattening using {func.__name__}') + if __debug__ and debug > 1: print(f'{args=} {kw=}') + param_set = Params() + for an, av in [t for a in [*args, Params(**kw)] for t in _tupify(a)]: + if debug: print(f'{an=} {av=}') + cs = C(*av.args, **av.kw) if isinstance(av, C) else C(av) + if with_name: + cs.args = (an, *cs.args) + elif with_namelist: + cs.args = (NameList(an), *cs.args) + try: + for n, v in cs(func): + try: + name = '__'.join(filter(None, (an, n))) + except Exception: + raise ValueError(f"Invalid label: {n!r}") from None + if __debug__ and debug: print(f'{n=} {v=} {name=}') + if not name: + raise ValueError('missing test label') + param_set[name] = v + except Exception as ex: + val = f'{an}={av!r}' if an else repr(av) + raise ValueError( + f'{func.__name__} failed on {val}', + ) from ex + return param_set + return params_mapper + +def params_map(*args, **kw): + if not kw: + return _params_map(*args) + else: + def _(func): + return _params_map(func, *args, **kw) + return _ + + +def only(*args): + yield ('', args[0]) if len(args) == 1 else args + + +@params_map +def as_value(name): + yield name, C(name) + + +@params_map(with_name=True) +def with_names(name, *args, **kw): + yield '', C(name, *args, **kw) + + +def for_each_name(*names): + @params_map + def for_each_name(*args, **kw): + for name in names: + yield name, C(name, *args, **kw) + return for_each_name + + +def add_label(label): + @params_map + def add_label(*args, **kw): + yield label, C(*args, **kw) + return add_label + + +def for_each_function(*functions): + @params_map + def for_each_function(*args, **kw): + for function in functions: + yield function.__name__, C(function, *args, **kw) + return for_each_function + + +# We could factor a common core out of these next two, but the error +# messages when the 
selection function fails would be more confusing. + +def include_if(include, *, label=''): + @params_map(with_name=True) + def include_if(name, *args, **kw): + if include(NameList(name), *args, **kw): + yield label, C(*args, **kw) + return include_if + +def include_unless(omit, *, label=''): + @params_map(with_name=True) + def include_unless(name, *args, **kw): + if omit(NameList(name), *args, **kw): + return + yield label, C(*args, **kw) + return include_unless + + +class NameList(list): + + def __new__(cls, name): + """Return a specialized list facilitating operations on test names. + + Split the test name into a list at '__' characters, so that it is at + base a list of the name components that were used to construct the name + by one or more param_maps, assuming that '__' has only been used in + names via param_maps concatenation. Calling string on the returned + object should yield the original name. + + """ + return super().__new__(cls) + + def __init__(self, name): + super().__init__(name.split('__') if name else []) + + def __str__(self): + return '__'.join(self) + + def has_any(self, *names): + """Return True if any name is an element of the name list + + names may be passed as a single tuple or a list of arguments. + """ + if len(names) == 1 and not hasattr(names[0], 'encode'): + names = names[0] + return any(name and name in self for name in names) + + + def has_all(self, *names): + """Return True if all of the names are elements of the name list""" + names = [n for n in names if n] + if not names: + return False + return all(name in self for name in names) + + +def params(*args, **kw): + """Mark decorated func so that it is called using specified C instances. + + If one or more dictionaries, and/or any keyword arguments are supplied, + raise an error if the dictionary keys are not unique across all arguments. + Combine these dictionaries, adding the name of the test followed by '__' as + a prefix to each of the dictionary keys. 
If called with the function as + the only argument, create an empty dictionary instead. If the class + contains attributes whose names starts with 'params_' plus the name of the + decorated function, add the attribute name with 'params_' removed from the + front and '__' added to the end as a prefix to each key in the parameter's + value. If the resultant names duplicate the names in the existing combined + dictionary, raise an error. Otherwise add them to the combined dictionary. + + For each element of the combined dictionary, create a function whose name + is the key, where the result of calling the function is to call the wrapped + function with the arguments specified by the C instance (or equivalent) + that must be the value of each dictionary entry. + + """ + if len(args) == 1 and not kw and callable( (func:= args[0]) ): + func._params_ = False + return func + def params_decorator(func): + func._params_ = Params(*args, **kw) + return func + return params_decorator + + +class ParamsMixin: + + """XXX docstring goes here once I write the docs.""" + + paramsAttributePrefix = 'params_' + paramsDebug = False + paramsRequired = True + + @classmethod + def __init_subclass__(cls, *args, **kwargs): + """Turn each test decorated with @params into a series of tests. + + """ + super().__init_subclass__(*args, **kwargs) + params_func_attrs = {} + params_attrs = {} + for name, attr in cls.__dict__.items(): + if hasattr(attr, '_params_'): + params_func_attrs[name] = attr + if __debug__ and cls.paramsDebug: + print(f'@params method {name!r}') + elif name.startswith(cls.paramsAttributePrefix): + if __debug__ and cls.paramsDebug: + print(f'{cls.paramsAttributePrefix} attribute {name!r}') + params_attrs[name] = attr + # Associate the params_ with the function with the matching name. 
+ params = collections.defaultdict(list) + for pname, paramset in params_attrs.items(): + if not isinstance(paramset, Params): + raise ValueError( + f'value of params constant {pname} must be a Params' + f' dictionary, not {type(paramset)}' + ) + n = pname.removeprefix(cls.paramsAttributePrefix) + # Loop, in case the test name has one or more '__' in it... + tn = [] + while n: + if n in params_func_attrs: + break + if '__' not in n: + raise ValueError(f'No @params test found for {pname!r}') + n, t = n.rsplit('__', 1) + if cls.paramsDebug: + print(f'{n=} {t=}') + tn.insert(0, t) + params[n].append(('__'.join(tn), paramset)) + for fname, func in params_func_attrs.items(): + if __debug__ and cls.paramsDebug: + print( + f"{fname!r} has{'' if func._params_ else ' no'} decorator" + f" params and {(n := len(params[fname]))}" + f" {cls.paramsAttributePrefix}" + f" attribute{'' if n == 1 else 's'}", + ) + all_params = func._params_ + if all_params is False: + if not params[fname]: + raise ValueError(f'No params found for {fname!r}') + all_params = Params() + for pn, ps in params[fname]: + try: + pr = pn + '__' if pn else pn + all_params.update( + **{f'{pr}{n}' if pn else n: v for n, v in ps.items()} + ) + except Exception as ex: + raise ValueError( + f"error combining '{cls.paramsAttributePrefix}" + f"{fname}{'__' if pn else ''}{pn}'" + f" with existing params" + ) from ex + if cls.paramsRequired and not all_params: + raise ValueError( + f"paramsRequired is set and {fname!r} has no params", + ) + impl_name = '__' + fname + delattr(cls, fname) + setattr(cls, impl_name, func) + for (test_name, callspec) in all_params.items(): + test = ( + lambda self, impl_name=impl_name, callspec=callspec: + callspec(getattr(self, impl_name)) + ) + test.__name__ = fname + '__' + test_name + setattr(cls, test.__name__, test) + if __debug__ and cls.paramsDebug: + print(f'generated {callspec.repr_call(test)}') diff --git a/Lib/test/test_email/test_params.py b/Lib/test/test_email/test_params.py 
# Lib/test/test_email/test_params.py
import re
import unittest
from contextlib import contextmanager
from test.support import captured_stdout
from test.test_email.params import (
    add_label,
    as_value,
    C,
    for_each_function,
    for_each_name,
    fmt,
    fmtall,
    include_if,
    NameList,
    include_unless,
    only,
    params,
    Params,
    params_map,
    ParamsMixin,
    with_names,
    )
from textwrap import dedent

TYPED_VALUES = with_names(int=1, dict=dict(a=1), C=C(C(1)), tuple=(1, 2))


class AssertMixin:

    """Additional assertions, for mixing in ahead of unittest.TestCase."""

    @contextmanager
    def assertRaisesRegexEx(self, ex, re, cause_ex=None, cause_re=None):
        """Like assertRaisesRegex, but optionally also check __cause__.

        ex and re are passed to assertRaisesRegex.  If cause_ex is given, the
        raised exception's __cause__ must be an instance of it; if cause_re is
        given, the string form of the __cause__ must match it.  Either may be
        supplied without the other.
        """
        with super().assertRaisesRegex(ex, re) as cm:
            yield
        if cause_ex is None and cause_re is None:
            return
        cause = cm.exception.__cause__
        self.assertIsNotNone(cause)
        if cause_ex is not None:
            self.assertIsInstance(cause, cause_ex)
        # Checked separately so that supplying only one of the two cause
        # arguments neither crashes in assertRegex nor is silently ignored.
        if cause_re is not None:
            self.assertRegex(str(cause), cause_re)


# We eat our own dogfood here, which could make bugs a bit confusing to sort
# out.  But it exercises the machinery pretty well, and demonstrates the power
# of the framework.  And we get much better test coverage out of it.
+ + +class TestFmt(ParamsMixin, unittest.TestCase): + + test_strings = ( + ('', '', '' ), + ('no sub point', 'no sub point', 'no sub point' ), + ('sub an {a}', 'sub an a', 'sub an a' ), + ('{a} and {foo!r}', "a and {foo!r}", "a and 'foo'" ), + ('{x} {y} {z:02}', '{x} 2 {z:02}', '1 2 03' ), + ) + sub = (dict(), dict(a='a', y=2), dict(foo='foo', x=1, z=3)) + expected = list(zip(*test_strings)) + make_list = lambda v: [C(1), *v, [1, 2, 3]] + make_dict = lambda v: dict( + {f'v{i}': v for i, v in enumerate(v)}, foo=C(1), bar=[1, 2, 3], + ) + + substitution_cases = Params( + no_subs = C(expected[0], dict(), expected[0]), + non_subs = C(expected[0], dict(zzz='foo', yyy='bar'), expected[0]), + one_sub = C(expected[0], sub[1], expected[1]), + all_sub = C(expected[0], sub[1] | sub[2], expected[2]), + ) + + @params( + for_each_function(fmt, fmtall)( + params_map(lambda obj, subs, expected, ml=make_list, md=make_dict: + [ + ('list', C(ml(obj), subs, ml(expected))), + ('dict', C(md(obj), subs, md(expected))), + ] + )(substitution_cases), + ) + ) + def test(self, fmter, obj, subs, expected): + self.assertEqual(fmter(obj, subs), expected) + + @params( + fmt_list = C(fmt, make_list), + fmtall_list = C(fmtall, make_list), + fmt_dict = C(fmt, make_dict), + fmtall_dict = C(fmtall, make_dict), + ) + def test_multiple_passes(self, fmter, maker): + unmodified = maker(self.expected[0]) + v0 = fmter(unmodified, dict()) + self.assertEqual(v0, unmodified) + v1 = fmter(v0, dict(foobar=99) | self.sub[1]) + self.assertEqual(v1, maker(self.expected[1])) + v2 = fmter(v1, dict(foobar=99, y=9) | self.sub[2]) + self.assertEqual(v2, maker(self.expected[2])) + + nested = [('{a}', '{b}'), dict(a='{a}', b=['{b}']), [[1, ['{b}']]]] + nested_subs = dict(a=1, b=2) + nested_subbed = [('1', '2'), dict(a='1', b=['2']), [[1, ['2']]]] + + def test_fmt_does_not_recurse(self): + self.assertEqual(fmt(self.nested, self.nested_subs), self.nested) + + def test_fmtall_recurses(self): + self.assertEqual( + 
fmtall(self.nested, self.nested_subs), + self.nested_subbed, + ) + + +class TestC(ParamsMixin, unittest.TestCase): + + def test_empty_C(self): + p = C() + self.assertEqual(p.args, tuple()) + self.assertEqual(p.kw, {}) + + def test_args_only(self): + p = C('a', 2) + self.assertEqual(p.args, ('a', 2)) + self.assertEqual(p.kw, {}) + + def test_kw_only(self): + p = C(b='a', n=2) + self.assertEqual(p.args, tuple(), p.args) + self.assertEqual(p.kw, dict(b='a', n=2)) + + def test_args_and_kw(self): + p = C(1, 2, b='a', n=2) + self.assertEqual(p.args, (1, 2)) + self.assertEqual(p.kw, dict(b='a', n=2)) + + def test_callable(self): + p = C(1, 2, b='a', n=2) + res = [] + def tester(arg1, arg2, b=None, n=None): + res.extend([arg1, arg2, b, n]) + p(tester) + self.assertEqual(res, [1, 2, 'a', 2]) + + @params( + missing_arg = C(lambda a: 1, C(), "(?i)(?=.*'a')(?=.*missing)"), + extra_arg = C(lambda: 1, C(1), "(?i)(?=.*0)(?=.*positional)"), + missing_kw = C(lambda *, x: 1, C(), "(?i)(?=.*'x')(?=.*missing)"), + extra_kw = C(lambda: 1, C(x=1), "(?i)(?=.*'x')(?=.*unexpected)"), + ) + def test_arguments_mismatch(self, f, cs, msg): + with self.assertRaisesRegex(TypeError, msg): + cs(f) + + expected_reprs = Params( + no_arg = C(C(), "{fn}()"), + one_arg = C(C(1), "{fn}(1)"), + two_args = C(C(1, 2), "{fn}(1, 2)"), + two_str_args = C(C('1', '2'), "{fn}('1', '2')"), + one_kw = C(C(a=1), "{fn}(a=1)"), + two_kw = C(C(a=1, b=2), "{fn}(a=1, b=2)"), + two_str_kw = C(C(a='1', b='2'), "{fn}(a='1', b='2')"), + one_each = C(C(1, a='1'), "{fn}(1, a='1')"), + two_each = C(C(1, 2, a='1', b=3), "{fn}(1, 2, a='1', b=3)"), + ) + + @params(expected_reprs) + def test_repr(self, callspec, expected_repr): + self.assertEqual(repr(callspec), expected_repr.format(fn='C')) + + @params(expected_reprs) + def test_repr_call(self, callspec, expected_repr): + f = lambda: 1 + self.assertEqual( + callspec.repr_call(f), + expected_repr.format(fn=''), + ) + + @params( + params_map(lambda cs, _: only(C(cs, 
cs)))(expected_reprs), + dict_reversed = C(C(dict(a=1, b=2)), C(dict(b=2, a=1))), + kws_reversed = C(C(a=1, b=2, c=3), C(c=3, b=2, a=1)), + ) + def test_eq(self, callspec1, callspec2): + self.assertEqual(callspec1, callspec2) + + @params( + one_arg_value_mismatch = C(C(1), C(2)), + two_arg_value_mismatch = C(C(1, a='2'), C(2, a='1')), + two_arg_value_mismatch2 = C(C(1, a='2'), C(1, a='1')), + arg_count_mismatch = C(C(1, 2), C(1)), + arg_type_mismatch = C(C('1', '2'), C(1, 2)), + kw_name_mismatch = C(C(a=1), C(b=1)), + kw_count_mismatch = C(C(a=1, b=2), C(a=1)), + kw_type_mismatch = C(C(a='1', b='2'), C(a=1, b=2)), + non_callspec = C(C(1), 1), + ) + def test_neq(self, callspec1, callspec2): + self.assertNotEqual(callspec1, callspec2) + + def test_args_is_settable(self): + cs = C('a', 'b') + cs.args = ('c', 'd') + self.assertEqual(cs(lambda *args: args), ('c', 'd')) + + def test_kw_is_settable(self): + cs = C(b=1, c=2) + cs.kw = dict(c=3, d=3) + self.assertEqual(cs(lambda *_, **kw: kw), dict(c=3, d=3)) + + def test_kw_is_mutable(self): + cs = C(b=1, c=2) + cs.kw['b'] = 2 + cs.kw.update(z=7) + self.assertEqual(cs(lambda *_, **kw: kw), dict(b=2, c=2, z=7)) + + @params(for_each_name('fmt', 'fmtall')(TestFmt.substitution_cases)) + def test_fmt(self, fmtname, unmodified, subs, expected): + cs = C(*TestFmt.make_list(unmodified), **TestFmt.make_dict(unmodified)) + exp = C(*TestFmt.make_list(expected), **TestFmt.make_dict(expected)) + self.assertEqual(getattr(cs, fmtname)(**subs), exp) + + def test_fmt_does_not_recurse(self): + unmodified = C(*TestFmt.nested) + self.assertEqual(unmodified.fmt(**TestFmt.nested_subs), unmodified) + + def test_fmtall_recurses(self): + self.assertEqual( + C(*TestFmt.nested).fmtall(**TestFmt.nested_subs), + C(*TestFmt.nested_subbed), + ) + + +class TestParams(ParamsMixin, unittest.TestCase): + + @params( + ints=C( + C(a=1, b=2, c=3), + expected=dict(a=C(1), b=C(2), c=C(3)), + ), + cs=C( + C(a=C(1), b=C(2), c=C(3)), + expected=dict(a=C(1), b=C(2), 
c=C(3)), + ), + dict_and_kw=C( + C(z=dict(a=C(1), b=2), c=C(3)), + expected=dict(z=C(dict(a=C(1), b=2)), c=C(3)), + ), + params_and_kw=C( + C(Params(z=dict(a=C(1), b=2)), c=C(3)), + expected=dict(z=C(dict(a=C(1), b=2)), c=C(3)), + ), + params_as_c_arg=C( + C(z=C(Params(y=dict(a=C(1), b=2)))), + expected=dict(z=C(Params(y=C(dict(a=C(1), b=2))))), + ), + params_as_kw=C( + C(z=Params(y=dict(a=C(1), b=2))), + expected=dict(z=C(Params(y=C(dict(a=C(1), b=2))))), + ), + ) + def test_valid_data(self, callspec, expected): + result = callspec(Params) + self.assertEqual(dict(result), expected) + self.assertIsInstance(result, Params) + + def test_repr(self): + self.assertEqual( + repr(Params(a=1, b=C(3), z=dict(a=1, b=2))), + "Params(a=C(1), b=C(3), z=C({'a': 1, 'b': 2}))", + ) + + @params(int=1, dict=dict(a=1), params=Params(a=1), c=C(1), d=C(Params(z=1))) + def test_setitem_results_in_c(self, value): + p = Params() + p['foo'] = value + self.assertEqual(p['foo'], value if isinstance(value, C) else C(value)) + + @params_map + def for_init_and_update(cs, msg): + yield 'init', C(Params, cs, msg) + yield 'update', C(Params().update, cs, msg) + + @params( + for_init_and_update( + params_kw = C(C(Params(a=1), a=7), msg=r'a=7'), + two_params = C(C(Params(a=1), Params(a=7)), msg=r'a=C\(7\)'), + ), + setitem_duplicate = C(Params(a=1).__setitem__, C('a', 7), msg='a=7'), + setitem_non_identifier = C(Params().__setitem__, C('0', 7), msg='0'), + ) + def test_duplicate_keys_disallowed(self, meth, callspec, msg): + with self.assertRaisesRegex(ValueError, msg): + callspec(meth) + + @params(for_init_and_update(TYPED_VALUES)) + def test_invalid_data_types(self, meth, typ, val): + msg = f'(?=.*1)(?=.*Params)(?=.*{typ})' + with self.assertRaisesRegex(TypeError, msg): + meth(val) + + +class TestParameterizingTests(AssertMixin, ParamsMixin, unittest.TestCase): + + def _test_success(self, testcase, testname): + res = unittest.TestResult() + testcase(methodName=testname).run(res) + 
self.assertEqual(res.testsRun, 1) + self.assertEqual(res.failures, []) + self.assertEqual(res.errors, []) + self.assertTrue(res.wasSuccessful) + + def _test_error(self, testcase, testname, expected_error_regex): + res = unittest.TestResult() + testcase(methodName=testname).run(res) + self.assertEqual(res.testsRun, 1) + self.assertEqual(res.failures, []) + self.assertEqual(len(res.errors), 1, "wrong number of errors raised") + self.assertRegex(res.errors[0][1], expected_error_regex) + self.assertFalse(res.wasSuccessful()) + + def test_normal_tests_run(self): + check = [] + class Test(ParamsMixin, unittest.TestCase): + def test_normal_tests_run(self): + check.append(1) + self._test_success(Test, 'test_normal_tests_run') + self.assertEqual(check, [1]) + + class ParameterizeFixture(ParamsMixin, unittest.TestCase): + + @params(a=C(1), b=C(2), c=C(3)) + def test_kw(self, value): + self.check.append(value) + + @params(Params(a=C(1), b=C(2), c=C(3))) + def test_arg(self, value): + self.check.append(value) + + @params + def test_params(self, value): + self.check.append(value) + params_test_params = Params(a=C(1), b=C(2), c=C(3)) + + @params(Params(a=C(1)), b=C(2)) + def test_multiple_sources(self, value): + self.check.append(value) + params_test_multiple_sources = Params(c=C(3)) + + @params(Params(a=C(1), b=C(2)), Params(c=C(3), d=C(4)), e=C(5), f=C(6)) + def test_multiple_multiple(self, value): + self.check.append(value) + params_test_multiple_multiple = Params(g=C(7), h=C(8)) + params_test_multiple_multiple__more = Params(i=C(9), h=C(10)) + + expected_names = [ + *[ + f'test_{n}__{k}' + for k in 'abc' + for n in ('kw', 'arg', 'params', 'multiple_sources') + ], + *[f'test_multiple_multiple__{k}' for k in 'abcdefgh'], + *[f'test_multiple_multiple__more__{k}' for k in 'ih'], + ] + + @params( + as_value('kw', 'arg', 'params', 'multiple_sources'), + multiple_multiple=C( + 'multiple_multiple', + expected={c: n for n, c in enumerate('abcdefgh', 1)}, + ), + 
multiple_multiple_more=C( + 'multiple_multiple__more', + expected={c: n for n, c in enumerate('ih', 9)}, + ) + ) + def test_parameterization(self, name, expected=dict(a=1, b=2, c=3)): + self.ParameterizeFixture.check = [] + values = [] + for k, v in expected.items(): + self._test_success(self.ParameterizeFixture, f'test_{name}__{k}') + values.append(v) + self.assertEqual(self.ParameterizeFixture.check, values) + with self.assertRaisesRegex( + ValueError, + r'(?i)(?=.*no.*test.*method)(?=.*test_{name}__bad)', + ): + self.ParameterizeFixture('test_{name}__bad') + + class RawValueFixture(ParamsMixin, unittest.TestCase): + + expected = dict(a=1, b=(2, 3), c=dict(z=4)) + + @params(**expected) + def test_kw(self, value): + self.check.append(value) + + @params(Params(**expected)) + def test_arg(self, value): + self.check.append(value) + + @params + def test_params(self, value): + self.check.append(value) + params_test_params = Params(**expected) + + @params_map + def identity(v): + yield '', v + @params + def test_params_map(self, value): + self.check.append(value) + params_test_params_map = identity(**expected) + + expected_names = [ + f'test_{n}__{k}' + for k in expected + for n in ('kw', 'arg', 'params', 'params_map') + ] + + @params(as_value('kw', 'arg', 'params', 'params_map')) + def test_raw_values_are_handled(self, name): + self.RawValueFixture.check = [] + values = [] + for k, v in self.RawValueFixture.expected.items(): + self._test_success(self.RawValueFixture, f'test_{name}__{k}') + values.append(v) + self.assertEqual(self.RawValueFixture.check, values) + + class ParamsAttributeFixture(ParamsMixin, unittest.TestCase): + + @params + def test(self, n): + self.check.append(('test', n)) + params_test = Params(a='test') + params_test__more = Params(a='test') + params_test__more__still = Params(a='test') + + @params + def test_(self, n): + self.check.append(('test', n)) + params_test_ = Params(a='test') + params_test___more = Params(a='test') + 
params_test___more__still = Params(a='test') + + @params + def test__(self, n): + self.check.append(('test__', n)) + params_test__ = Params(a='test__') + params_test____more = Params(a='test__') + params_test____more__still = Params(a='test__') + + @params + def test___(self, n): + self.check.append(('test___', n)) + params_test___ = Params(a='test___') + params_test_____more = Params(a='test___') + params_test_____more__still = Params(a='test___') + + @params + def test____(self, n): + self.check.append(('test____', n)) + params_test____ = Params(a='test____') + params_test______more = Params(a='test____') + params_test______more__still = Params(a='test____') + + @params + def test__foo(self, n): + self.check.append(('test__foo', n)) + params_test__foo = Params(a='test__foo') + params_test__foo__more = Params(a='test__foo') + params_test__foo__more__still = Params(a='test__foo') + + @params + def test___foo(self, n): + self.check.append(('test___foo', n)) + params_test___foo = Params(a='test___foo') + params_test___foo__more = Params(a='test___foo') + params_test___foo__more__still = Params(a='test___foo') + + @params + def test____foo(self, n): + self.check.append(('test____foo', n)) + params_test____foo = Params(a='test____foo') + params_test____foo__more = Params(a='test____foo') + params_test____foo__more__still = Params(a='test____foo') + + @params + def test_foo__bar(self, n): + self.check.append(('test_foo__bar', n)) + params_test_foo__bar = Params(a='test_foo__bar') + params_test_foo__bar__more = Params(a='test_foo__bar') + params_test_foo__bar__more__still = Params(a='test_foo__bar') + + @params + def test_foo__bar__baz(self, n): + self.check.append(('test_foo__bar__baz', n)) + params_test_foo__bar__baz = Params(a='test_foo__bar__baz') + params_test_foo__bar__baz__more = Params(a='test_foo__bar__baz') + params_test_foo__bar__baz__more__still = Params(a='test_foo__bar__baz') + + expected_names = [ + 'test__a', + 'test__more__a', + 'test__more__still__a', + 
'test___a', + 'test___more__a', + 'test___more__still__a', + 'test____a', + 'test____more__a', + 'test____more__still__a', + 'test_____a', + 'test_____more__a', + 'test_____more__still__a', + 'test______a', + 'test______more__a', + 'test______more__still__a', + 'test__foo__a', + 'test__foo__more__a', + 'test__foo__more__still__a', + 'test___foo__a', + 'test___foo__more__a', + 'test___foo__more__still__a', + 'test____foo__a', + 'test____foo__more__a', + 'test____foo__more__still__a', + 'test_foo__bar__a', + 'test_foo__bar__more__a', + 'test_foo__bar__more__still__a', + 'test_foo__bar__baz__a', + 'test_foo__bar__baz__more__a', + 'test_foo__bar__baz__more__still__a', + ] + + @params(as_value(*ParamsAttributeFixture.expected_names)) + def test_params_attach_to_correct_tests(self, name): + self.ParamsAttributeFixture.check = [] + self._test_success(self.ParamsAttributeFixture, name) + self.assertEqual(*self.ParamsAttributeFixture.check[0]) + + @params( + for_each_function( + ParameterizeFixture, + RawValueFixture, + ParamsAttributeFixture + )(C()), + ) + def test_names_are_as_expected(self, fixture): + test_names = [x for x in dir(fixture) if x.startswith('test_')] + self.assertEqual(sorted(test_names), sorted(fixture.expected_names)) + + def test_empty_parameters_is_an_error_by_default(self): + msg = r"(?i)(?=.*'test_foo')(?=.*no.*param)" + with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + @params() + def test_foo(self): + pass + params_test_foo = Params() + + def test_empty_decorator_is_ok_when_check_disabled(self): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params() + def test_foo(self): + pass + + def test_empty_parameters_is_ok_when_check_disabled(self): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params + def test_foo(self): + pass + params_test_foo = Params() + + def test_no_parameter_sets_is_an_error(self): + msg = r"(?i)(?=.*no.*param)(?=.*'test_foo')" + 
with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params + def test_foo(self): + pass + + def test_no_parameter_sets_is_an_error_even_when_check_disabled(self): + msg = r"(?i)(?=.*no.*param)(?=.*'test_foo')" + with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params + def test_foo(self): + pass + + def test_params_and_no_decorator_is_an_error(self): + with self.assertRaisesRegex( + ValueError, r'(?i)(?=.*params_test_foo)(?=.*no.*test)', + ): + class Test(ParamsMixin, unittest.TestCase): + def test_foo(self): + pass + params_test_foo = Params() + + def test_params_with_no_exactly_matching_test_is_an_error(self): + with self.assertRaisesRegex( + ValueError, r'(?i)(?=.*params_test_foo_bar)(?=.*no.*test)', + ): + class Test(ParamsMixin, unittest.TestCase): + @params + def test_foo(self): + pass + params_test_foo_bar = Params() + + def test_params_args_keys_must_differ(self): + with self.assertRaisesRegex(ValueError, r'ggg=.*6'): + class Test(ParamsMixin, unittest.TestCase): + @params(Params(xzy=1, b=2, ggg=3), Params(ggg=6, xzy=7)) + def test_foo(self): + pass + + def test_params_args_keys_must_differ_from_kws(self): + with self.assertRaisesRegex(ValueError, r'ggg=.*6'): + class Test(ParamsMixin, unittest.TestCase): + @params(Params(xzy=1, b=2, ggg=3), ggg=6, xzy=7) + def test_foo(self): + pass + + def test_params_args_keys_must_differ_from_params_attr_keys(self): + with self.assertRaisesRegexEx( + ValueError, r'params_test_foo', + ValueError, r'ggg=.*6', + ): + class Test(ParamsMixin, unittest.TestCase): + @params(Params(xzy=1, b=2, ggg=3)) + def test_foo(self): + pass + params_test_foo = Params(ggg=6, xzy=7) + + def test_kws_must_differ_from_params_attr_keys(self): + with self.assertRaisesRegexEx( + ValueError, r'params_test_foo', + ValueError, r'ggg=.*6', + ): + class Test(ParamsMixin, unittest.TestCase): + @params(xzy=1, b=2, 
ggg=3) + def test_foo(self): + pass + params_test_foo = Params(ggg=6, xzy=7) + + def test_params_attr_keys_must_differ(self): + with self.assertRaisesRegexEx( + ValueError, r"'params_test_bar__foo'", + ValueError, r'ggg=.*6', + ): + class Test(ParamsMixin, unittest.TestCase): + @params + def test_bar(self): + pass + params_test_bar = Params(foo__ggg=6, xzy=7) + params_test_bar__foo = Params(ggg=6, xzy=7) + + @params(TYPED_VALUES) + def test_non_params_arg_to_decorator_is_invalid(self, typ, val): + msg = fr'(?=.*1)(?=.*Params)(?=.*{typ})' + with self.assertRaisesRegex(TypeError, msg): + class Test(ParamsMixin, unittest.TestCase): + # we have to have a dummy argument here because unlike any + # normal call we'd otherwise only be passing one argument, and + # when we pass params exactly one callable it will think it is + # supposed to wrap it. Which is what it should do, but not + # what we are testing here. + @params(Params(dummy=1), val) + def test_bad_arg(self): + pass + + @params(TYPED_VALUES) + def test_non_params_value_for_params_attr_is_invalid(self, typ, val): + msg = fr'(?i)(?=.*params_test_bad_value)(?=.*not.*{typ})' + with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + params_test_bad_value = val + + def test_debug(self): + with captured_stdout() as stdout: + class Test(ParamsMixin, unittest.TestCase): + paramsDebug = True + paramsRequired = False + def test_dummy(): pass + @params + def test_foo(self, a): pass + params_test_foo = Params(x=7, y=3) + @params(a=1, b=2) + def test_bar(self, z): pass + params_test_bar = Params(c=4, d=6) + self.assertEqual( + stdout.getvalue(), + # Making this an exact match means any change to the debug + # output requires a change here. On the other hand, that also + # means that temporary changes to the debug output during bug + # fixing in params_map itself will be caught by this test so + # they don't sneak in to production code unintentionally. 
+ dedent("""\ + @params method 'test_foo' + params_ attribute 'params_test_foo' + @params method 'test_bar' + params_ attribute 'params_test_bar' + 'test_foo' has no decorator params and 1 params_ attribute + generated test_foo__x(7) + generated test_foo__y(3) + 'test_bar' has decorator params and 1 params_ attribute + generated test_bar__a(1) + generated test_bar__b(2) + generated test_bar__c(4) + generated test_bar__d(6) + """) + ) + + +class Test_params_map(AssertMixin, ParamsMixin, unittest.TestCase): + + @params + def test(self, callspec, expected): + i = 0 + @params_map + def numbered_params(*args, **kw): + nonlocal i + # With this 'if' we test params_map handling being handed raw data + yield f't{i}', args[0] if len(args) == 1 else C(*args, **kw) + i += 1 + result = callspec(numbered_params) + self.assertEqual(dict(result), expected) + self.assertIsInstance(result, Params) + + params_test__value_wrapping = Params( + string = C( C('abc'), dict(t0=C('abc')) ), + char = C( C(C('a')), dict(t0=C('a')) ), + tuple = C( C(('a', 2)), dict(t0=C(('a', 2))) ), + list = C( C(['b', 7]), dict(t0=C(['b', 7])) ), + dict = C( C(dict(a=1, b=2)), dict(t0=C(dict(a=1, b=2))) ), + multiple = C( C(4, 7, 9), dict(t0=C(4), t1=C(7), t2=C(9)) ), + kw_only = C( C(x=1, y=C(7)), dict(x__t0=C(1), y__t1=C(7)) ), + mixed = C( + C(1, (3, 5), z=[0, 1]), + dict(t0=C(1), t1=C((3, 5)), z__t2=C([0, 1])), + ), + mixed2 = C( + C(4, z=7, b=9), + dict(t0=C(4), z__t1=C(7), b__t2=C(9)), + ), + ) + + params_test__flattening = Params( + one_pset = C( + C(Params(a=1, b=2)), + dict(a__t0=C(1), b__t1=C(2)), + ), + pset_and_duplicator = C( + C( + Params(x=1, y=2, z=3), + params_map(lambda v: [('z', v), ('x', v)])( + Params(a='a', b='b'), + ), + ), + dict( + x__t0=C(1), + y__t1=C(2), + z__t2=C(3), + a__z__t3=C('a'), + a__x__t4=C('a'), + b__z__t5=C('b'), + b__x__t6=C('b'), + ), + ), + two_psets_and_kewords = C( + C(Params(a=0, b=1), Params(c=2, d=3), e=4, f=5), + {f"{chr(ord('a')+i)}__t{i}": C(i) for i in 
range(6)}, + ), + ) + + params_test__only = Params( + no_extra_name = C( + C(params_map(lambda v: only(C(v+1)))(a=1, b=2)), + dict(a__t0=C(2), b__t1=C(3)), + ), + adds_name = C( + C(params_map(lambda v: only('z', C(v+1)))(a=1, b=2)), + dict(a__z__t0=C(2), b__z__t1=C(3)), + ), + generates_name = C( + C(params_map(lambda v: only(chr(ord('b') + v), C(v+1)))(1, 2)), + dict(c__t0=C(2), d__t1=C(3)), + ), + ) + + def test_output_can_be_zero_or_many(self): + @params_map(with_name=True) + def zero_or_many(name, *args, **kw): + if name == 'skip': + return + if name == 'dup': + yield '1', C(*args, **kw) + yield '2', C(*args, **kw) + yield '3', C(*args, **kw) + else: + yield '', C(*args, **kw) + self.assertEqual( + zero_or_many(dup=C(1), skip=C(2), other=C(3)), + dict(dup__1=C(1), dup__2=C(1), dup__3=C(1), other=C(3)), + ) + + def test_composing_maps(self): + @params_map + def add_args(foo, bar): + yield foo, C(foo + bar) + @params_map + def no_zed(v): + yield '', C(v.removesuffix('zed')) + round1 = add_args(a=C('abc', 'de'), b=C('x', 'zed')) + self.assertEqual(round1, dict(a__abc=C('abcde'), b__x=C('xzed'))) + self.assertEqual(no_zed(round1), dict(a__abc=C('abcde'), b__x=C('x'))) + + @params( + repeated_name = C( C('a', ('a', 'a')), err="'a'" ), + colliding_names = C( C('a__a', a=C('a')), err=r"a=C\('a'\)" ), + null_name = C( C(''), err="''" ), + good_before_dup = C( C('a', 'b', 'c', 'c'), err="'c'" ), + empty_in_middle = C( C('a', 'b', '', 'c'), err="''" ), + ) + def test_names_must_be_unique(self, callspec, err): + @params_map + def yield_name(*args, **kw): + yield args[0], "doesn't matter" + with self.assertRaisesRegex(ValueError, err): + callspec(yield_name) + + @params + def test_with_name(self, callspec, expected): + @params_map(with_name=True) + def use_first_arg_if_no_name(n, *args, **kw): + label = '' if n else args[0] + yield label, C(*args, **kw) + self.assertEqual(callspec(use_first_arg_if_no_name), expected) + + params_test_with_name = Params( + named = C( C(a=1, 
b=2), expected=Params(a=C(1), b=C(2))), + noname = C( C('a', 'b'), expected=Params(a=C('a'), b=C('b'))), + mixed = C( C('a', b=2), expected=Params(a=C('a'), b=C(2))), + ) + + @params( + params_test_with_name, + cskip = C( C('z', c=1), expected=Params(z=C('z'))), + xyskip = C( C(b=1, x__y='a'), expected=Params(b=C(1))), + noxskip = C(C('m', x__b='a'), expected=Params(m=C('m'), x__b='a')), + ) + def test_with_namelist(self, callspec, expected): + @params_map(with_namelist=True) + def use_first_arg_or_skip_on_c_d_xy(nl, *args, **kw): + self.assertIsInstance(nl, NameList) + if nl.has_any('c', 'd') or nl.has_all('x', 'y'): + return + label = '' if nl else args[0] + yield label, C(*args, **kw) + self.assertEqual(callspec(use_first_arg_or_skip_on_c_d_xy), expected) + + def test_with_name_and_with_namelist_cannot_both_be_true(self): + with self.assertRaisesRegex(ValueError, "(?i)(?=.*both)(?=.*True)"): + @params_map(with_name=True, with_namelist=True) + def foo(): + pass + + @params( + on_call = C(lambda *_: 1/0), + on_bad_value = C(lambda *a: [('x', 0/a[-1])], on_0=True), + in_generator = C( + lambda *a: [(str(x), 0/x) for x in (a[-1]+1, a[-1])], + on_0=True, + ), + too_many_values = C(lambda _: [('', 2, 3)]), + too_few_values = C(lambda _: [('',)]), + non_iterable = C(lambda _: [1]), + non_string_name = C(lambda _: [(1, 1)]), + non_identifier = C(lambda _: [('.', 1)]), + ) + def test_errors_in_wrapped_function(self, func, on_0=False): + test_params_map = params_map(func) + expected = 'zero=C(0)' if on_0 else 'one=C(1)' + with self.assertRaisesRegex(ValueError, re.escape(expected)): + test_params_map(one=1, zero=0) + + @params( + no_argument = C(C(), TypeError, 'missing'), + extra_args = C(C(lambda: 1, 'bar'), TypeError, '2 were given'), + non_func_arg = C(C(1), TypeError, 'int'), + bad_keyword = C(C(bad_key=True), TypeError, 'bad_key'), + ) + def test_bad_arguments(self, callspec, ex, msg): + with self.assertRaisesRegex(ex, msg): + # For the keyword case we get back a 
wrapper and need to call + # it to see the error. Otherwise the error happens before the call. + callspec(params_map)('foo') + + def test_debug(self): + @params_map(debug=True) + def test_map(anarg, k=None): + yield 'arg', C(anarg) + yield 'kw', C(k=k) + with captured_stdout() as stdout: + test_map(x=C('a', k='b'), y=C('d', k='e')) + self.assertEqual( + stdout.getvalue(), + # Making this an exact match means any change to the debug + # output requires a change here. On the other hand, that also + # means that temporary changes to the debug output during bug + # fixing in params_map itself will be caught by this test so + # they don't sneak in to production code unintentionally. + dedent("""\ + flattening using test_map + an='x' av=C('a', k='b') + n='arg' v=C('a') name='x__arg' + n='kw' v=C(k='b') name='x__kw' + an='y' av=C('d', k='e') + n='arg' v=C('d') name='y__arg' + n='kw' v=C(k='e') name='y__kw' + """) + ) + + # The exact causes here are not part of the fixed expectations (just + # that there *is* a cause), but if they change it is worth noticing. 
+ + utility_maps = params_map(lambda f, **k: only(f.__name__, C(f, **k)))( + C( + with_names, + as_map=with_names, + ), + C( + as_value, + as_map=as_value, + unnamed_cause_msg=r'(?i)(?=.*invalid)(?=.*1)' + ), + C( + add_label, + as_map=add_label('xxx'), + unnamed_cause_msg=r'x=C\(1\)', + badarg_cause_ex=ValueError, + badarg_cause_msg=r'(?i)(?=.*invalid)(?=.*1)' + ), + C( + include_if, + as_map=include_if(lambda *_: True), + badarg_cause_ex=TypeError, + badarg_cause_msg=r"'int' object is not callable", + ), + C( + include_unless, + as_map=include_unless(lambda *_: False), + badarg_cause_ex=TypeError, + badarg_cause_msg=r"'int' object is not callable", + ), + C( + for_each_name, + as_map=for_each_name('aname'), + unnamed_cause_msg=r"aname=(?=.*exists)(?=.*C\('aname', 1\))", + badarg_cause_ex=ValueError, + badarg_cause_msg=r"(?i)(?=.*invalid label)(?=.*1)", + ), + C( + for_each_function, + as_map=for_each_function(int), + unnamed_cause_msg=r'int=C\(.*int.*1\)', + badarg_cause_ex=AttributeError, + badarg_cause_msg=r"'int'.*__name__", + ), + ) + + @params + def test_utility_maps(self, utility, callspec, expected): + self.assertEqual(callspec(utility), expected) + + params_test_utility_maps = Params( + + params_map( + lambda *a, as_map, **k: only('no_args', C(as_map, C(), dict())) + )(utility_maps), + + with_names = C( + with_names, + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C('a', 1), z=C('z', 'a'), foo=C('foo', ['bar'])), + ), + + as_value = C( + as_value, + C('a', 'foo', 'bar'), + dict(a=C('a'), foo=C('foo'), bar=C('bar')), + ), + + add_label = C( + add_label('xxx'), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a__xxx=C(1), z__xxx=C('a'), foo__xxx=C(['bar'])), + ), + + include_if__include_all = C( + include_if(lambda *_: True), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1), z=C('a'), foo=C(['bar'])), + ), + + include_if__include_none = C( + include_if(lambda *_: False), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(), + ), + + 
include_unless__omit_all = C( + include_unless(lambda *_: True), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(), + ), + + include_unless__omit_none = C( + include_unless(lambda *_: False), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1), z=C('a'), foo=C(['bar'])), + ), + + include_if__include_one_letters = C( + include_if(lambda n, v: n.has_any('a', 'z')), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1), z=C('a')), + ), + + include_unless__omit_one_letters = C( + include_unless(lambda n, v: n.has_any('a', 'z')), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(foo=C(['bar'])), + ), + + include_if__include_int_values = C( + include_if(lambda n, v: type(v) == int), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1)), + ), + + include_unless__omit_int_values = C( + include_unless(lambda n, v: type(v) == int), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(z=C('a'), foo=C(['bar'])), + ), + + include_if__include_int_values_with_label = C( + include_if(lambda n, v: type(v) == int, label='int'), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a__int=C(1)), + ), + + include_unless__omit_int_values_with_label = C( + include_unless(lambda n, v: type(v) == int, label='non_int'), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(z__non_int=C('a'), foo__non_int=C(['bar'])), + ), + + for_each_name = C( + for_each_name('some', 'names'), + C(42, a=1, b=C(2, z=7)), + dict( + some=C('some', 42), + names=C('names', 42), + a__some=C('some', 1), + a__names=C('names', 1), + b__some=C('some', 2, z=7), + b__names=C('names', 2, z=7), + ), + ), + + for_each_function = C( + for_each_function(as_value, add_label), + C(42, a=1, b=C(2, z=7)), + dict( + as_value=C(as_value, 42), + add_label=C(add_label, 42), + a__as_value=C(as_value, 1), + a__add_label=C(add_label, 1), + b__as_value=C(as_value, 2, z=7), + b__add_label=C(add_label, 2, z=7), + ), + ), + + ) + + @params + def test_utility_map_failures( + self, + utility, + callspec, + ex, + 
msg, + cause_ex=None, + cause_msg=None, + ): + with self.assertRaisesRegexEx(ex, msg, cause_ex, cause_msg): + # Some errors only show up when a generated utility is called. + callspec(utility)(t1=1, t2=2) + + params_test_utility_map_failures = Params( + + params_map( + lambda f, as_map=None, unnamed_cause_msg=r'missing.*label', **k: + only('unnamed_input', + C( + as_map, + C(1, 1), + ValueError, fr'(?i)(?=.*{as_map.__name__})(?=.*1)', + ValueError, unnamed_cause_msg, + ), + ) + )(utility_maps), + + params_map( + lambda f, *, badarg_cs=C(1), badarg_cause_ex, badarg_cause_msg, **k: + only( + 'bad_map_maker_arg', + C( + f, + badarg_cs, + ValueError, fr'(?i)(?=.*{f.__name__})(?=.*t1=C\(1\))', + badarg_cause_ex, badarg_cause_msg, + ) + ) + )( + include_if(lambda n, *a, **k: 'badarg_cause_ex' in k)(utility_maps), + ), + + ) + + +class TestNameList(ParamsMixin, unittest.TestCase): + + names_to_list_map = dict( + XonenameX = ['XonenameX'], + Xtwo__namesX = ['Xtwo', 'namesX'], + Xmany__many__names__hereX = ['Xmany', 'many', 'names', 'hereX'], + Xnames_with__underscores_tooX = ['Xnames_with', 'underscores_tooX'], + Xtoo_many___underscores____are_confusingX = [ + 'Xtoo_many', '_underscores', '', 'are_confusingX', + ] + ) + + name_nl_and_list = params_map(with_name=True)( + lambda n, v: only(C(n, NameList(n), v)) + )(**names_to_list_map) + + @params(name_nl_and_list) + def test_str_equals_name(self, name, nl, aslist): + self.assertEqual(name, str(nl)) + + @params(name_nl_and_list) + def test_str_supports_startswith(self, name, nl, aslist): + self.assertTrue(str(nl).startswith(name[:2])) + self.assertFalse(str(nl).startswith('notthestart')) + + @params(name_nl_and_list) + def test_str_supports_endswith(self, name, nl, aslist): + self.assertTrue(str(nl).endswith(name[-2:])) + self.assertFalse(str(nl).endswith('nottheend')) + + @params(name_nl_and_list) + def test_str_supports_in(self, name, nl, aslist): + self.assertTrue(name[4:6] in str(nl)) + self.assertFalse('notthemiddle' in 
str(nl)) + + def test_empty_string_produces_empty_list(self): + self.assertEqual(list(NameList('')), []) + + nl_and_list = params_map(with_name=True)( + lambda n, v: only(C(NameList(n), v)) + )(**names_to_list_map) + + @params(nl_and_list) + def test_list(self, nl, aslist): + self.assertIsInstance(nl, list) + self.assertListEqual(nl, aslist) + + @params(nl_and_list) + def test_indexing(self, nl, aslist): + for i in range(len(aslist)): + self.assertEqual(nl[i], aslist[i]) + + @params(nl_and_list) + def test_contains(self, nl, aslist): + for name in aslist: + self.assertTrue(name in nl) + + # Running all of these for all the examples appears to be a bit of + # overkill, but not only does it exercise the machinery and provide a + # non-trivial example, it found a bug that failed for only one of the + # example names. + + @params_map + def has_all_tests(nl, l): + yield 'one_name', C( nl, C(l[0]), True) + yield 'all_names', C( nl, C(*l), True) + yield 'name_notname', C( nl, C(l[0], 'notname'), False) + yield 'no_name', C( nl, C(''), False) + + @params(has_all_tests(nl_and_list)) + def test_has_all(self, nl, callspec, expected_value): + self.assertEqual(callspec(nl.has_all), expected_value) + + @params_map + def has_any_tests(nl, l): + yield 'one_name', C( nl, C(l[0]), True) + yield 'one_name_tuple', C( nl, C((l[0],)), True) + yield 'one_name_list', C( nl, C([l[0]]), True) + yield 'all_names', C( nl, C(*l), True) + yield 'all_names_tuple', C( nl, C(tuple(l)), True) + yield 'all_names_list', C( nl, C(l), True) + yield 'all_names_dict', C( nl, C({n: n for n in l}), True) + yield 'name_notname', C( nl, C(l[0], 'notname'), True) + yield 'no_names', C( nl, C(), False) + yield 'no_names_list', C( nl, C([]), False) + yield 'one_notname', C( nl, C('notname'), False) + yield 'two_notnames', C( nl, C('notname', 'alsonot'), False) + yield 'null_str_arg', C( nl, C(''), False) + yield 'null_str_tuple', C( nl, C(('',)), False) + yield 'null_str_list', C( nl, C(['',]), False) + + 
@params(has_any_tests(nl_and_list)) + def test_has_any(self, nl, callspec, expected_value): + self.assertEqual(callspec(nl.has_any), expected_value) + + def test_has_any_false_if_empty_name(self): + nl = NameList('') + self.assertFalse(nl.has_any('')) + + def test_has_any_false_partial_names(self): + nl = NameList('foo__bar__bird') + self.assertTrue(nl.has_any('foo', 'bar', 'bird')) + self.assertFalse(nl.has_any('fo', '_bar', 'ird', 'ir', 'or', '')) + + +if __name__ == '__main__': + unittest.main() From 5797d0d90d70df46a5d67d145dab23f949f9bfcc Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 31 May 2026 13:16:53 -0400 Subject: [PATCH 002/152] Make params the default for email test cases. --- Lib/test/test_email/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index 5d708e6e97efe7b..c7891a0d7d6dcb3 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -6,6 +6,7 @@ from email._policybase import compat32 from test.support import load_package_tests from test.test_email import __file__ as landmark +from test.test_email.params import ParamsMixin # Load all tests in package def load_tests(*args): @@ -20,7 +21,7 @@ def openfile(filename, *args, **kws): # Base test class -class TestEmailBase(unittest.TestCase): +class TestEmailBase(ParamsMixin, unittest.TestCase): maxDiff = None # Currently the default policy is compat32. By setting that as the default @@ -72,6 +73,8 @@ def assertDefectsEqual(self, actual, expected): 'item {}'.format(i)) +# XXX Don't use this for new tests, use params instead. @parameterized will be +# deprecated and removed eventually. def parameterize(cls): """A test method parameterization class decorator. From a5d8705f4a72dbb9ed05ec2b56940010c8afede2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 28 Feb 2026 15:28:01 -0500 Subject: [PATCH 003/152] DROPME: temporary unittest usability hack. 
--- Lib/test/test_email/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index c7891a0d7d6dcb3..3e88a80c3fdbbf5 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -23,6 +23,11 @@ def openfile(filename, *args, **kws): # Base test class class TestEmailBase(ParamsMixin, unittest.TestCase): + # XXX XXX temporary usability hack, edit this out before publishing PR. + def __str__(self): + from unittest.util import strclass + return "%s.%s" % (strclass(self.__class__), self._testMethodName) + maxDiff = None # Currently the default policy is compat32. By setting that as the default # here we make minimal changes in the test_email tests compared to their From 71c9ca4aa9f173a2023674a6ef5d3b8c4e48e426 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 26 Dec 2025 02:36:41 -0500 Subject: [PATCH 004/152] Enhance defects assertion. This new version gives a much more useful error message, and also supports specifying regexes to match the expected defects. It also no longer depends on the order of the defects, making it less fragile. In addition it supports using a function to produce the defect and regex, making it easier to verify defects that take arguments instead of a message string. 
--- Lib/test/test_email/__init__.py | 81 +++++- .../test_email/test_testing_infrastructure.py | 266 ++++++++++++++++++ 2 files changed, 340 insertions(+), 7 deletions(-) create mode 100644 Lib/test/test_email/test_testing_infrastructure.py diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index 3e88a80c3fdbbf5..876fa7acbfa14f0 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -1,7 +1,8 @@ -import os -import unittest import collections import email +import os +import re +import unittest from email.message import Message from email._policybase import compat32 from test.support import load_package_tests @@ -71,11 +72,77 @@ def assertBytesEqual(self, first, second, msg): """Our byte strings are really encoded strings; improve diff output""" self.assertEqual(self._bytes_repr(first), self._bytes_repr(second)) - def assertDefectsEqual(self, actual, expected): - self.assertEqual(len(actual), len(expected), actual) - for i in range(len(actual)): - self.assertIsInstance(actual[i], expected[i], - 'item {}'.format(i)) + def assertDefectsMatch(self, actual, expected): + """Assert list of defects matches a list of expected defect patterns + + actual should be a list of actual defect instances. expected should + be a list of patterns. Match the patterns against the actual list, + and report any defects that do not match a pattern or any patterns + that do not match a defect. Matching must be one to one: if there + are two identical defects in the actual list, it should be an error + if there are not two patterns that match those defects in the + expected list. + + A pattern can be one of three things: + 1) a defect class (eg: InvalidHeaderDefect) + 2) a tuple of (defect_class, regex), where the regex must + match the message produced by calling str on the actual defect + 3) a tuple of (callable, *args) where calling the callable + with the args must produce a pattern as in (1) or (2). 
+ + """ + aleft = list(actual) + eleft = [] + for x in expected: + p = None + while not p: + if type(x) is type: + p = (x, '.*') + elif not hasattr(x, '__getitem__'): + raise ValueError(f'invalid defect pattern: {x!r}') + elif type(x[0]) is type: + p = x + elif callable(x[0]): + x = x[0](*x[1:]) + else: + raise ValueError(f'invalid defect pattern: {x!r}') + eleft.append(p) + for t, s in list(eleft): + for a in aleft: + if type(a) == t and re.search(s, str(a), flags=re.I): + eleft.remove((t, s)) + aleft.remove(a) + break + if eleft or aleft: + areprs = [repr((type(a), str(a))) for a in aleft] + ereprs = [repr(e) for e in eleft] + matched = f"{len(actual) - len(aleft)} defects matched" + if len(eleft) == len(aleft): + raise self.failureException( + f"{matched}, {len(aleft)} defects did not match:" + f"\n unmatched expected:\n {'\n '.join(ereprs)}" + f"\n unmatched actual:\n {'\n '.join(areprs)}" + ) + if len(eleft) == 0: + raise self.failureException( + f"{matched}, {len(aleft)} extra defects:" + f"\n {'\n '.join(areprs)}" + ) + if len(aleft) == 0: + raise self.failureException( + f"{matched}, {len(eleft)} missing defects:" + f"\n {'\n '.join(ereprs)}" + ) + else: + raise self.failureException( + f"Expected {len(expected)} defects but got {len(actual)};" + f" {matched}, {len(eleft)} missing, {len(aleft)} extra:" + f"\n unmatched actual:\n {'\n '.join(areprs)}" + f"\n unmatched expected:\n {'\n '.join(ereprs)}" + ) + + # XXX assertDefectsEqual can go away when it is no longer used. + assertDefectsEqual = assertDefectsMatch # XXX Don't use this for new tests, use params instead. 
@parameterized will be diff --git a/Lib/test/test_email/test_testing_infrastructure.py b/Lib/test/test_email/test_testing_infrastructure.py new file mode 100644 index 000000000000000..26e60244898a362 --- /dev/null +++ b/Lib/test/test_email/test_testing_infrastructure.py @@ -0,0 +1,266 @@ +from email import errors +from test.test_email import TestEmailBase +from test.test_email.params import C, params_map, params + +class TestAssertDefectsMatch(TestEmailBase): + + # The code should behave the same whether the pattern comes direct or + # out of a callable. + @params_map + def direct_and_callable(actual, expected, *args): + yield 'direct', C(actual, expected, *args) + expected = [(lambda x: x, x) for x in expected] + yield 'callable', C(actual, expected, *args) + + @params + def test_success(self, actual, expected): + self.assertDefectsMatch(actual, expected) + + np_checker = lambda s: (errors.NonPrintableDefect, f'.*non-printable.*{s}') + + params_test_success = direct_and_callable( + + no_defects = C([], []), + + one_defect_by_class = C( + [errors.InvalidHeaderDefect('foo')], + [errors.InvalidHeaderDefect], + ), + + one_defect_by_regex = C( + [errors.InvalidHeaderDefect('This is a message')], + [(errors.InvalidHeaderDefect, '.*is a')], + ), + + multiple_defects_by_class = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.InvalidHeaderDefect('This is a different message'), + errors.InvalidHeaderDefect('This is the same message'), + errors.InvalidHeaderDefect('This is the same message'), + ], + [*[errors.InvalidHeaderDefect] * 4], + ), + + multiple_defects_by_regex = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.InvalidHeaderDefect('This is a different message'), + errors.InvalidHeaderDefect('This is the same message'), + errors.InvalidHeaderDefect('This is the same message'), + ], + [ + (errors.InvalidHeaderDefect, '.*the same'), + (errors.InvalidHeaderDefect, '.*is a'), + (errors.InvalidHeaderDefect, '.*different'), + 
(errors.InvalidHeaderDefect, '.*the same'), + ], + ), + + multiple_different_defects_by_class = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.ObsoleteHeaderDefect('This is a different message'), + errors.NonPrintableDefect('abc'), + ], + [ + errors.InvalidHeaderDefect, + errors.ObsoleteHeaderDefect, + errors.NonPrintableDefect, + ], + ), + + multiple_different_defects_by_regex = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.ObsoleteHeaderDefect('This is a different message'), + errors.NonPrintableDefect('abc'), + ], + [ + (errors.ObsoleteHeaderDefect, '.*different'), + (errors.NonPrintableDefect, '.*non-printable.*abc'), + (errors.InvalidHeaderDefect, '.*is a'), + ], + ), + + ) + + @params + def test_failure(self, actual, expected, msg): + with self.assertRaisesRegex(AssertionError, msg): + self.assertDefectsMatch(actual, expected) + + params_test_failure = direct_and_callable( + + one_extra_defect_expecting_none = C( + [errors.InvalidHeaderDefect('foo')], + [], + r'(?i)0.*matched.*1.*extra', + ), + + two_extra_defects_expecting_none = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + ], + [], + r'(?i)0.*matched.*2.*extra', + ), + + two_extra_defects_expecting_one = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + ], + [(errors.InvalidHeaderDefect, 'bar')], + r'(?i)1.*matched.*1.*extra', + ), + + three_extra_defects_expecting_one = C( + [ + errors.InvalidHeaderDefect('foo'), + *[errors.InvalidHeaderDefect('bar')]*3, + ], + [(errors.InvalidHeaderDefect, 'bar')], + r'(?is)1.*matched.*3.*extra(?=.*foo)(?=.*bar.*bar)', + ), + + one_missing_defect_expecting_one = C( + [], + [(errors.InvalidHeaderDefect, 'bar')], + r'(?is)0.*matched.*1.*missing.*bar', + ), + + two_missing_defects_expecting_two = C( + [ + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + ], + [ + (errors.InvalidHeaderDefect, 'foo'), + (errors.InvalidHeaderDefect, 'bird'), + 
], + r'(?is)0.*matched.*2.*did not match' + r'(?=.*foo)(?=.*bird)(?=.*bar)(?=.*bing)', + ), + + two_missing_defects_expecting_four = C( + [ + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + ], + [ + (errors.InvalidHeaderDefect, 'bar'), + (errors.InvalidHeaderDefect, 'foo'), + (errors.InvalidHeaderDefect, 'bing'), + (errors.InvalidHeaderDefect, 'bird'), + ], + r'(?is)2.*matched.*2.*missing(?=.*foo)(?=.*bird)', + ), + + two_extra_defects_expecting_two = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + errors.InvalidHeaderDefect('bar'), + ], + [ + (errors.InvalidHeaderDefect, 'bar'), + (errors.InvalidHeaderDefect, 'bing'), + ], + r'(?is)2.*matched.*2.*extra(?=.*foo)(?=.*bar)', + ), + + two_extra_defects_one_missing_expecting_three = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + errors.InvalidHeaderDefect('bar'), + ], + [ + (errors.InvalidHeaderDefect, 'bar'), + (errors.InvalidHeaderDefect, 'bing'), + (errors.InvalidHeaderDefect, 'bing'), + ], + r'(?is)2.*matched(?=.*2.*extra)(?=.*1.*missing)' + r'(?=.*foo)(?=.*bar)(?=.*bing)', + ), + + actual_is_string = C( + ['foo'], + [], + r'(?is)0.*matched.*1.*extra(?=.*str.*foo)', + ), + + actual_is_tuple = C( + [(errors.InvalidHeaderDefect, 'foo', 'bar')], + [], + r'(?is)0.*matched.*1.*extra(?=.*tuple.*InvalidHeaderDefect.*foo)', + ), + + ) + + @params + def test_bad_expected_patterns(self, actual, expected, msg): + with self.assertRaisesRegex((ValueError, TypeError), msg): + self.assertDefectsMatch(actual, expected) + + params_test_bad_expected_patterns = direct_and_callable( + + not_subscriptable = C( + [], + [1], + r'(?i)(?=.*invalid).*1', + ), + + string = C( + [], + ['foo'], + r'(?i)(?=.*invalid).*foo', + ), + + triple = C( + [], + [(errors.InvalidHeaderDefect, 'foo', 'bar')], + r'(?i)too many values', + ), + + singleton = C( + [], + 
[(errors.InvalidHeaderDefect,)], + '(?i)not enough values', + ), + + # This only happens if a comparison is made. Which will happen. + regex_is_not_string = C( + [errors.InvalidHeaderDefect('foo')], + [(errors.InvalidHeaderDefect, 200)], + r'(?i)must be string', + ), + + backwards_expected_entry = C( + [], + [('foo', errors.InvalidHeaderDefect)], + r'(?i)(?=.*invalid).*foo.*InvalidHeaderDefect', + ), + + ) + + @params( + multiple_args = C( + [(lambda x, y, z: (errors.InvalidHeaderDefect, z), 'x', 1, 'foo')], + ), + no_args = C([(lambda: errors.InvalidHeaderDefect,)]), + ) + def test_callable_success(self, expected): + self.assertDefectsMatch([errors.InvalidHeaderDefect('foo')], expected) + + @params( + no_args_bad_result = C([(lambda: 'bad value',)], r'(?i)bad value'), + wrong_number_of_args = C([(lambda: 'x', 1)], r'(?i)arguments'), + ) + def test_callable_failure(self, expected, msg): + with self.assertRaisesRegex((ValueError, TypeError), msg): + self.assertDefectsMatch([], expected) From 2096594def314ecf6ee9d7b97c346175834d5b42 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Tue, 19 May 2026 16:21:04 -0400 Subject: [PATCH 005/152] Add defect expectation constants and functions. These will be testing for exact matches of the messages, but since there's only one place in the tests that need to be changed if the message changes, this seems reasonable. There could in theory be a defect subclass hierarchy here, but we'll leave that idea for some future refactor. 
--- .../test_email/test__header_value_parser.py | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 9d9fe418ee4d067..d8d2666ec3683c1 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1,3 +1,4 @@ +import re import string import unittest from email import _header_value_parser as parser @@ -5,6 +6,131 @@ from email import policy from test.test_email import TestEmailBase, parameterize + +# ---> Defect Expectations + +undecodable_bytes_defect = ( + errors.UndecodableBytesDefect, + 'Non-ASCII characters found in header token', + ) + +def undecodable_bytes_in_ew_defect(chars): + return ( + errors.UndecodableBytesDefect, + f"Encoded word contains bytes not decodable using '{chars}' charset", + ) + +def nonprintable_defect(chars): + return ( + errors.NonPrintableDefect, + 'the following ASCII non-printables found in header:' + f' {re.escape(repr(list(chars)))}', + ) + +whitespace_inside_ew_defect = ( + errors.InvalidHeaderDefect, + 'whitespace inside encoded word', + ) + +missing_whitespace_before_ew_defect = ( + errors.InvalidHeaderDefect, + 'missing whitespace before encoded word', + ) + +missing_whitespace_after_ew_defect = ( + errors.InvalidHeaderDefect, + 'missing trailing whitespace after encoded-word', + ) + +def charset_defect(chars): + return ( + errors.CharsetError, + f"Unknown charset '{chars}' in encoded word; decoded as unknown bytes", + ) + +invalid_base64_padding_defect = ( + errors.InvalidBase64PaddingDefect, + '', + ) + +invalid_base64_characters_defect = ( + errors.InvalidBase64CharactersDefect, + '', + ) + +invalid_base64_length_defect = ( + errors.InvalidBase64LengthDefect, + '', + ) + +end_inside_quoted_string_defect = ( + errors.InvalidHeaderDefect, + 'end of header inside quoted string', + ) + +ew_inside_quoted_string_defect = ( + errors.InvalidHeaderDefect, 
+ 'encoded word inside quoted string', + ) + +end_inside_comment_defect = ( + errors.InvalidHeaderDefect, + 'end of header inside comment', + ) + +period_in_phrase_obs_defect = ( + errors.ObsoleteHeaderDefect, + "period in 'phrase'", + ) + +comment_without_atom_in_phrase_obs_defect = ( + errors.ObsoleteHeaderDefect, + 'comment found without atom', + ) + +non_word_phrase_start_defect = ( + errors.InvalidHeaderDefect, + "phrase does not start with word", + ) + +non_dot_atom_local_part_obs_defect = ( + errors.ObsoleteHeaderDefect, + r'local-part is not a dot-atom \(contains CFWS\)', + ) + +not_even_obs_local_part_defect = ( + errors.InvalidHeaderDefect, + 'local-part is not dot-atom, quoted-string, or obs-local-part', + ) + +missing_dot_in_local_part_defect = ( + errors.InvalidHeaderDefect, + "missing '.' between words", + ) + +trailing_dot_in_local_part_defect = ( + errors.InvalidHeaderDefect, + "invalid trailing '.' in local part", + ) + +leading_dot_in_local_part_defect = ( + errors.InvalidHeaderDefect, + "invalid leading '.' in local part", + ) + +repeated_dot_in_local_part_defect = ( + errors.InvalidHeaderDefect, + "invalid repeated '.'", + ) + +misplaced_backslash_defect = ( + errors.InvalidHeaderDefect, + r"'\\' character outside of quoted-string/ccontent", + ) + +# ---> End Defect Expectations + + class TestTokens(TestEmailBase): # EWWhiteSpaceTerminal From c358facf658c061446207518f287cdd9f1808b28 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 21 Jan 2026 18:34:00 -0500 Subject: [PATCH 006/152] Fix defect copying when extending a token list. BUGFIX: In certain cases correctly detected defects were being dropped when combining syntactic units. These defects are now correctly copied to the higher level syntactic units, which will result in additional defects being reported in certain cases (mostly mailbox and MIME parameter lists). 
I found this bug fairly far along in the refactoring process, but it makes more sense to fix it early, since the test bug it triggered wasn't related to the code I was working on at the time. --- Lib/email/_header_value_parser.py | 5 +++ .../test_email/test__header_value_parser.py | 34 ++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 792072ab9f6128a..7fb0dfd66f37082 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -146,6 +146,11 @@ def __repr__(self): return '{}({})'.format(self.__class__.__name__, super().__repr__()) + def extend(self, value): + super().extend(value) + if hasattr(value, 'defects'): + self.defects.extend(value.defects) + @property def value(self): return ''.join(x.value for x in self if x.value) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index d8d2666ec3683c1..ac8ac92c0f7102c 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -5,6 +5,7 @@ from email import errors from email import policy from test.test_email import TestEmailBase, parameterize +from test.test_email.params import C, params # ---> Defect Expectations @@ -131,6 +132,37 @@ def charset_defect(chars): # ---> End Defect Expectations +class TestTokenList(TestEmailBase): + + @params( + none_none = C([], []), + one_none = C([errors.InvalidHeaderDefect('a')], []), + none_one = C([], [errors.InvalidHeaderDefect('b')]), + one_one = C( + [errors.InvalidHeaderDefect('a')], + [errors.InvalidHeaderDefect('b')], + ), + two_two = C( + [errors.InvalidHeaderDefect('a'), errors.NonPrintableDefect('y')], + [errors.NonPrintableDefect('b'), errors.InvalidHeaderDefect('z')], + ), + ) + def test_extend_copies_defects(self, existing, new): + tl1 = parser.TokenList() + tl1.defects.extend(existing) + tl2 = 
parser.TokenList(['fake', 'values']) + tl2.defects.extend(new) + tl1.extend(tl2) + self.assertEqual(tl1.defects, existing + new) + + def test_extend_with_non_token_list_leaves_defects_unchanged(self): + tl = parser.TokenList() + defects = [errors.InvalidHeaderDefect('a')] + tl.defects.extend(defects) + tl.extend(['fake', 'values']) + self.assertEqual(tl.defects, defects) + + class TestTokens(TestEmailBase): # EWWhiteSpaceTerminal @@ -2395,7 +2427,7 @@ def test_get_group_list_obs_group_list(self): ', (foo),,(bar)', ', (foo),,(bar)', ', ,, ', - [errors.ObsoleteHeaderDefect], + [errors.ObsoleteHeaderDefect] * 5, '') self.assertEqual(group_list.token_type, 'group-list') self.assertEqual(len(group_list.mailboxes), 0) From f847561e030d049681cb8d25aa14e024f5966501 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 14 Mar 2026 15:19:06 -0400 Subject: [PATCH 007/152] Add for_each_character params filter. This allows us to easily test that, for example, *every* non-printable character is treated as an error, not just the one used by a non-parameterized test. 
--- Lib/test/test_email/__init__.py | 49 +++++++- .../test_email/test_testing_infrastructure.py | 115 +++++++++++++++++- 2 files changed, 161 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index 876fa7acbfa14f0..2195a13fc14e621 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -2,12 +2,14 @@ import email import os import re +import unicodedata import unittest +from curses.ascii import controlnames from email.message import Message from email._policybase import compat32 from test.support import load_package_tests from test.test_email import __file__ as landmark -from test.test_email.params import ParamsMixin +from test.test_email.params import C, params_map, ParamsMixin # Load all tests in package def load_tests(*args): @@ -20,6 +22,51 @@ def openfile(filename, *args, **kws): path = os.path.join(os.path.dirname(landmark), 'data', filename) return open(path, *args, **kws) +def charname(c): + try: + n = unicodedata.name(c).lower().replace(' ', '_').replace('-', '_') + except ValueError: + try: + n = controlnames[ord(c)] + except IndexError: + assert c == '\x7F' + return 'DEL' + return n + +def for_each_character(chars, skip=''): + """Create a filter that expands each input into a test per character. + + chars should be an iterable of characters (eg a string), as should skip. + + For each character in chars that is not in skip, the filter should process + all arguments and keywords, creating a new call spec. For any objects and + (recursively} sub-objects found that have a 'format' attribute, replace the + object in the new call spec with the results of calling the object's format + method, passing the method three keyword arguments: 'char', set to the + character, 'echar', set to the character passed through re.escape, and + 'erchar', set to the repr of the character (without the quotes) passed + through re.escape. 
+ + Process any dictionary object's values, but not its keys. Assume that any + other object that is an iterator can be recreated by passing its type a + list of objects. + + Return the character name as derived from unicodedata or the curses ascii + module as as the name string to be added to the test name. + + """ + chars = {charname(v): v for v in chars if v not in skip} + @params_map + def for_each_character_in(*args, **kw): + for name, c in chars.items(): + subs = dict( + char=c, + echar=re.escape(c), + erchar=re.escape(repr(c)[1:-1]), + ) + yield name, C(*args, **kw).fmtall(**subs) + return for_each_character_in + # Base test class class TestEmailBase(ParamsMixin, unittest.TestCase): diff --git a/Lib/test/test_email/test_testing_infrastructure.py b/Lib/test/test_email/test_testing_infrastructure.py index 26e60244898a362..e3b6214998794b1 100644 --- a/Lib/test/test_email/test_testing_infrastructure.py +++ b/Lib/test/test_email/test_testing_infrastructure.py @@ -1,6 +1,6 @@ from email import errors -from test.test_email import TestEmailBase -from test.test_email.params import C, params_map, params +from test.test_email import TestEmailBase, for_each_character +from test.test_email.params import C, params_map, params, Params class TestAssertDefectsMatch(TestEmailBase): @@ -264,3 +264,114 @@ def test_callable_success(self, expected): def test_callable_failure(self, expected, msg): with self.assertRaisesRegex((ValueError, TypeError), msg): self.assertDefectsMatch([], expected) + + +class TestForEachCharacter(TestEmailBase): + + @params + def test_for_each_character(self, callspec, chars, expected): + callspecs = Params(test=callspec) + expected = Params(**{f'test__{c}': v for c, v in expected.items()}) + self.assertEqual(for_each_character(chars)(callspecs), expected) + + @params_map + def for_each_value_type(callspec, chars, expected): + yield '', C(callspec, chars, expected) + yield 'in_list', C( + C(foo=['bar', callspec.args[0], 'z']), + chars=chars, + 
expected={ + n: C(foo=['bar', v.args[0], 'z']) for n, v in expected.items()}, + ) + yield 'in_tuple', C( + C(foo=('bar', callspec.args[0], 'z')), + chars=chars, + expected={ + n: C(foo=('bar', v.args[0], 'z')) for n, v in expected.items()}, + ) + yield 'in_dict', C( + C(foo=dict(a=callspec.args[0], z=1)), + chars=chars, + expected={ + n: C(foo=dict(a=v.args[0], z=1)) for n, v in expected.items()}, + ) + + params_test_for_each_character = for_each_value_type( + + no_subs = C( + C('no subs'), + chars='./', + expected=dict(full_stop=C('no subs'), solidus=C('no subs')), + ), + + one_sub = C( + C('one{char}sub'), + chars='./', + expected=dict(full_stop=C('one.sub'), solidus=C('one/sub')), + ), + + all_three_sub_types = C( + C('plain {char} escaped {echar} escaped repr {erchar}.'), + chars='\t', + expected=dict(HT=C('plain \t escaped \\\t escaped repr \\\\t.')), + ), + + a_list = C( + C(['a', '{char}', '{echar}']), + chars='/', + expected=dict(solidus=C(['a', '/', '/'])), + ), + + a_tuple = C( + C(('{char}{echar}', '{erchar}')), + chars='.', + expected=dict(full_stop=C((r'.\.', r'\.'))), + ), + + a_dict = C( + C(dict(a='{char}', b='{erchar}')), + chars='a', + expected=dict(latin_small_letter_a=C(dict(a='a', b='a'))), + ), + + ) + + def test_for_each_character_complex_input(self): + callspecs = Params( + two_positional=C('pos {char} 1', 'pos {echar} 2'), + all_value_types=C( + '{char}', + d=dict(a='a {char}', r='{erchar}'), + l=['a', '{char}', 'b'], + t=('{char}', ('{char}', 1)), + ), + dummy=C('no subs'), + ) + chars = 'X\t.' + expected = Params( + two_positional__latin_capital_letter_x=C('pos X 1', 'pos X 2'), + two_positional__full_stop=C('pos . 1', r'pos \. 
2'), + two_positional__HT=C('pos \t 1', 'pos \\\t 2'), + all_value_types__latin_capital_letter_x=C( + 'X', + d=dict(a='a X', r='X'), + l=['a', 'X', 'b'], + t=('X', ('X', 1)), + ), + all_value_types__HT=C( + '\t', + d=dict(a='a \t', r='\\\\t'), + l=['a', '\t', 'b'], + t=('\t', ('\t', 1)), + ), + all_value_types__full_stop=C( + '.', + d=dict(a='a .', r='\\.'), + l=['a', '.', 'b'], + t=('.', ('.', 1)), + ), + dummy__latin_capital_letter_x=C('no subs'), + dummy__full_stop=C('no subs'), + dummy__HT=C('no subs'), + ) + self.assertEqual(for_each_character(chars)(callspecs), expected) From de749a3d60b6727aac76117c36b232dbe8844b27 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 8 Jan 2026 12:09:38 -0500 Subject: [PATCH 008/152] Begin converting _wsp_splitter tests to new framework. The test refactoring process is a bit laborious. I ended up splitting the changesets up into incremental steps to make sure that I wasn't omitting or changing any tests that I didn't intend to. Some changesets are fairly clean as is, others are best viewed using git's --color-words or a similar substring rather than line based diff tool, such as github's diff display. Whether it will be worthwhile for anyone to review all these commits is an open question, but having them helped me get the refactoring correct, especially when I had to go back and fix mistakes and/or bugs at various points during the refactoring. Not to mention rebasing on main as some parser bugs were fixed there. Along the way we are going to fix a non-trivial number of small bugs. The commit messages will contain paragraphs starting with "BUGFIX:" for any such changes. All such bug fixes are internal: not worthy of a NEWS item beyond the one announcing the parser rewrite. Anything that doesn't fall into that category I intend to fix in main first in separate PRs. This commit is the setup, to make the next diff easier to read. 
--- Lib/test/test_email/__init__.py | 4 ++++ Lib/test/test_email/test__header_value_parser.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index 2195a13fc14e621..67121107b5e0214 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -71,6 +71,10 @@ def for_each_character_in(*args, **kw): # Base test class class TestEmailBase(ParamsMixin, unittest.TestCase): + # XXX XXX Delete this at end of refactor. We will be putting in temporary + # empty parameter lists during the refactoring process. + paramsRequired = False + # XXX XXX temporary usability hack, edit this out before publishing PR. def __str__(self): from unittest.util import strclass diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ac8ac92c0f7102c..258714ddc80a2bb 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -5,7 +5,7 @@ from email import errors from email import policy from test.test_email import TestEmailBase, parameterize -from test.test_email.params import C, params +from test.test_email.params import C, params, Params # ---> Defect Expectations @@ -202,13 +202,20 @@ def _test_parse_x(self, method, input, string, value, defects, class TestParser(TestParserMixin, TestEmailBase): - # _wsp_splitter - rfc_printable_ascii = bytes(range(33, 127)).decode('ascii') rfc_atext_chars = (string.ascii_letters + string.digits + "!#$%&\'*+-/=?^_`{}|~") rfc_dtext_chars = rfc_printable_ascii.translate(str.maketrans('','',r'\[]')) + # _wsp_splitter + + @params + def test__wsp_splitter(self, s, res): + self.assertEqual(parser._wsp_splitter(s, 1), res) + + params_test__wsp_splitter = Params( + ) + def test__wsp_splitter_one_word(self): self.assertEqual(parser._wsp_splitter('foo', 1), ['foo']) From 664e2e30a47197060484157f42ba6cfa92e2f2d5 Mon Sep 17 
00:00:00 2001 From: "R. David Murray" Date: Sun, 28 Dec 2025 13:03:50 -0500 Subject: [PATCH 009/152] Do _wsp_splitter test conversion. --- Lib/test/test_email/test__header_value_parser.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 258714ddc80a2bb..60d90a9d5d8623b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -214,19 +214,11 @@ def test__wsp_splitter(self, s, res): self.assertEqual(parser._wsp_splitter(s, 1), res) params_test__wsp_splitter = Params( + test__wsp_splitter_one_word = C('foo', ['foo']), + test__wsp_splitter_two_words = C('foo def', ['foo', ' ', 'def']), + test__wsp_splitter_ws_runs = C('foo \t def jik', ['foo', ' \t ', 'def jik']), ) - def test__wsp_splitter_one_word(self): - self.assertEqual(parser._wsp_splitter('foo', 1), ['foo']) - - def test__wsp_splitter_two_words(self): - self.assertEqual(parser._wsp_splitter('foo def', 1), - ['foo', ' ', 'def']) - - def test__wsp_splitter_ws_runs(self): - self.assertEqual(parser._wsp_splitter('foo \t def jik', 1), - ['foo', ' \t ', 'def jik']) - # get_fws From 2bb5c390a435b0827f110a871f9fb5b378180ab2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 28 Dec 2025 13:04:57 -0500 Subject: [PATCH 010/152] Update _wsp_splitter test names. 
--- Lib/test/test_email/test__header_value_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 60d90a9d5d8623b..174e444e88a59ef 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -214,9 +214,9 @@ def test__wsp_splitter(self, s, res): self.assertEqual(parser._wsp_splitter(s, 1), res) params_test__wsp_splitter = Params( - test__wsp_splitter_one_word = C('foo', ['foo']), - test__wsp_splitter_two_words = C('foo def', ['foo', ' ', 'def']), - test__wsp_splitter_ws_runs = C('foo \t def jik', ['foo', ' \t ', 'def jik']), + one_word = C('foo', ['foo']), + two_words = C('foo def', ['foo', ' ', 'def']), + ws_runs = C('foo \t def jik', ['foo', ' \t ', 'def jik']), ) From 7f08d64cf2b818f478b3ffd4e23a9fcc2bfb75a6 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 14 Mar 2026 15:23:30 -0400 Subject: [PATCH 011/152] Add tests for _validate_xtext. The functionality of this code will eventually get absorbed into a new function during the refactor, but as the test comments say, equivalent tests will still need to run against the new implementation. And we'll probably keep the function around until the deprecation period has expired. 
--- .../test_email/test__header_value_parser.py | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 174e444e88a59ef..456b5047f3a57c6 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -4,9 +4,12 @@ from email import _header_value_parser as parser from email import errors from email import policy -from test.test_email import TestEmailBase, parameterize +from test.test_email import for_each_character, TestEmailBase, parameterize from test.test_email.params import C, params, Params +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 +RFC_NONPRINTABLES = bytes([*range(0, 33), 127]).decode('ascii') + # ---> Defect Expectations @@ -220,6 +223,55 @@ def test__wsp_splitter(self, s, res): ) + # _validate_xtext + + # As an internal method these tests are not API requirements; however, the + # behavior they check must be verified one way or another, so if the + # implementation changes there need to be equivalent tests. + + @params + def test__validate_xtext(self, s, defects=[]): + vt = parser.ValueTerminal(s, 'test') + parser._validate_xtext(vt) + self.assertDefectsMatch(vt.defects, defects) + + params_test__validate_xtext = Params( + + valid = C('foo'), + + # Although it looks a bit odd for unicode to be acceptable when we have + # a non-ascii error, the parser in fact handles unicode. + unicode = C('föö'), + + # The non-ascii error arises only if the input was supposed to be 7-bit + # ASCII but in fact had non-ascii in it, in which case those bytes end + # up as surrogates. Thus the name of the defect. 
+ surrogates = C( + 'föö'.encode().decode('ascii', 'surrogateescape'), + # "Non-ASCII characters found in header token" + defects=[undecodable_bytes_defect], + ), + + multiple_nps = C( + 'a\ttab spaces and\rcarriage return', + defects=[(nonprintable_defect, '\t \r ')], + ), + + nps_and_surrogates = C( + 'föö\t'.encode().decode('ascii', 'surrogateescape'), + defects=[undecodable_bytes_defect, nonprintable_defect('\t')], + ), + + **for_each_character(RFC_NONPRINTABLES)( + non_printable = C( + 'f{char}o', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + ) + + # get_fws def test_get_fws_only(self): From 880b2aaf082ecf6ccd221e819086e5f4775f0a33 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 26 Dec 2025 16:10:41 -0500 Subject: [PATCH 012/152] Add more testing infrastructure. This is more preparation for the refactoring steps. This is an expanded version of the old _test_get_x method that is much more general, works with the new test parameterization, has optional automatic testing of the 'start' attribute, and provides a convenience method for checking that only the expected token types are included in whatever is returned by the parsing method. This will be used to test both the new and the old API for all refactored methods. The refactoring process will be a multi-step process for each function: 1) refactor the existing tests to the new framework without changing the test arguments. This allows verification that the tests are the same if anyone cares to check. 2) simplify the test arguments using the parameterization, and in most cases add additional tests. 3) refactor the function to support the new API, and start testing both APIs. (1) is supported by having the first few keyword arguments to _test_parse be in the same order as the positional arguments in _test_get_x. In steps (1) and (2) we use the 'old_api_only' params map to call _test_parse such that we ignore any warnings. 
In step (3) we switch to 'for_each_api', which expects a deprecation warning when using the old API call style. When the refactoring is complete, old_api_only can go away. When the deprecation happens, for_each_api can go away after a global s/for_each_api/Params/. Much of this code is informed by the later refactoring process, but it is easier to present and maintain it in this changeset. --- Lib/test/temp | 0 Lib/test/test_email/__init__.py | 9 + .../test_email/test__header_value_parser.py | 173 +++++++++++++++++- 3 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 Lib/test/temp diff --git a/Lib/test/temp b/Lib/test/temp new file mode 100644 index 000000000000000..e69de29bb2d1d64 diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index 67121107b5e0214..be433c5a88617d3 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -196,6 +196,15 @@ def assertDefectsMatch(self, actual, expected): assertDefectsEqual = assertDefectsMatch +# A more stringent version of the test.support check_warnings helper. +from contextlib import contextmanager +from test.support.warnings_helper import _filterwarnings +@contextmanager +def check_all_warnings(*filters): + """Raise an error if the generated warnings to not exactly match filters.""" + return _filterwarnings(filters) + + # XXX Don't use this for new tests, use params instead. @parameterized will be # deprecated and removed eventually. 
def parameterize(cls): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 456b5047f3a57c6..4645cc1f36603fb 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1,15 +1,29 @@ import re import string import unittest +from contextlib import ExitStack from email import _header_value_parser as parser from email import errors from email import policy -from test.test_email import for_each_character, TestEmailBase, parameterize -from test.test_email.params import C, params, Params +from random import choices, randint +from test.test_email import ( + check_all_warnings, + for_each_character, + TestEmailBase, + parameterize, + ) +from test.test_email.params import ( + C, + params, + Params, + params_map, + ) # https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 RFC_NONPRINTABLES = bytes([*range(0, 33), 127]).decode('ascii') +ALL_ASCII = bytes(range(0, 128)).decode('ascii') + # ---> Defect Expectations @@ -202,6 +216,161 @@ def _test_parse_x(self, method, input, string, value, defects, self._assert_results(tl, '', string, value, defects, '', comments) return tl + def _test_parse( + self, + method, + callspec, + stringified=None, + value=None, + defects=None, + remainder='', + comments=None, + *, + exception=None, + warnings=None, + test_start=True, + no_end=False, + pprint=False, + ): + """Call method with callspec, make asserts, and return results of call. + + Expect method to be a parsing method that takes a string as its first + argument and returns a Terminal or TokenList as its return value, + possibly followed by an "unparsed remainder" index, and possibly + additional return values. + + If test_start is true (the default), modify the callspec to add a + random prefix to its first (string) argument, and add a new parameter + after it consisting of the length of the added prefix. 
If the callspec + contains a value for 'end', modify that value by adding the prefix + length. + + If exception has a value, assert that using callspec to call method + raises the exception that must be the first element of value tuple with + a string value that matches the regex that must be the second element + of the value tuple. + + Otherwise use the (possibly modified) callspec to call the method, + capturing its return value, which should either be a single Terminal or + TokenList, or a tuple whose first element is a Terminal or TokenList. + + If no_end is True, assert that the return value was not a tuple or its + second value was not an integer. + + If warnings has a value, use it as the argument value to a + check_all_warnings assert around the callspec call. + + If pprint is true, call the pprint method of returned object. + + If the return value is not a singleton and the second element of + the return value is an integer, use it, modified by the length of + the prefix if test_start s true, to assert that the unparsed + remainder matches the value of 'remainder'. + + Assert that str called on the returned object matches the value + of stringified, or the characters from start to end or the end + of the string if stringified is None. + + Assert that the value attribute of the returned object matches + value, or stringified is value is None. + + Assert that the comments attribute of the returned object matches + comments. + + Assert that the defects attribute of the returned object matches + defects. + + Return whatever the called method returned. + + """ + s, *args = callspec.args + base = s[:-len(remainder)] if remainder else s + if test_start: + # XXX I'm not at sure the overhead of this randomization is worth + # it. We do at least need to test having a prefix though... 
+ prefix_len = randint(1, 20) + prefix = ''.join(choices(ALL_ASCII, k=prefix_len)) + kw = dict(callspec.kw) + callspec = C(prefix + s, prefix_len, *args, **kw) + # XXX POSTDEP: Change this if to do only what's in the else clause. + if warnings is ...: + warningscheck = ExitStack() + else: + warnings = [(x[1], x[0]) for x in warnings] if warnings else [] + warningscheck = check_all_warnings(*warnings) + if exception: + with warningscheck: + with self.assertRaisesRegex(exception[0], exception[1]): + callspec(method) + return + stringified = base if stringified is None else stringified + value = stringified if value is None else value + comments = [] if comments is None else comments + defects = [] if defects is None else defects + with warningscheck: + result = callspec(method) + if isinstance(result, (parser.TokenList, parser.Terminal)): + other = [] + else: + result, *other = result + if pprint: + print(f'\n{result.ppstr()}') + # XXX POSTDEP: remove str from this 'if' + if other and isinstance(other[0], (int, str)): + if no_end: + self.fail( + "It looks like the function incorrectly returned an" + " end of parsing pointer" + ) + # a get_x method that returns a remainder or pointer. 
+ actual_remainder, *other = other + if isinstance(actual_remainder, int): + if test_start: + actual_remainder -= prefix_len + actual_remainder = s[actual_remainder:] + self.assertEqual(actual_remainder, remainder) + self.assertEqual(str(result), stringified) + if isinstance(result, parser.TokenList): + self.assertEqual(result.value, value) + self.assertDefectsMatch(result.all_defects, defects) + self.assertEqual(result.comments, comments) + return (result, *other) if other else result + + def verify_terminal_types(self, tl, *text_types): + """Raise error if token_type of any Terminal is not in text_types.""" + self.assertIsInstance(tl, (parser.Terminal, parser.TokenList)) + if isinstance(tl, parser.Terminal): + self.assertIn(tl.token_type, text_types, repr(tl)) + elif isinstance(tl, parser.TokenList): + for t in tl: + # Some functions return a TokenList, but there should never be + # a plain TokenList anywhere deeper. This will catch failures + # to use 'extend' when consuming returned a TokenList. + self.assertIsNotNone(t.token_type, t) + self.verify_terminal_types(t, *text_types) + +# XXX XXX temporary step-wise refactoring tool, goes away at end of refactor. +@params_map(with_namelist=True) +def old_api_only(nl, *args, **kw): + if 'newapi' in nl: + return + kw['warnings'] = ... # Ignore pre-refactoring warnings. + kw.setdefault('test_start', False) + yield '' if 'oldapi' in nl else 'oldapionly', C(*args, **kw) + +# XXX POSTDEP: Delete this params_map and replace calls to it with params_set. +@params_map(with_namelist=True) +def for_each_api(nl, *args, **kw): + if nl.has_any('oldapi', 'newapi'): + # Reused tests; they've been through here before. 
+ yield '', C(*args, **kw) + return + yield 'newapi', C(*args, **kw) + kw['warnings'] = kw.get('warnings', []) + [ + (DeprecationWarning, r'.*API.*has changed') + ] + yield 'oldapi', C(*args, **kw, test_start=False) + class TestParser(TestParserMixin, TestEmailBase): From 1956df1a449bcb8ef8c930bf108edb103ccdb023 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 26 Dec 2025 14:06:56 -0500 Subject: [PATCH 013/152] Add tests for _get_ptext_to_endchars. This is an internal function whose functionality will get replaced during the refactor, but as the comment says, the replacement function will need to pass these tests as well. Adding these tests revealed a bug: the existing code wasn't properly reporting that there were quoted printables if the only ones were \\s, even though it was correctly decoding them. Clearly there are no test cases in the rest of the tests that cover this case, and since it only affects defect detection it has never been reported. I fixed the bug in the old code, even though the code will go away in the refactor, so that the tests are checking for the desired behavior. BUGFIX: the detection of obsolete quoted pairs in certain contexts (i.e.: domain literals) was imperfect, such that some defects of this type were missed. They should now be consistently detected. 
--- Lib/email/_header_value_parser.py | 1 + .../test_email/test__header_value_parser.py | 136 +++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 7fb0dfd66f37082..9481f0cab93842f 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1052,6 +1052,7 @@ def _get_ptext_to_endchars(value, endchars): continue if escape: escape = False + had_qp = True elif fragment[pos] in endchars: break vchars.append(fragment[pos]) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 4645cc1f36603fb..66e96ea1bc4d945 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -5,8 +5,9 @@ from email import _header_value_parser as parser from email import errors from email import policy -from random import choices, randint +from random import choices, randint, sample from test.test_email import ( + charname, check_all_warnings, for_each_character, TestEmailBase, @@ -19,6 +20,12 @@ params_map, ) +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 +RFC_PRINTABLES = bytes(range(33, 127)).decode('ascii') + +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 +RFC_WSP = chr(32) + chr(9) + # https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 RFC_NONPRINTABLES = bytes([*range(0, 33), 127]).decode('ascii') @@ -441,6 +448,133 @@ def test__validate_xtext(self, s, defects=[]): ) + # _get_ptext_to_endchars + + # As an internal method these tests are not API requirements; however, the + # behavior they check must be verified one way or another, so if the + # implementation changes there need to be equivalent tests. 
+ + @params + def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): + ptext, had_qp = self._test_parse( + parser._get_ptext_to_endchars, + C(s, endchars), + test_start=False, + **kw, + ) + self.assertEqual(had_qp, has_qp) + + @params_map + def for_each_endchar_set(*args, **kw): + # The function is general, but these are the ones we actually use. + endchar_sets = dict( + quoted_string='"', + comment='()', + domain_literal='[]', + ) + for name, endchars in endchar_sets.items(): + yield name, C(*args, endchars=endchars, **kw) + + @params_map + def for_each_endchar(*args, **kw): + return for_each_character(kw['endchars'])(C(*args, **kw)).items() + + # This params_map is used on exactly one expression, which has to contain a + # list of characters with no repeats. + @params_map + def stops_at_first_endchar_found(s): + for i in range(len(s)): + endchars = ''.join(sample((r := s[i:]), len(r))) + ec = charname(s[i]) + yield f'stops_at_first_endchar_found__string__{ec}', C( + s, + endchars=endchars, + remainder=r, + ) + yield f'stops_at_first_endchar_found__set__{ec}', C( + s, + endchars=set(endchars), + remainder=r, + ) + + params_test__get_ptext_to_endchars = Params( + + **for_each_endchar( + wsp_can_be_legal_endchars = C( + 'foo{char}bar"', + endchars='()' + RFC_WSP, + remainder='{char}bar"', + ), + ), + + **stops_at_first_endchar_found('(random?{})'), + + **for_each_endchar_set( + + one_word_no_wsp = C( + 'foo', + ), + + escaped_letter = C( + r'bar\s', + stringified='bars', + has_qp=True, + ), + + escaped_escape_char = C( + r'foo\\bar', + stringified=r'foo\bar', + has_qp=True, + ), + + any_printable_may_be_quoted = C( + ''.join(rf'\{c}' for c in RFC_PRINTABLES), + stringified=RFC_PRINTABLES, + has_qp=True, + ), + + ), + + **for_each_endchar( + for_each_endchar_set( + + stops_at_endchar = C( + 'foo{char}bar"', + remainder='{char}bar"', + ), + + quoted_endchar_no_actual_endchar = C( + r'foo\{char}bar', + stringified=r'foo{char}bar', + has_qp=True, + ), + 
+ quoted_endchar_before_actual_endchar = C( + r'foo\{char}bar{char}', + stringified='foo{char}bar', + remainder='{char}', + has_qp=True, + ), + + multiple_qp = C( + r'\{char}\foo\\\{char}\a{char}', + stringified=r'{char}foo\{char}a', + remainder=r'{char}', + has_qp=True, + ), + + no_qp_before_endchar_but_some_after = C( + r'foo{char}a\b\a\r', + remainder=r'{char}a\b\a\r', + has_qp=False, + ), + + ), + ), + + ) + + # get_fws def test_get_fws_only(self): From 1f0ffc10ec56d6f262155feb8c5f34a2b666cef6 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 17 Jan 2026 16:06:23 -0500 Subject: [PATCH 014/152] Begin refactoring of get_fws tests. --- Lib/test/test_email/test__header_value_parser.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 66e96ea1bc4d945..f2601c0ac3b0116 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -577,6 +577,17 @@ def stops_at_first_endchar_found(s): # get_fws + @params + def test_get_fws(self, s, *args, **kw): + fws = self._test_parse(parser.get_fws, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(fws, parser.WhiteSpaceTerminal) + self.assertEqual(fws.token_type, 'fws') + + params_test_get_fws = old_api_only( + ) + def test_get_fws_only(self): fws = self._test_get_x(parser.get_fws, ' \t ', ' \t ', ' ', [], '') self.assertEqual(fws.token_type, 'fws') @@ -587,6 +598,7 @@ def test_get_fws_space(self): def test_get_fws_ws_run(self): self._test_get_x(parser.get_fws, ' \t foo ', ' \t ', ' ', [], 'foo ') + # get_encoded_word def test_get_encoded_word_missing_start_raises(self): From 872fb8a1e95cfea2bb3e1415a26ad2e4ce9f16fd Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 8 Jan 2026 12:27:26 -0500 Subject: [PATCH 015/152] Rough conversion of existing get_fws tests. 
This makes the next diff easier to compare, then we'll tidy up. --- Lib/test/test_email/test__header_value_parser.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f2601c0ac3b0116..6085029d1aafab5 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -586,17 +586,17 @@ def test_get_fws(self, s, *args, **kw): self.assertEqual(fws.token_type, 'fws') params_test_get_fws = old_api_only( - ) - def test_get_fws_only(self): - fws = self._test_get_x(parser.get_fws, ' \t ', ' \t ', ' ', [], '') - self.assertEqual(fws.token_type, 'fws') + test_get_fws_only = C( + ' \t ', ' \t ', ' ', [], ''), - def test_get_fws_space(self): - self._test_get_x(parser.get_fws, ' foo', ' ', ' ', [], 'foo') + test_get_fws_space = C( + ' foo', ' ', ' ', [], 'foo'), - def test_get_fws_ws_run(self): - self._test_get_x(parser.get_fws, ' \t foo ', ' \t ', ' ', [], 'foo ') + test_get_fws_ws_run = C( + ' \t foo ', ' \t ', ' ', [], 'foo '), + + ) # get_encoded_word From 522bc0d491812b4411e0ab650f27efb184de4c42 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 26 Dec 2025 16:18:50 -0500 Subject: [PATCH 016/152] Fix whitespace and test names in get_fws tests. 
--- Lib/test/test_email/test__header_value_parser.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 6085029d1aafab5..be50f13b634d1dd 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -587,14 +587,11 @@ def test_get_fws(self, s, *args, **kw): params_test_get_fws = old_api_only( - test_get_fws_only = C( - ' \t ', ' \t ', ' ', [], ''), + ws_only = C(' \t ', ' \t ', ' ', [], ''), - test_get_fws_space = C( - ' foo', ' ', ' ', [], 'foo'), + space = C(' foo', ' ', ' ', [], 'foo'), - test_get_fws_ws_run = C( - ' \t foo ', ' \t ', ' ', [], 'foo '), + ws_run = C(' \t foo ', ' \t ', ' ', [], 'foo '), ) From 302de5bb26c676ba02ab335ab3e0f5e524917721 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 8 Jan 2026 12:48:28 -0500 Subject: [PATCH 017/152] Convert get_fws tests to keyword form. --- Lib/test/test_email/test__header_value_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index be50f13b634d1dd..30529313006d07b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -579,7 +579,7 @@ def stops_at_first_endchar_found(s): @params def test_get_fws(self, s, *args, **kw): - fws = self._test_parse(parser.get_fws, C(s), *args, **kw) + fws = self._test_parse(parser.get_fws, C(s), *args, value=' ', **kw) if 'exception' in kw: return self.assertIsInstance(fws, parser.WhiteSpaceTerminal) @@ -587,11 +587,11 @@ def test_get_fws(self, s, *args, **kw): params_test_get_fws = old_api_only( - ws_only = C(' \t ', ' \t ', ' ', [], ''), + ws_only = C(' \t '), - space = C(' foo', ' ', ' ', [], 'foo'), + space = C(' foo', remainder='foo'), - ws_run = C(' \t foo ', ' \t ', ' ', [], 'foo '), + 
ws_run = C(' \t foo ', remainder='foo '), ) From 7af1603027af9a38d7617e62e20655562c676019 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 20 Feb 2026 16:18:03 -0500 Subject: [PATCH 018/152] Improve the get_fws test names. --- Lib/test/test_email/test__header_value_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 30529313006d07b..9c0e8e699211763 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -587,11 +587,11 @@ def test_get_fws(self, s, *args, **kw): params_test_get_fws = old_api_only( - ws_only = C(' \t '), + wsp_run = C(' \t '), - space = C(' foo', remainder='foo'), + ends_at_non_wsp_after_wsp = C(' foo', remainder='foo'), - ws_run = C(' \t foo ', remainder='foo '), + ends_at_non_wsp_after_wsp_run = C(' \t foo ', remainder='foo '), ) From f2d069752060559e168016ec3269b6763a657959 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 26 Dec 2025 16:26:59 -0500 Subject: [PATCH 019/152] Improve get_fws tests. Unfortunately the existing behavior some of the new tests check is buggy: get_fws returns a WhiteSpaceTerminal whose contents is an empty string, but 'value' still returns a single blank...neither of these is correct behavior. We'll deprecate calling get_fws that way when we do the refactor.
--- .../test_email/test__header_value_parser.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 9c0e8e699211763..0f7aa8fccd85632 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -589,12 +589,26 @@ def test_get_fws(self, s, *args, **kw): wsp_run = C(' \t '), - ends_at_non_wsp_after_wsp = C(' foo', remainder='foo'), + **for_each_character(RFC_WSP)( + ends_at_non_wsp_after_wsp = C('{char}foo', remainder='foo'), + ), - ends_at_non_wsp_after_wsp_run = C(' \t foo ', remainder='foo '), + **for_each_character(RFC_PRINTABLES)( + ends_at_non_wsp_after_wsp_run = C(' \t{char} ', remainder='{char} '), + ), ) + # XXX XXX: these ought to error, but get_fws should never be called this way + # We'll deprecate the lack of raise during the refactor. + params_test_get_fws.update( + old_api_only( + empty = C(''), + no_wsp = C('foo', remainder='foo'), + no_leading_wsp = C('foo bar', remainder='foo bar'), + ), + ) + # get_encoded_word From aa5fa3fa4e6a9a63a6e6ffbb781996f58f274db5 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 2 Jan 2026 10:46:53 -0500 Subject: [PATCH 020/152] Begin _encoded_words test refactor. Code and comments to make the next diff cleaner. We're converting the _encoded_words tests because part of the parser refactor is going to involve a small refactor in this helper module. 
--- Lib/test/test_email/test__encoded_words.py | 63 ++++++++++++++++++++-- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index 1713962f94caef2..430c08763d8d404 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -2,40 +2,51 @@ from email import _encoded_words as _ew from email import errors from test.test_email import TestEmailBase +from test.test_email.params import params, Params class TestDecodeQ(TestEmailBase): + #@params def _test(self, source, ex_result, ex_defects=[]): result, defects = _ew.decode_q(source) self.assertEqual(result, ex_result) self.assertDefectsEqual(defects, ex_defects) + #params_test = dict( + def test_no_encoded(self): self._test(b'foobar', b'foobar') - def test_spaces(self): + def test_encoded_spaces(self): self._test(b'foo=20bar=20', b'foo bar ') + + def test_underline_spaces(self): self._test(b'foo_bar_', b'foo bar ') def test_run_of_encoded(self): self._test(b'foo=20=20=21=2Cbar', b'foo !,bar') + # ) + class TestDecodeB(TestEmailBase): + #@params def _test(self, source, ex_result, ex_defects=[]): result, defects = _ew.decode_b(source) self.assertEqual(result, ex_result) self.assertDefectsEqual(defects, ex_defects) + #params_test = dict( + def test_simple(self): self._test(b'Zm9v', b'foo') - def test_missing_padding(self): - # 1 missing padding character + def test_missing_1_padding_char(self): self._test(b'dmk', b'vi', [errors.InvalidBase64PaddingDefect]) - # 2 missing padding characters + + def test_missing_2_padding_chars(self): self._test(b'dg', b'v', [errors.InvalidBase64PaddingDefect]) def test_invalid_character(self): @@ -48,19 +59,38 @@ def test_invalid_character_and_bad_padding(self): def test_invalid_length(self): self._test(b'abcde', b'abcde', [errors.InvalidBase64LengthDefect]) + # ) + class TestDecode(TestEmailBase): - def test_wrong_format_input_raises(self): + 
#@params + #def test_raises_if(self, value, exception=ValueError): + # with self.assertRaises(exception): + # _ew.decode(value) + + #params_test_raises_if = dict( + + def test_raises_if_missing_middle(self): with self.assertRaises(ValueError): _ew.decode('=?badone?=') + + def test_raises_if_beginning_only(self): with self.assertRaises(ValueError): _ew.decode('=?') + + def test_raises_if_empty_string(self): with self.assertRaises(ValueError): _ew.decode('') + + def test_raises_if_invalid_encoding(self): with self.assertRaises(KeyError): _ew.decode('=?utf-8?X?somevalue?=') + #) + + + #@params def _test(self, source, result, charset='us-ascii', lang='', defects=[]): res, char, l, d = _ew.decode(source) self.assertEqual(res, result) @@ -68,6 +98,8 @@ def _test(self, source, result, charset='us-ascii', lang='', defects=[]): self.assertEqual(l, lang) self.assertDefectsEqual(d, defects) + #params_test = dict( + def test_simple_q(self): self._test('=?us-ascii?q?foo?=', 'foo') @@ -142,12 +174,17 @@ def test_q_nonascii(self): 'Éric', charset='utf-8') + # ) + class TestEncodeQ(TestEmailBase): + #@params def _test(self, src, expected): self.assertEqual(_ew.encode_q(src), expected) + #params_test = dict( + def test_all_safe(self): self._test(b'foobar', 'foobar') @@ -157,9 +194,18 @@ def test_spaces(self): def test_run_of_encodables(self): self._test(b'foo ,,bar', 'foo__=2C=2Cbar') + # ) + class TestEncodeB(TestEmailBase): + @params + def test(self, src, expected): + self.assertEqual(_ew.encode_b(src), expected) + + params_test = Params( + ) + def test_simple(self): self.assertEqual(_ew.encode_b(b'foo'), 'Zm9v') @@ -169,6 +215,13 @@ def test_padding(self): class TestEncode(TestEmailBase): + @params + def test(self, callspec, expected): + self.assertEqual(callspec(_ew.encode), expected) + + params_test = Params( + ) + def test_q(self): self.assertEqual(_ew.encode('foo', 'utf-8', 'q'), '=?utf-8?q?foo?=') From 6cc9d141fcbc80b1169409fe20f62a691fff59e8 Mon Sep 17 00:00:00 2001 From: 
"R. David Murray" Date: Fri, 2 Jan 2026 10:41:18 -0500 Subject: [PATCH 021/152] Rough conversion of _encoded_word tests. --- Lib/test/test_email/test__encoded_words.py | 246 +++++++++++---------- 1 file changed, 128 insertions(+), 118 deletions(-) diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index 430c08763d8d404..ab09ec048a08095 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -2,199 +2,204 @@ from email import _encoded_words as _ew from email import errors from test.test_email import TestEmailBase -from test.test_email.params import params, Params +from test.test_email.params import C, params, Params class TestDecodeQ(TestEmailBase): - #@params - def _test(self, source, ex_result, ex_defects=[]): + @params + def test(self, source, ex_result, ex_defects=[]): result, defects = _ew.decode_q(source) self.assertEqual(result, ex_result) self.assertDefectsEqual(defects, ex_defects) - #params_test = dict( + params_test = Params( - def test_no_encoded(self): - self._test(b'foobar', b'foobar') + test_no_encoded = C( + b'foobar', b'foobar'), - def test_encoded_spaces(self): - self._test(b'foo=20bar=20', b'foo bar ') + test_encoded_spaces = C( + b'foo=20bar=20', b'foo bar '), - def test_underline_spaces(self): - self._test(b'foo_bar_', b'foo bar ') + test_underline_space = C( + b'foo_bar_', b'foo bar '), - def test_run_of_encoded(self): - self._test(b'foo=20=20=21=2Cbar', b'foo !,bar') + test_run_of_encoded = C( + b'foo=20=20=21=2Cbar', b'foo !,bar'), - # ) + ) class TestDecodeB(TestEmailBase): - #@params - def _test(self, source, ex_result, ex_defects=[]): + @params + def test(self, source, ex_result, ex_defects=[]): result, defects = _ew.decode_b(source) self.assertEqual(result, ex_result) self.assertDefectsEqual(defects, ex_defects) - #params_test = dict( + params_test = Params( - def test_simple(self): - self._test(b'Zm9v', b'foo') + test_simple = C( + 
b'Zm9v', b'foo'), - def test_missing_1_padding_char(self): - self._test(b'dmk', b'vi', [errors.InvalidBase64PaddingDefect]) + test_missing_1_padding_char = C( + b'dmk', b'vi', [errors.InvalidBase64PaddingDefect]), - def test_missing_2_padding_chars(self): - self._test(b'dg', b'v', [errors.InvalidBase64PaddingDefect]) + test_missing_2_padding_char = C( + b'dg', b'v', [errors.InvalidBase64PaddingDefect]), - def test_invalid_character(self): - self._test(b'dm\x01k===', b'vi', [errors.InvalidBase64CharactersDefect]) + test_invalid_character = C( + b'dm\x01k===', b'vi', [errors.InvalidBase64CharactersDefect]), - def test_invalid_character_and_bad_padding(self): - self._test(b'dm\x01k', b'vi', [errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect]) + test_invalid_character_and_bad_padding = C( + b'dm\x01k', b'vi', [errors.InvalidBase64CharactersDefect, + errors.InvalidBase64PaddingDefect]), - def test_invalid_length(self): - self._test(b'abcde', b'abcde', [errors.InvalidBase64LengthDefect]) + test_invalid_length = C( + b'abcde', b'abcde', [errors.InvalidBase64LengthDefect]), - # ) + ) class TestDecode(TestEmailBase): - #@params - #def test_raises_if(self, value, exception=ValueError): - # with self.assertRaises(exception): - # _ew.decode(value) + @params + def test_raises_if(self, value, exception=ValueError): + with self.assertRaises(exception): + _ew.decode(value) - #params_test_raises_if = dict( + params_test_raises_if = Params( - def test_raises_if_missing_middle(self): - with self.assertRaises(ValueError): - _ew.decode('=?badone?=') + missing_middle = C( + '=?badone?='), - def test_raises_if_beginning_only(self): - with self.assertRaises(ValueError): - _ew.decode('=?') + beginning_only = C( + '=?'), - def test_raises_if_empty_string(self): - with self.assertRaises(ValueError): - _ew.decode('') + empty_string = C( + ''), - def test_raises_if_invalid_encoding(self): - with self.assertRaises(KeyError): - _ew.decode('=?utf-8?X?somevalue?=') + 
invalid_encoding = C( + '=?utf-8?X?somevalue?=', exception=KeyError), - #) + ) - #@params - def _test(self, source, result, charset='us-ascii', lang='', defects=[]): + @params + def test(self, source, result, charset='us-ascii', lang='', defects=[]): res, char, l, d = _ew.decode(source) self.assertEqual(res, result) self.assertEqual(char, charset) self.assertEqual(l, lang) self.assertDefectsEqual(d, defects) - #params_test = dict( + params_test = Params( - def test_simple_q(self): - self._test('=?us-ascii?q?foo?=', 'foo') + test_simple_q = C( + '=?us-ascii?q?foo?=', 'foo'), - def test_simple_b(self): - self._test('=?us-ascii?b?dmk=?=', 'vi') + test_simple_b = C( + '=?us-ascii?b?dmk=?=', 'vi'), - def test_q_case_ignored(self): - self._test('=?us-ascii?Q?foo?=', 'foo') + test_q_case_ignored = C( + '=?us-ascii?Q?foo?=', 'foo'), - def test_b_case_ignored(self): - self._test('=?us-ascii?B?dmk=?=', 'vi') + test_b_case_ignored = C( + '=?us-ascii?B?dmk=?=', 'vi'), - def test_non_trivial_q(self): - self._test('=?latin-1?q?=20F=fcr=20Elise=20?=', ' Für Elise ', 'latin-1') + test_non_trivial_q = C( + '=?latin-1?q?=20F=fcr=20Elise=20?=', ' Für Elise ', 'latin-1'), - def test_q_escaped_bytes_preserved(self): - self._test(b'=?us-ascii?q?=20\xACfoo?='.decode('us-ascii', + test_q_escaped_bytes_preserved = C( + b'=?us-ascii?q?=20\xACfoo?='.decode('us-ascii', 'surrogateescape'), ' \uDCACfoo', defects = [errors.UndecodableBytesDefect]) + , - def test_b_undecodable_bytes_ignored_with_defect(self): - self._test(b'=?us-ascii?b?dm\xACk?='.decode('us-ascii', + test_b_undecodable_bytes_ignored_with_defect = C( + b'=?us-ascii?b?dm\xACk?='.decode('us-ascii', 'surrogateescape'), 'vi', defects = [ errors.InvalidBase64CharactersDefect, errors.InvalidBase64PaddingDefect]) + , - def test_b_invalid_bytes_ignored_with_defect(self): - self._test('=?us-ascii?b?dm\x01k===?=', + test_b_invalid_bytes_ignored_with_defect = C( + '=?us-ascii?b?dm\x01k===?=', 'vi', defects = 
[errors.InvalidBase64CharactersDefect]) + , - def test_b_invalid_bytes_incorrect_padding(self): - self._test('=?us-ascii?b?dm\x01k?=', + test_b_invalid_bytes_incorrect_padding = C( + '=?us-ascii?b?dm\x01k?=', 'vi', defects = [ errors.InvalidBase64CharactersDefect, errors.InvalidBase64PaddingDefect]) + , - def test_b_padding_defect(self): - self._test('=?us-ascii?b?dmk?=', + test_b_padding_defect = C( + '=?us-ascii?b?dmk?=', 'vi', defects = [errors.InvalidBase64PaddingDefect]) + , - def test_nonnull_lang(self): - self._test('=?us-ascii*jive?q?test?=', 'test', lang='jive') + test_nonnull_lang = C( + '=?us-ascii*jive?q?test?=', 'test', lang='jive'), - def test_unknown_8bit_charset(self): - self._test('=?unknown-8bit?q?foo=ACbar?=', + test_unknown_8bit_charset = C( + '=?unknown-8bit?q?foo=ACbar?=', b'foo\xacbar'.decode('ascii', 'surrogateescape'), charset = 'unknown-8bit', defects = []) + , - def test_unknown_charset(self): - self._test('=?foobar?q?foo=ACbar?=', + test_unknown_charset = C( + '=?foobar?q?foo=ACbar?=', b'foo\xacbar'.decode('ascii', 'surrogateescape'), charset = 'foobar', # XXX Should this be a new Defect instead? defects = [errors.CharsetError]) + , - def test_invalid_character_in_charset(self): - self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=', + test_invalid_character_in_charset = C( + '=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=', b'foo\xacbar'.decode('ascii', 'surrogateescape'), charset = 'utf-8\udce2\udc80\udc9d', # XXX Should this be a new Defect instead? 
defects = [errors.CharsetError]) + , - def test_q_nonascii(self): - self._test('=?utf-8?q?=C3=89ric?=', + test_q_nonascii = C( + '=?utf-8?q?=C3=89ric?=', 'Éric', charset='utf-8') + , - # ) + ) class TestEncodeQ(TestEmailBase): - #@params - def _test(self, src, expected): + @params + def test(self, src, expected): self.assertEqual(_ew.encode_q(src), expected) - #params_test = dict( + params_test = Params( - def test_all_safe(self): - self._test(b'foobar', 'foobar') + test_all_safe = C( + b'foobar', 'foobar'), - def test_spaces(self): - self._test(b'foo bar ', 'foo_bar_') + test_spaces = C( + b'foo bar ', 'foo_bar_'), - def test_run_of_encodables(self): - self._test(b'foo ,,bar', 'foo__=2C=2Cbar') + test_run_of_encodables = C( + b'foo ,,bar', 'foo__=2C=2Cbar'), - # ) + ) class TestEncodeB(TestEmailBase): @@ -204,13 +209,14 @@ def test(self, src, expected): self.assertEqual(_ew.encode_b(src), expected) params_test = Params( - ) - def test_simple(self): - self.assertEqual(_ew.encode_b(b'foo'), 'Zm9v') + test_simple = C( + b'foo', 'Zm9v'), - def test_padding(self): - self.assertEqual(_ew.encode_b(b'vi'), 'dmk=') + test_padding = C( + b'vi', 'dmk='), + + ) class TestEncode(TestEmailBase): @@ -220,40 +226,44 @@ def test(self, callspec, expected): self.assertEqual(callspec(_ew.encode), expected) params_test = Params( - ) - def test_q(self): - self.assertEqual(_ew.encode('foo', 'utf-8', 'q'), '=?utf-8?q?foo?=') + test_q = C( + C('foo', 'utf-8', 'q'), '=?utf-8?q?foo?='), - def test_b(self): - self.assertEqual(_ew.encode('foo', 'utf-8', 'b'), '=?utf-8?b?Zm9v?=') + test_b = C( + C('foo', 'utf-8', 'b'), '=?utf-8?b?Zm9v?='), - def test_auto_q(self): - self.assertEqual(_ew.encode('foo', 'utf-8'), '=?utf-8?q?foo?=') + test_auto_q = C( + C('foo', 'utf-8'), '=?utf-8?q?foo?='), - def test_auto_q_if_short_mostly_safe(self): - self.assertEqual(_ew.encode('vi.', 'utf-8'), '=?utf-8?q?vi=2E?=') + test_auto_q_if_short_mostly_safe = C( + C('vi.', 'utf-8'), '=?utf-8?q?vi=2E?='), - def 
test_auto_b_if_enough_unsafe(self): - self.assertEqual(_ew.encode('.....', 'utf-8'), '=?utf-8?b?Li4uLi4=?=') + test_auto_b_if_enough_unsafe = C( + C('.....', 'utf-8'), '=?utf-8?b?Li4uLi4=?='), - def test_auto_b_if_long_unsafe(self): - self.assertEqual(_ew.encode('vi.vi.vi.vi.vi.', 'utf-8'), + test_auto_b_if_long_unsafe = C( + C('vi.vi.vi.vi.vi.', 'utf-8'), '=?utf-8?b?dmkudmkudmkudmkudmku?=') + , - def test_auto_q_if_long_mostly_safe(self): - self.assertEqual(_ew.encode('vi vi vi.vi ', 'utf-8'), + test_auto_q_if_long_mostly_safe = C( + C('vi vi vi.vi ', 'utf-8'), '=?utf-8?q?vi_vi_vi=2Evi_?=') + , - def test_utf8_default(self): - self.assertEqual(_ew.encode('foo'), '=?utf-8?q?foo?=') + test_utf8_default = C( + C('foo'), '=?utf-8?q?foo?='), - def test_lang(self): - self.assertEqual(_ew.encode('foo', lang='jive'), '=?utf-8*jive?q?foo?=') + test_lang = C( + C('foo', lang='jive'), '=?utf-8*jive?q?foo?='), - def test_unknown_8bit(self): - self.assertEqual(_ew.encode('foo\uDCACbar', charset='unknown-8bit'), + test_unknown_8bit = C( + C('foo\uDCACbar', charset='unknown-8bit'), '=?unknown-8bit?q?foo=ACbar?=') + , + + ) if __name__ == '__main__': From d3f4faceb2b7b9773136e45aaf03345b94f73793 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 2 Jan 2026 12:32:57 -0500 Subject: [PATCH 022/152] Fix whitespace and test names in _encoded_words tests. 
--- Lib/test/test_email/test__encoded_words.py | 369 +++++++++++---------- 1 file changed, 200 insertions(+), 169 deletions(-) diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index ab09ec048a08095..6519abbddbb4fd3 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -14,19 +14,10 @@ def test(self, source, ex_result, ex_defects=[]): self.assertDefectsEqual(defects, ex_defects) params_test = Params( - - test_no_encoded = C( - b'foobar', b'foobar'), - - test_encoded_spaces = C( - b'foo=20bar=20', b'foo bar '), - - test_underline_space = C( - b'foo_bar_', b'foo bar '), - - test_run_of_encoded = C( - b'foo=20=20=21=2Cbar', b'foo !,bar'), - + no_encoded = C(b'foobar', b'foobar'), + encoded_spaces = C(b'foo=20bar=20', b'foo bar '), + underline_space = C(b'foo_bar_', b'foo bar '), + run_of_encoded = C(b'foo=20=20=21=2Cbar', b'foo !,bar'), ) @@ -40,24 +31,43 @@ def test(self, source, ex_result, ex_defects=[]): params_test = Params( - test_simple = C( - b'Zm9v', b'foo'), - - test_missing_1_padding_char = C( - b'dmk', b'vi', [errors.InvalidBase64PaddingDefect]), - - test_missing_2_padding_char = C( - b'dg', b'v', [errors.InvalidBase64PaddingDefect]), - - test_invalid_character = C( - b'dm\x01k===', b'vi', [errors.InvalidBase64CharactersDefect]), - - test_invalid_character_and_bad_padding = C( - b'dm\x01k', b'vi', [errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect]), - - test_invalid_length = C( - b'abcde', b'abcde', [errors.InvalidBase64LengthDefect]), + simple = C( + b'Zm9v', + b'foo', + ), + + missing_1_padding_char = C( + b'dmk', + b'vi', + [errors.InvalidBase64PaddingDefect], + ), + + missing_2_padding_char = C( + b'dg', + b'v', + [errors.InvalidBase64PaddingDefect], + ), + + invalid_character = C( + b'dm\x01k===', + b'vi', + [errors.InvalidBase64CharactersDefect], + ), + + invalid_character_and_bad_padding = C( + b'dm\x01k', + b'vi', + [ + 
errors.InvalidBase64CharactersDefect, + errors.InvalidBase64PaddingDefect, + ], + ), + + invalid_length = C( + b'abcde', + b'abcde', + [errors.InvalidBase64LengthDefect], + ), ) @@ -72,16 +82,21 @@ def test_raises_if(self, value, exception=ValueError): params_test_raises_if = Params( missing_middle = C( - '=?badone?='), + '=?badone?=', + ), beginning_only = C( - '=?'), + '=?', + ), empty_string = C( - ''), + '', + ), invalid_encoding = C( - '=?utf-8?X?somevalue?=', exception=KeyError), + '=?utf-8?X?somevalue?=', + exception=KeyError, + ), ) @@ -96,88 +111,102 @@ def test(self, source, result, charset='us-ascii', lang='', defects=[]): params_test = Params( - test_simple_q = C( - '=?us-ascii?q?foo?=', 'foo'), - - test_simple_b = C( - '=?us-ascii?b?dmk=?=', 'vi'), - - test_q_case_ignored = C( - '=?us-ascii?Q?foo?=', 'foo'), - - test_b_case_ignored = C( - '=?us-ascii?B?dmk=?=', 'vi'), - - test_non_trivial_q = C( - '=?latin-1?q?=20F=fcr=20Elise=20?=', ' Für Elise ', 'latin-1'), - - test_q_escaped_bytes_preserved = C( - b'=?us-ascii?q?=20\xACfoo?='.decode('us-ascii', - 'surrogateescape'), - ' \uDCACfoo', - defects = [errors.UndecodableBytesDefect]) - , - - test_b_undecodable_bytes_ignored_with_defect = C( - b'=?us-ascii?b?dm\xACk?='.decode('us-ascii', - 'surrogateescape'), - 'vi', - defects = [ - errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect]) - , - - test_b_invalid_bytes_ignored_with_defect = C( - '=?us-ascii?b?dm\x01k===?=', - 'vi', - defects = [errors.InvalidBase64CharactersDefect]) - , - - test_b_invalid_bytes_incorrect_padding = C( - '=?us-ascii?b?dm\x01k?=', - 'vi', - defects = [ - errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect]) - , - - test_b_padding_defect = C( - '=?us-ascii?b?dmk?=', - 'vi', - defects = [errors.InvalidBase64PaddingDefect]) - , - - test_nonnull_lang = C( - '=?us-ascii*jive?q?test?=', 'test', lang='jive'), - - test_unknown_8bit_charset = C( - '=?unknown-8bit?q?foo=ACbar?=', - 
b'foo\xacbar'.decode('ascii', 'surrogateescape'), - charset = 'unknown-8bit', - defects = []) - , - - test_unknown_charset = C( - '=?foobar?q?foo=ACbar?=', - b'foo\xacbar'.decode('ascii', 'surrogateescape'), - charset = 'foobar', - # XXX Should this be a new Defect instead? - defects = [errors.CharsetError]) - , - - test_invalid_character_in_charset = C( - '=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=', - b'foo\xacbar'.decode('ascii', 'surrogateescape'), - charset = 'utf-8\udce2\udc80\udc9d', - # XXX Should this be a new Defect instead? - defects = [errors.CharsetError]) - , - - test_q_nonascii = C( - '=?utf-8?q?=C3=89ric?=', - 'Éric', - charset='utf-8') - , + simple_q = C( + '=?us-ascii?q?foo?=', + 'foo', + ), + + simple_b = C( + '=?us-ascii?b?dmk=?=', + 'vi', + ), + + q_case_ignored = C( + '=?us-ascii?Q?foo?=', + 'foo', + ), + + b_case_ignored = C( + '=?us-ascii?B?dmk=?=', + 'vi', + ), + + non_trivial_q = C( + '=?latin-1?q?=20F=fcr=20Elise=20?=', + ' Für Elise ', + 'latin-1', + ), + + q_escaped_bytes_preserved = C( + b'=?us-ascii?q?=20\xACfoo?='.decode('us-ascii', 'surrogateescape'), + ' \uDCACfoo', + defects=[errors.UndecodableBytesDefect], + ), + + b_undecodable_bytes_ignored_with_defect = C( + b'=?us-ascii?b?dm\xACk?='.decode('us-ascii', 'surrogateescape'), + 'vi', + defects=[ + errors.InvalidBase64CharactersDefect, + errors.InvalidBase64PaddingDefect, + ], + ), + + b_invalid_bytes_ignored_with_defect = C( + '=?us-ascii?b?dm\x01k===?=', + 'vi', + defects=[errors.InvalidBase64CharactersDefect], + ), + + b_invalid_bytes_incorrect_padding = C( + '=?us-ascii?b?dm\x01k?=', + 'vi', + defects=[ + errors.InvalidBase64CharactersDefect, + errors.InvalidBase64PaddingDefect, + ], + ), + + b_padding_defect = C( + '=?us-ascii?b?dmk?=', + 'vi', + defects=[errors.InvalidBase64PaddingDefect], + ), + + nonnull_lang = C( + '=?us-ascii*jive?q?test?=', + 'test', + lang='jive', + ), + + unknown_8bit_charset = C( + '=?unknown-8bit?q?foo=ACbar?=', + b'foo\xacbar'.decode('ascii', 
'surrogateescape'), + charset='unknown-8bit', + defects=[], + ), + + unknown_charset = C( + '=?foobar?q?foo=ACbar?=', + b'foo\xacbar'.decode('ascii', 'surrogateescape'), + charset='foobar', + # XXX Should this be a new Defect instead? + defects=[errors.CharsetError], + ), + + invalid_character_in_charset = C( + '=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=', + b'foo\xacbar'.decode('ascii', 'surrogateescape'), + charset='utf-8\udce2\udc80\udc9d', + # XXX Should this be a new Defect instead? + defects=[errors.CharsetError], + ), + + q_nonascii = C( + '=?utf-8?q?=C3=89ric?=', + 'Éric', + charset='utf-8', + ), ) @@ -189,16 +218,9 @@ def test(self, src, expected): self.assertEqual(_ew.encode_q(src), expected) params_test = Params( - - test_all_safe = C( - b'foobar', 'foobar'), - - test_spaces = C( - b'foo bar ', 'foo_bar_'), - - test_run_of_encodables = C( - b'foo ,,bar', 'foo__=2C=2Cbar'), - + all_safe = C(b'foobar', 'foobar'), + spaces = C(b'foo bar ', 'foo_bar_'), + run_of_encodables = C(b'foo ,,bar', 'foo__=2C=2Cbar'), ) @@ -209,13 +231,8 @@ def test(self, src, expected): self.assertEqual(_ew.encode_b(src), expected) params_test = Params( - - test_simple = C( - b'foo', 'Zm9v'), - - test_padding = C( - b'vi', 'dmk='), - + simple = C(b'foo', 'Zm9v'), + padding = C(b'vi', 'dmk='), ) @@ -227,41 +244,55 @@ def test(self, callspec, expected): params_test = Params( - test_q = C( - C('foo', 'utf-8', 'q'), '=?utf-8?q?foo?='), - - test_b = C( - C('foo', 'utf-8', 'b'), '=?utf-8?b?Zm9v?='), - - test_auto_q = C( - C('foo', 'utf-8'), '=?utf-8?q?foo?='), - - test_auto_q_if_short_mostly_safe = C( - C('vi.', 'utf-8'), '=?utf-8?q?vi=2E?='), - - test_auto_b_if_enough_unsafe = C( - C('.....', 'utf-8'), '=?utf-8?b?Li4uLi4=?='), - - test_auto_b_if_long_unsafe = C( - C('vi.vi.vi.vi.vi.', 'utf-8'), - '=?utf-8?b?dmkudmkudmkudmkudmku?=') - , - - test_auto_q_if_long_mostly_safe = C( - C('vi vi vi.vi ', 'utf-8'), - '=?utf-8?q?vi_vi_vi=2Evi_?=') - , - - test_utf8_default = C( - C('foo'), 
'=?utf-8?q?foo?='), - - test_lang = C( - C('foo', lang='jive'), '=?utf-8*jive?q?foo?='), - - test_unknown_8bit = C( - C('foo\uDCACbar', charset='unknown-8bit'), - '=?unknown-8bit?q?foo=ACbar?=') - , + q = C( + C('foo', 'utf-8', 'q'), + '=?utf-8?q?foo?=', + ), + + b = C( + C('foo', 'utf-8', 'b'), + '=?utf-8?b?Zm9v?=', + ), + + auto_q = C( + C('foo', 'utf-8'), + '=?utf-8?q?foo?=', + ), + + auto_q_if_short_mostly_safe = C( + C('vi.', 'utf-8'), + '=?utf-8?q?vi=2E?=', + ), + + auto_b_if_enough_unsafe = C( + C('.....', 'utf-8'), + '=?utf-8?b?Li4uLi4=?=', + ), + + auto_b_if_long_unsafe = C( + C('vi.vi.vi.vi.vi.', 'utf-8'), + '=?utf-8?b?dmkudmkudmkudmkudmku?=', + ), + + auto_q_if_long_mostly_safe = C( + C('vi vi vi.vi ', 'utf-8'), + '=?utf-8?q?vi_vi_vi=2Evi_?=', + ), + + utf8_default = C( + C('foo'), + '=?utf-8?q?foo?=', + ), + + lang = C( + C('foo', lang='jive'), + '=?utf-8*jive?q?foo?=', + ), + + unknown_8bit = C( + C('foo\uDCACbar', charset='unknown-8bit'), + '=?unknown-8bit?q?foo=ACbar?=', + ), ) From d43d77ea1df662206bdec7222e43318e90986456 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 2 Jan 2026 14:11:25 -0500 Subject: [PATCH 023/152] Tidy up to finish _encoded_words test conversion. Using 'actual_' as a prefix makes the error results clearer when tests fail. Adding 'defects=' doesn't really add much here, but it is the new pattern established in test__header_value_parser so it is nice to be consistent. The rest is a small simplification of how the encoder and decoder tests are factored. 
--- Lib/test/test_email/test__encoded_words.py | 114 +++++++++------------ 1 file changed, 50 insertions(+), 64 deletions(-) diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index 6519abbddbb4fd3..d8884bc8a69189f 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -5,15 +5,19 @@ from test.test_email.params import C, params, Params -class TestDecodeQ(TestEmailBase): +class TestDecoders(TestEmailBase): + + def _test(self, function, source, result, defects=[]): + actual_result, actual_defects = function(source) + self.assertEqual(actual_result, result) + self.assertDefectsEqual(actual_defects, defects) + @params - def test(self, source, ex_result, ex_defects=[]): - result, defects = _ew.decode_q(source) - self.assertEqual(result, ex_result) - self.assertDefectsEqual(defects, ex_defects) + def test_decode_q(self, *args, **kw): + return self._test(_ew.decode_q, *args, **kw) - params_test = Params( + params_test_decode_q = Params( no_encoded = C(b'foobar', b'foobar'), encoded_spaces = C(b'foo=20bar=20', b'foo bar '), underline_space = C(b'foo_bar_', b'foo bar '), @@ -21,15 +25,11 @@ def test(self, source, ex_result, ex_defects=[]): ) -class TestDecodeB(TestEmailBase): - @params - def test(self, source, ex_result, ex_defects=[]): - result, defects = _ew.decode_b(source) - self.assertEqual(result, ex_result) - self.assertDefectsEqual(defects, ex_defects) + def test_decode_b(self, *args, **kw): + return self._test(_ew.decode_b, *args, **kw) - params_test = Params( + params_test_decode_b = Params( simple = C( b'Zm9v', @@ -39,25 +39,25 @@ def test(self, source, ex_result, ex_defects=[]): missing_1_padding_char = C( b'dmk', b'vi', - [errors.InvalidBase64PaddingDefect], + defects=[errors.InvalidBase64PaddingDefect], ), missing_2_padding_char = C( b'dg', b'v', - [errors.InvalidBase64PaddingDefect], + defects=[errors.InvalidBase64PaddingDefect], ), invalid_character = C( 
b'dm\x01k===', b'vi', - [errors.InvalidBase64CharactersDefect], + defects=[errors.InvalidBase64CharactersDefect], ), invalid_character_and_bad_padding = C( b'dm\x01k', b'vi', - [ + defects=[ errors.InvalidBase64CharactersDefect, errors.InvalidBase64PaddingDefect, ], @@ -66,50 +66,41 @@ def test(self, source, ex_result, ex_defects=[]): invalid_length = C( b'abcde', b'abcde', - [errors.InvalidBase64LengthDefect], + defects=[errors.InvalidBase64LengthDefect], ), ) -class TestDecode(TestEmailBase): - @params - def test_raises_if(self, value, exception=ValueError): + def test_decode_raises_if_value(self, value, exception=ValueError): with self.assertRaises(exception): _ew.decode(value) - params_test_raises_if = Params( - - missing_middle = C( - '=?badone?=', - ), - - beginning_only = C( - '=?', - ), - - empty_string = C( - '', - ), - - invalid_encoding = C( - '=?utf-8?X?somevalue?=', - exception=KeyError, - ), - + params_test_decode_raises_if_value = Params( + missing_middle = C('=?badone?='), + beginning_only = C('=?'), + empty_string = C(''), + invalid_encoding = C('=?utf-8?X?somevalue?=', exception=KeyError), ) @params - def test(self, source, result, charset='us-ascii', lang='', defects=[]): - res, char, l, d = _ew.decode(source) - self.assertEqual(res, result) - self.assertEqual(char, charset) - self.assertEqual(l, lang) - self.assertDefectsEqual(d, defects) + def test_decode( + self, + source, + result, + charset='us-ascii', + lang='', + defects=[], + ): + actual, actual_charset, actual_lang, actual_defects = _ew.decode(source) + self.assertEqual(actual, result) + self.assertEqual(actual_charset, charset) + self.assertEqual(actual_lang, lang) + self.assertDefectsEqual(actual_defects, defects) - params_test = Params( + params_test_decode = Params( simple_q = C( '=?us-ascii?q?foo?=', @@ -134,7 +125,7 @@ def test(self, source, result, charset='us-ascii', lang='', defects=[]): non_trivial_q = C( '=?latin-1?q?=20F=fcr=20Elise=20?=', ' Für Elise ', - 'latin-1', + 
charset='latin-1', ), q_escaped_bytes_preserved = C( @@ -211,38 +202,33 @@ def test(self, source, result, charset='us-ascii', lang='', defects=[]): ) -class TestEncodeQ(TestEmailBase): +class TestEncoders(TestEmailBase): - @params - def test(self, src, expected): - self.assertEqual(_ew.encode_q(src), expected) + def _test(self, function, source, expected): + self.assertEqual(function(source), expected) - params_test = Params( + @params( all_safe = C(b'foobar', 'foobar'), spaces = C(b'foo bar ', 'foo_bar_'), run_of_encodables = C(b'foo ,,bar', 'foo__=2C=2Cbar'), ) + def test_encode_q(self, *args, **kw): + return self._test(_ew.encode_q, *args, **kw) -class TestEncodeB(TestEmailBase): - - @params - def test(self, src, expected): - self.assertEqual(_ew.encode_b(src), expected) - - params_test = Params( + @params( simple = C(b'foo', 'Zm9v'), padding = C(b'vi', 'dmk='), ) + def test_encode_b(self, *args, **kw): + return self._test(_ew.encode_b, *args, **kw) -class TestEncode(TestEmailBase): - @params - def test(self, callspec, expected): + def test_encode(self, callspec, expected): self.assertEqual(callspec(_ew.encode), expected) - params_test = Params( + params_test_encode = Params( q = C( C('foo', 'utf-8', 'q'), From d44e56d468203f72faa4de624eac99889f7c660b Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 3 Jan 2026 13:40:00 -0500 Subject: [PATCH 024/152] Begin get_encoded_word test refactor. Add comments to make the next diff cleaner. 
--- Lib/test/test_email/test__header_value_parser.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 0f7aa8fccd85632..399b8297928c304 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -612,6 +612,17 @@ def test_get_fws(self, s, *args, **kw): # get_encoded_word + @params + def test_get_encoded_word(self, s, *args, charset='us-ascii', lang='', **kw): + res = self._test_parse(parser.get_encoded_word, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertEqual(res.charset, charset) + self.assertEqual(res.lang, lang) + + params_test_get_encoded_word = old_api_only( + ) + def test_get_encoded_word_missing_start_raises(self): with self.assertRaises(errors.HeaderParseError): parser.get_encoded_word('abc') From 2a18c460ca5da350630a42d1769fbff47b0b420f Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 27 Dec 2025 17:09:45 -0500 Subject: [PATCH 025/152] Rough conversion of get_encoded_word tests. 
--- .../test_email/test__header_value_parser.py | 78 ++++++++++--------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 399b8297928c304..7628832b40e280d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -621,100 +621,104 @@ def test_get_encoded_word(self, s, *args, charset='us-ascii', lang='', **kw): self.assertEqual(res.lang, lang) params_test_get_encoded_word = old_api_only( - ) - def test_get_encoded_word_missing_start_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('abc') + test_get_encoded_word_missing_start_raises = C( + 'abc', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_encoded_word_missing_end_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('=?abc') + test_get_encoded_word_missing_end_raises = C( + '=?abc', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_encoded_word_missing_middle_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('=?abc?=') + test_get_encoded_word_missing_middle_raises = C( + '=?abc?=', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_encoded_word_invalid_cte(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('=?utf-8?X?somevalue?=') + test_get_encoded_word_invalid_cte = C( + '=?utf-8?X?abc?=', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_encoded_word_valid_ew(self): - self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_valid_ew = C( '=?us-ascii?q?this_is_a_test?= bird', 'this is a test', 'this is a test', [], ' bird') + , - def test_get_encoded_word_internal_spaces(self): - self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_internal_spaces = C( '=?us-ascii?q?this is a test?= bird', 'this is a 
test', 'this is a test', [errors.InvalidHeaderDefect], ' bird') + , - def test_get_encoded_word_gets_first(self): - self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_gets_first = C( '=?us-ascii?q?first?= =?utf-8?q?second?=', 'first', 'first', [], ' =?utf-8?q?second?=') + , - def test_get_encoded_word_gets_first_even_if_no_space(self): - self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_gets_first_even_if_no_space = C( '=?us-ascii?q?first?==?utf-8?q?second?=', 'first', 'first', [errors.InvalidHeaderDefect], '=?utf-8?q?second?=') + , - def test_get_encoded_word_sets_extra_attributes(self): - ew = self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_sets_extra_attributes = C( '=?us-ascii*jive?q?first_second?=', 'first second', 'first second', [], - '') - self.assertEqual(ew.charset, 'us-ascii') - self.assertEqual(ew.lang, 'jive') + '', + lang='jive', + ), - def test_get_encoded_word_lang_default_is_blank(self): - ew = self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_lang_default_is_blank = C( '=?us-ascii?q?first_second?=', 'first second', 'first second', [], '') - self.assertEqual(ew.charset, 'us-ascii') - self.assertEqual(ew.lang, '') + , - def test_get_encoded_word_non_printable_defect(self): - self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_non_printable_defect = C( '=?us-ascii?q?first\x02second?=', 'first\x02second', 'first\x02second', [errors.NonPrintableDefect], '') + , - def test_get_encoded_word_leading_internal_space(self): - self._test_get_x(parser.get_encoded_word, + test_get_encoded_word_leading_internal_space = C( '=?us-ascii?q?=20foo?=', ' foo', ' foo', [], '') + , - def test_get_encoded_word_quopri_utf_escape_follows_cte(self): + test_get_encoded_word_quopri_utf_escape_follows_cte = C( # Issue 18044 - self._test_get_x(parser.get_encoded_word, '=?utf-8?q?=C3=89ric?=', 'Éric', 'Éric', [], - '') + '', + charset='utf-8', + ), + + ) + # get_unstructured From 
a0a5aaaef937e02781de4c682a4e71a59acef9a7 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 3 Jan 2026 14:06:23 -0500 Subject: [PATCH 026/152] Fix test names and whitespace in get_encoded_word tests. --- .../test_email/test__header_value_parser.py | 170 +++++++++--------- 1 file changed, 85 insertions(+), 85 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 7628832b40e280d..f2a4f0c47ae08b6 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -622,100 +622,100 @@ def test_get_encoded_word(self, s, *args, charset='us-ascii', lang='', **kw): params_test_get_encoded_word = old_api_only( - test_get_encoded_word_missing_start_raises = C( - 'abc', - exception=(errors.HeaderParseError, '.*'), + missing_start_raises = C( + 'abc', + exception=(errors.HeaderParseError, '.*'), ), - test_get_encoded_word_missing_end_raises = C( - '=?abc', - exception=(errors.HeaderParseError, '.*'), + missing_end_raises = C( + '=?abc', + exception=(errors.HeaderParseError, '.*'), ), - test_get_encoded_word_missing_middle_raises = C( - '=?abc?=', - exception=(errors.HeaderParseError, '.*'), + missing_middle_raises = C( + '=?abc?=', + exception=(errors.HeaderParseError, '.*'), ), - test_get_encoded_word_invalid_cte = C( - '=?utf-8?X?abc?=', - exception=(errors.HeaderParseError, '.*'), + invalid_cte = C( + '=?utf-8?X?abc?=', + exception=(errors.HeaderParseError, '.*'), ), - test_get_encoded_word_valid_ew = C( - '=?us-ascii?q?this_is_a_test?= bird', - 'this is a test', - 'this is a test', - [], - ' bird') - , - - test_get_encoded_word_internal_spaces = C( - '=?us-ascii?q?this is a test?= bird', - 'this is a test', - 'this is a test', - [errors.InvalidHeaderDefect], - ' bird') - , - - test_get_encoded_word_gets_first = C( - '=?us-ascii?q?first?= =?utf-8?q?second?=', - 'first', - 'first', - [], - ' =?utf-8?q?second?=') - , - - 
test_get_encoded_word_gets_first_even_if_no_space = C( - '=?us-ascii?q?first?==?utf-8?q?second?=', - 'first', - 'first', - [errors.InvalidHeaderDefect], - '=?utf-8?q?second?=') - , - - test_get_encoded_word_sets_extra_attributes = C( - '=?us-ascii*jive?q?first_second?=', - 'first second', - 'first second', - [], - '', - lang='jive', - ), - - test_get_encoded_word_lang_default_is_blank = C( - '=?us-ascii?q?first_second?=', - 'first second', - 'first second', - [], - '') - , - - test_get_encoded_word_non_printable_defect = C( - '=?us-ascii?q?first\x02second?=', - 'first\x02second', - 'first\x02second', - [errors.NonPrintableDefect], - '') - , - - test_get_encoded_word_leading_internal_space = C( - '=?us-ascii?q?=20foo?=', - ' foo', - ' foo', - [], - '') - , - - test_get_encoded_word_quopri_utf_escape_follows_cte = C( + valid_ew = C( + '=?us-ascii?q?this_is_a_test?= bird', + 'this is a test', + 'this is a test', + [], + ' bird', + ), + + internal_spaces = C( + '=?us-ascii?q?this is a test?= bird', + 'this is a test', + 'this is a test', + [errors.InvalidHeaderDefect], + ' bird', + ), + + gets_first = C( + '=?us-ascii?q?first?= =?utf-8?q?second?=', + 'first', + 'first', + [], + ' =?utf-8?q?second?=', + ), + + gets_first_even_if_no_space = C( + '=?us-ascii?q?first?==?utf-8?q?second?=', + 'first', + 'first', + [errors.InvalidHeaderDefect], + '=?utf-8?q?second?=', + ), + + sets_extra_attributes = C( + '=?us-ascii*jive?q?first_second?=', + 'first second', + 'first second', + [], + '', + lang='jive', + ), + + lang_default_is_blank = C( + '=?us-ascii?q?first_second?=', + 'first second', + 'first second', + [], + '', + ), + + non_printable_defect = C( + '=?us-ascii?q?first\x02second?=', + 'first\x02second', + 'first\x02second', + [errors.NonPrintableDefect], + '', + ), + + leading_internal_space = C( + '=?us-ascii?q?=20foo?=', + ' foo', + ' foo', + [], + '', + ), + + quopri_utf_escape_follows_cte = C( # Issue 18044 - '=?utf-8?q?=C3=89ric?=', - 'Éric', - 'Éric', - [], - '', - 
charset='utf-8', - ), + '=?utf-8?q?=C3=89ric?=', + 'Éric', + 'Éric', + [], + '', + charset='utf-8', + ), ) From 972955854784e7dc63d4b9a5103e4e734a37422c Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 27 Dec 2025 17:39:18 -0500 Subject: [PATCH 027/152] Convert get_encoded_word tests to keyword form. And add regexes for the exceptions and defects. I replaced one test: the 'lang default is blank' test is redundant given that the test harness now tests that. I replaced it with one that checks that a non-default charset does appear in the charset attribute. --- .../test_email/test__header_value_parser.py | 85 +++++++++---------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f2a4f0c47ae08b6..8f6a36924c5a5db 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -624,96 +624,87 @@ def test_get_encoded_word(self, s, *args, charset='us-ascii', lang='', **kw): missing_start_raises = C( 'abc', - exception=(errors.HeaderParseError, '.*'), + # "expected encoded word but found abc" + exception=(errors.HeaderParseError, r'abc'), ), missing_end_raises = C( '=?abc', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, r'=?abc'), ), missing_middle_raises = C( '=?abc?=', - exception=(errors.HeaderParseError, '.*'), + # "encoded word format invalid: '=?abc?='" + exception=( + errors.HeaderParseError, + rf'(?=.*invalid)(?=.*{re.escape("=?abc?=")})', + ), ), - invalid_cte = C( + invalid_cte_raises = C( '=?utf-8?X?abc?=', - exception=(errors.HeaderParseError, '.*'), + exception=( + errors.HeaderParseError, + rf'(?=.*invalid)(?=.*{re.escape("=?utf-8?X?abc?=")})', + ), ), valid_ew = C( '=?us-ascii?q?this_is_a_test?= bird', - 'this is a test', - 'this is a test', - [], - ' bird', + stringified='this is a test', + remainder=' bird', ), internal_spaces = C( 
'=?us-ascii?q?this is a test?= bird', - 'this is a test', - 'this is a test', - [errors.InvalidHeaderDefect], - ' bird', + stringified='this is a test', + # 'whitespace inside encoded word' + defects=[whitespace_inside_ew_defect], + remainder=' bird', ), - gets_first = C( + only_gets_first_ew = C( '=?us-ascii?q?first?= =?utf-8?q?second?=', - 'first', - 'first', - [], - ' =?utf-8?q?second?=', + stringified='first', + remainder=' =?utf-8?q?second?=', ), - gets_first_even_if_no_space = C( + only_gets_first_ew_even_if_no_space = C( '=?us-ascii?q?first?==?utf-8?q?second?=', - 'first', - 'first', - [errors.InvalidHeaderDefect], - '=?utf-8?q?second?=', + stringified='first', + # 'missing trailing whitespace after encoded-word' + defects=[missing_whitespace_after_ew_defect], + remainder='=?utf-8?q?second?=', ), - sets_extra_attributes = C( + lang_set = C( '=?us-ascii*jive?q?first_second?=', - 'first second', - 'first second', - [], - '', + stringified='first second', lang='jive', ), - lang_default_is_blank = C( - '=?us-ascii?q?first_second?=', - 'first second', - 'first second', - [], - '', + utf8_charset = C( + '=?utf-8?q?first_second?=', + stringified='first second', + charset='utf-8', ), non_printable_defect = C( '=?us-ascii?q?first\x02second?=', - 'first\x02second', - 'first\x02second', - [errors.NonPrintableDefect], - '', + stringified='first\x02second', + defects=[(nonprintable_defect, '\x02')], ), leading_internal_space = C( '=?us-ascii?q?=20foo?=', - ' foo', - ' foo', - [], - '', + stringified=' foo', ), - quopri_utf_escape_follows_cte = C( # Issue 18044 + quopri_utf_escape_follows_cte = C( '=?utf-8?q?=C3=89ric?=', - 'Éric', - 'Éric', - [], - '', + stringified='Éric', charset='utf-8', ), From 22f292a74ad8c3a9737ad346179b6e8b4ee4bdda Mon Sep 17 00:00:00 2001 From: R David Murray Date: Tue, 19 May 2026 22:15:07 -0400 Subject: [PATCH 028/152] Begin simplifying get_encoded_word 'raise' tests. 
--- .../test_email/test__header_value_parser.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 8f6a36924c5a5db..1e17ecd50b6b6e0 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -620,36 +620,41 @@ def test_get_encoded_word(self, s, *args, charset='us-ascii', lang='', **kw): self.assertEqual(res.charset, charset) self.assertEqual(res.lang, lang) - params_test_get_encoded_word = old_api_only( + # This params_map will handle either single strings or C objects. + @params_map + def expect_get_encoded_word_raise(v, *args, **kw): + newspec = C( + v, + *args, + # "expected encoded word but found '...'" + exception=(errors.HeaderParseError, re.escape(v)), + test_start=False, + **kw, + ) + yield 'oldapi', newspec + + params_test_get_encoded_word__invalid_input = expect_get_encoded_word_raise( missing_start_raises = C( 'abc', - # "expected encoded word but found abc" - exception=(errors.HeaderParseError, r'abc'), ), missing_end_raises = C( '=?abc', - exception=(errors.HeaderParseError, r'=?abc'), ), missing_middle_raises = C( '=?abc?=', - # "encoded word format invalid: '=?abc?='" - exception=( - errors.HeaderParseError, - rf'(?=.*invalid)(?=.*{re.escape("=?abc?=")})', - ), ), invalid_cte_raises = C( '=?utf-8?X?abc?=', - exception=( - errors.HeaderParseError, - rf'(?=.*invalid)(?=.*{re.escape("=?utf-8?X?abc?=")})', - ), ), + ) + + params_test_get_encoded_word = old_api_only( + valid_ew = C( '=?us-ascii?q?this_is_a_test?= bird', stringified='this is a test', From 17cc660485cb7e433cf10f7bd13e56bf567f8c5f Mon Sep 17 00:00:00 2001 From: R David Murray Date: Thu, 7 May 2026 15:39:39 -0400 Subject: [PATCH 029/152] Simplify get_encoded_word raise tests further. The params_map allows us to skip specifying 'C', and all these tests will fit on one line. 
I think it reads cleaner with the strings lined up. --- .../test_email/test__header_value_parser.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1e17ecd50b6b6e0..ea588d543ce1b83 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -634,23 +634,10 @@ def expect_get_encoded_word_raise(v, *args, **kw): yield 'oldapi', newspec params_test_get_encoded_word__invalid_input = expect_get_encoded_word_raise( - - missing_start_raises = C( - 'abc', - ), - - missing_end_raises = C( - '=?abc', - ), - - missing_middle_raises = C( - '=?abc?=', - ), - - invalid_cte_raises = C( - '=?utf-8?X?abc?=', - ), - + missing_start_raises = 'abc', + missing_end_raises = '=?abc', + missing_middle_raises = '=?abc?=', + invalid_cte_raises = '=?utf-8?X?abc?=', ) params_test_get_encoded_word = old_api_only( From ad0423b67c5268a26094deead88c49cfdf298111 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 29 Jan 2026 13:16:01 -0500 Subject: [PATCH 030/152] Update get_encoded_word raise tests before adding more. Switch to 'content' instead of 'abc' for clarity as to which part it is, and update the names to harmonize with the ones I'm about to add. 
--- Lib/test/test_email/test__header_value_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ea588d543ce1b83..d34400afa739a17 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -634,10 +634,10 @@ def expect_get_encoded_word_raise(v, *args, **kw): yield 'oldapi', newspec params_test_get_encoded_word__invalid_input = expect_get_encoded_word_raise( - missing_start_raises = 'abc', - missing_end_raises = '=?abc', - missing_middle_raises = '=?abc?=', - invalid_cte_raises = '=?utf-8?X?abc?=', + no_chrome = 'content', + start_and_charset_only = '=?UTF-8', + missing_both_middle = '=?content?=', + unknown_cte = '=?UTF-8?X?content?=', ) params_test_get_encoded_word = old_api_only( From a0ddf9b85174b1fc98b829db1e024882c6ee88cf Mon Sep 17 00:00:00 2001 From: R David Murray Date: Thu, 14 May 2026 19:15:04 -0400 Subject: [PATCH 031/152] Update fix for bpo-27397/gh-71584. BUGFIX: The fix for bpo-27397/gh-71584 introduced a situation where an encoded word that could not be decoded had its chrome stripped instead. While the general nature of the fix for the bug (exceptions when the length of a base64 string was impossible) was correct for the other contexts in which the exceptions occurred, it was incorrect for encoded words. When an encoded word cannot be decoded it should be left as is in the output. This is now what happens for this particular decoding error. 
--- Lib/email/_header_value_parser.py | 4 ++++ Lib/test/test_email/test__header_value_parser.py | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 9481f0cab93842f..ad01c7520073d82 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1104,6 +1104,10 @@ def get_encoded_word(value, terminal_type='vtext'): "encoded word format invalid: '{}'".format(ew.cte)) ew.charset = charset ew.lang = lang + if any(isinstance(x, errors.InvalidBase64LengthDefect) for x in defects): + raise _InvalidEwError( + "encoded word could not be decoded: '{}'".format(ew.cte), + ) ew.defects.extend(defects) while text: if text[0] in WSP: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index d34400afa739a17..5a72f72c5ca0519 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -638,6 +638,7 @@ def expect_get_encoded_word_raise(v, *args, **kw): start_and_charset_only = '=?UTF-8', missing_both_middle = '=?content?=', unknown_cte = '=?UTF-8?X?content?=', + invalid_base64_length = '=?utf-8?b?abcde?=', ) params_test_get_encoded_word = old_api_only( @@ -885,12 +886,12 @@ def test_get_unstructured_invalid_base64_character_and_bad_padding(self): '') def test_get_unstructured_invalid_base64_length(self): - # bpo-27397: Return the encoded string since there's no way to decode. + # bpo-27397/gh-71584: there's no way to decode this. self._test_get_x(self._get_unst, '=?utf-8?b?abcde?=', - 'abcde', - 'abcde', - [errors.InvalidBase64LengthDefect], + '=?utf-8?b?abcde?=', + '=?utf-8?b?abcde?=', + [], '') def test_get_unstructured_no_whitespace_between_ews(self): From 5f94a6551905f648659fd2c08c2d936aeee45d5d Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 13 Jun 2026 13:56:58 -0400 Subject: [PATCH 032/152] Add more get_encoded_word tests. 
BUGFIX: If there is (rfc invalid) whitespace around the charset or language in an encoded word, the charset and lang attribute values no longer include the whitespace. There are other bugs we'll fix in the refactor, as indicated by the test comments. Some of the new tests are copied from the get_unstructured tests; we'll be doing some DRY refactoring there in a bit. --- Lib/email/_header_value_parser.py | 4 +- .../test_email/test__header_value_parser.py | 200 +++++++++++++++++- 2 files changed, 195 insertions(+), 9 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ad01c7520073d82..68fcf45d650e3fa 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1102,12 +1102,12 @@ def get_encoded_word(value, terminal_type='vtext'): except (ValueError, KeyError): raise _InvalidEwError( "encoded word format invalid: '{}'".format(ew.cte)) - ew.charset = charset - ew.lang = lang if any(isinstance(x, errors.InvalidBase64LengthDefect) for x in defects): raise _InvalidEwError( "encoded word could not be decoded: '{}'".format(ew.cte), ) + ew.charset = charset.strip() + ew.lang = lang.strip() ew.defects.extend(defects) while text: if text[0] in WSP: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 5a72f72c5ca0519..f2d5dbf8d5d812d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -29,6 +29,18 @@ # https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 RFC_NONPRINTABLES = bytes([*range(0, 33), 127]).decode('ascii') +# https://datatracker.ietf.org/doc/html/rfc2978#section-2.3 +# Except that like +# https://datatracker.ietf.org/doc/html/rfc8187#section-3.2.1 +# we omit the "'" character as otherwise it is difficult to correctly parse +# extended parameters values absent a complete registry.
In any case charset +# names generally do not include special characters in practice. +RFC_CHARSET_CHARS = ''.join(( + string.ascii_letters, + string.digits, + "!#$%&+-^_`{}~", + )) + ALL_ASCII = bytes(range(0, 128)).decode('ascii') @@ -613,12 +625,23 @@ def test_get_fws(self, s, *args, **kw): # get_encoded_word @params - def test_get_encoded_word(self, s, *args, charset='us-ascii', lang='', **kw): - res = self._test_parse(parser.get_encoded_word, C(s), *args, **kw) + def test_get_encoded_word( + self, + s, + *args, + charset='us-ascii', + lang='', + terminal_type=None, + **kw, + ): + callspec = C(s) if terminal_type is None else C(s, terminal_type) + res = self._test_parse(parser.get_encoded_word, callspec, *args, **kw) if 'exception' in kw: return self.assertEqual(res.charset, charset) self.assertEqual(res.lang, lang) + terminal_type = 'vtext' if terminal_type is None else terminal_type + self.verify_terminal_types(res, terminal_type, 'fws') # This params_map will handle either single strings or C objects. 
@params_map @@ -634,11 +657,38 @@ def expect_get_encoded_word_raise(v, *args, **kw): yield 'oldapi', newspec params_test_get_encoded_word__invalid_input = expect_get_encoded_word_raise( + null_string = '', no_chrome = 'content', + eq_only = '=content', + start_chrome_only = '=?', start_and_charset_only = '=?UTF-8', + start_charset_qm_only = '=?UTF-8?', + start_charset_qm_cte_only = '=?UTF-8?q', + start_charset_qm_cte_qm_only = '=?UTF-8?q?', + start_charset_qm_cte_qm_content_only = '=?UTF-8?q?content', + start_charset_qm_cte_qm_content_qm_only = '=?UTF-8?q?content?', + end_eq_only = 'content=', + end_chrome_only = '?=', + end_and_content_only = 'content?=', + end_content_eq_only = '?content?=', + end_content_eq_cte_only = 'q?content?=', + end_content_eq_cte_eq_only = '?q?content?=', + end_content_eq_cte_eq_charset_only = 'UTF-8?q?content?=', + end_content_eq_cte_eq_charset_eq_only = '?UTF-8?q?content?=', missing_both_middle = '=?content?=', + missing_one_middle = '=?q?content?=', + empty_cte = '=UTF-8??content?=', + empty_charset_and_cte = '=???content?=', + empty_everything = '=????=', unknown_cte = '=?UTF-8?X?content?=', invalid_base64_length = '=?utf-8?b?abcde?=', + multicharacter_cte = '=?UTF-8?qq?content?=', + too_many_qm = '=?UTF-8?q?q?content?=', + empty_lang = '=?UTF-8*??q?content?=', + lang_with_empty_charset = '=?*foo??q?content?=', + **for_each_character(ALL_ASCII)( + character_before_valid_ew = C('{char}=?us-ascii?q?test?='), + ), ) params_test_get_encoded_word = old_api_only( @@ -649,6 +699,30 @@ def expect_get_encoded_word_raise(v, *args, **kw): remainder=' bird', ), + # XXX XXX the skip for the RFC_WSP will go away after refactor. It's + # here because it would be a pain to handle the lack of the defect, + # which will go away in the refactor. 
+ **for_each_character(ALL_ASCII, skip=RFC_WSP)( + ew_followed_by = C( + '=?us-ascii?q?foo?={char}', + stringified='foo', + remainder='{char}', + defects=[missing_whitespace_after_ew_defect], + ), + ), + + # XXX some of these characters should result in defects depending on + # the context from which get_encoded_word is called (ex: ()s are + # illegal in comment encoded words), but but at least at the moment + # that it isn't worth the effort to implement. + # XXX XXX the skip for ? is a bug which will be fixed in the refactor + **for_each_character(RFC_PRINTABLES, skip='_?')( + q_content_may_contain = C( + '=?us-ascii?q?foo_{char}_bar_{char}?=', + stringified='foo {char} bar {char}', + ) + ), + internal_spaces = C( '=?us-ascii?q?this is a test?= bird', stringified='this is a test', @@ -663,6 +737,7 @@ def expect_get_encoded_word_raise(v, *args, **kw): remainder=' =?utf-8?q?second?=', ), + # XXX XXX This defect will also go away (gets detected higher up) only_gets_first_ew_even_if_no_space = C( '=?us-ascii?q?first?==?utf-8?q?second?=', stringified='first', @@ -683,17 +758,55 @@ def expect_get_encoded_word_raise(v, *args, **kw): charset='utf-8', ), - non_printable_defect = C( - '=?us-ascii?q?first\x02second?=', - stringified='first\x02second', - defects=[(nonprintable_defect, '\x02')], + **for_each_character( + RFC_NONPRINTABLES, + # XXX XXX skip things split considers whitespace. This is buggy. + # US RS GS FS + skip=RFC_WSP + '\r\n\v\f\x1f\x1e\x1d\x1c', + )( + non_printable_defect = C( + '=?us-ascii?q?first{char}second?=', + stringified='first{char}second', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + # Note that other characters may work as well, but these *must* work. 
+ **for_each_character(RFC_CHARSET_CHARS)( + char_valid_in_charset_name = C( + '=?a_bad_{char}set_name?q?foo?=', + stringified='foo', + defects=[(charset_defect('a_bad_{echar}set_name'))], + charset='a_bad_{char}set_name', + ), ), - leading_internal_space = C( + leading_internal_encoded_space = C( '=?us-ascii?q?=20foo?=', stringified=' foo', ), + leading_internal_unencoded_space = C( + '=?us-ascii?q? foo?=', + stringified=' foo', + defects=[whitespace_inside_ew_defect], + ), + + trailing_internal_encoded_space = C( + '=?us-ascii?q?foo=20_?= bird', + stringified='foo ', + value='foo ', + remainder=' bird', + ), + + trailing_internal_unencoded_space = C( + '=?us-ascii?q?foo _ ?= bird', + stringified='foo ', + value='foo ', + defects=[whitespace_inside_ew_defect], + remainder=' bird', + ), + # Issue 18044 quopri_utf_escape_follows_cte = C( '=?utf-8?q?=C3=89ric?=', @@ -701,6 +814,79 @@ def expect_get_encoded_word_raise(v, *args, **kw): charset='utf-8', ), + unknown_charset_leads_to_undecodable_bytes_with_non_ascii = C( + '=?invalid?q?=C3=89ric?=', + stringified='\udcc3\udc89ric', + charset='invalid', + defects=[charset_defect('invalid'), undecodable_bytes_defect], + ), + + empty_charset = C( + '=??q?content?=', + stringified='content', + charset='', + defects=[charset_defect('')], + ), + + missing_base64_padding = C( + '=?us-ascii?b?dmk?=', + stringified='vi', + defects=[invalid_base64_padding_defect], + ), + + + invalid_base64_character = C( + '=?us-ascii?b?dm\x01k===?=', + stringified='vi', + defects=[invalid_base64_characters_defect], + ), + + invalid_base64_character_and_bad_padding = C( + '=?us-ascii?b?dm\x01k?=', + stringified='vi', + defects=[ + invalid_base64_padding_defect, + invalid_base64_characters_defect, + ], + ), + + ws_only_charset_leads_to_undecodable_bytes_with_non_ascii = C( + '=? 
* ?q?=C3=89ric?=', + stringified='\udcc3\udc89ric', + charset='', + defects=[ + charset_defect(' '), + undecodable_bytes_defect, + whitespace_inside_ew_defect, + ], + ), + + eq_is_only_special_with_two_digits_after_it = C( + '=?UTF-8?q?=C3=89ric_=_?=', + stringified='Éric = ', + charset='UTF-8', + ), + + ws_around_charset_and_lang = C( + '=? us-ascii\t* jive\t ?q?test?= bird', + stringified='test', + lang='jive', + defects=[whitespace_inside_ew_defect], + remainder=' bird', + ), + + set_terminal_type_on_single_word_content = C( + '=?us-ascii?q?text?=', + stringified='text', + terminal_type='test', + ), + + set_terminal_type_on_multiple_word_content = C( + '=?us-ascii?q?text_and_more_text?=', + stringified='text and more text', + terminal_type='test', + ), + ) From 3d55369929a65a286fae8015fcc46a84143cb2c4 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 13 Jun 2026 13:58:33 -0400 Subject: [PATCH 033/152] Begin refactoring get_unstructured tests. --- .../test_email/test__header_value_parser.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f2d5dbf8d5d812d..ddc91b63a59eb1e 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -892,13 +892,90 @@ def expect_get_encoded_word_raise(v, *args, **kw): # get_unstructured + @params + def test_get_unstructured(self, s, *args, **kw): + result = self._test_parse( + parser.get_unstructured, + C(s), + *args, + test_start=False, + warnings=..., # XXX XXX ignore warnings until after refactor. + **kw, + ) + self.assertIsInstance(result, parser.UnstructuredTokenList) + self.verify_terminal_types(result, 'utext', 'fws') + + # get_unstructured should correctly decode anything get_encoded_word does, + # so it should correctly handle most get_encoded_word parameters. 
+ @params_map(with_namelist=True) + def adapt_get_encoded_word_tests_for_get_unstructured(nl, *args, **kw): + kw.pop('test_start') + kw.pop('charset', None) + kw.pop('terminal_type', None) + kw.pop('lang', None) + # get_unstructured parses all of its input, so it will also parse and + # return anything get_encoded_word treats as a remainder. + remainder = kw.pop('remainder', '') + if '=?' in remainder or 'ew_followed_by' in nl: + # The remainder includes something get_unstructured would decode, + # or might contain something it would treat as a defect. Either + # way, parse_unstructured isn't expected to handle those parameters. + return + if 'stringified' in kw: + stringified = kw['stringified'] + kw['stringified'] = stringified + remainder + rstripped = remainder.lstrip(RFC_WSP) + if remainder != rstripped: + kw['value'] = kw.get('value', stringified) + ' ' + rstripped + # Drop the 'warning=...' added by only_old_api; we're doing it ourselves + # in the test method. + kw.pop('warnings') + yield 'from_test_get_encoded_word', C(*args, **kw) + + @params_map(with_namelist=True) + def adapt_get_encoded_word_invalid_input_for_get_unstructured(nl, s, **kw): + # Get unstructured should return the inputs unaltered, + # except for the ones where the ew itself is valid. + if 'character_before_valid_ew' in nl: + return + yield 'from_test_get_encoded_word_invalid_input', C(s) + + @params_map + def add_unstructured_prefix_and_suffix(s, *args, **kw): + # Make sure the reused parameters are correctly interpreted when + # intermixed with other text by adding some text. + pad = lambda s: f'pre fix {s} suf fix' + if not s: + # null value is a special case, and we already have a test for it. 
+ return + s = pad(s) + kw = {n: (pad(v) if n in ('stringified', 'value') else v) + for n, v in kw.items() + } + yield '', C(s, *args, **kw) + + params_test_get_unstructured = Params( + + add_unstructured_prefix_and_suffix( + adapt_get_encoded_word_tests_for_get_unstructured( + params_test_get_encoded_word, + ), + adapt_get_encoded_word_invalid_input_for_get_unstructured( + params_test_get_encoded_word__invalid_input, + ), + ), + + ) + def _get_unst(self, value): token = parser.get_unstructured(value) return token, '' + # XXX XXX TEMP test1 def test_get_unstructured_null(self): self._test_get_x(self._get_unst, '', '', '', [], '') + # XXX XXX TEMP test2 def test_get_unstructured_one_word(self): self._test_get_x(self._get_unst, 'foo', 'foo', 'foo', [], '') From e69c65f5353a2bae467598149a22bec5af75dbbd Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 5 Jan 2026 13:09:30 -0500 Subject: [PATCH 034/152] Rough conversion of get_unstructured tests. --- .../test_email/test__header_value_parser.py | 145 +++++++++--------- 1 file changed, 75 insertions(+), 70 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ddc91b63a59eb1e..55c86c2895ddb61 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -965,248 +965,253 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): ), ), - ) - - def _get_unst(self, value): - token = parser.get_unstructured(value) - return token, '' - # XXX XXX TEMP test1 - def test_get_unstructured_null(self): - self._test_get_x(self._get_unst, '', '', '', [], '') + test_get_unstructured_null = C( + '', '', '', [], ''), # XXX XXX TEMP test2 - def test_get_unstructured_one_word(self): - self._test_get_x(self._get_unst, 'foo', 'foo', 'foo', [], '') + test_get_unstructured_one_word = C( + 'foo', 'foo', 'foo', [], ''), - def test_get_unstructured_normal_phrase(self): - self._test_get_x(self._get_unst, 
'foo bar bird', + test_get_unstructured_normal_phrase = C( + 'foo bar bird', 'foo bar bird', 'foo bar bird', [], '') + , - def test_get_unstructured_normal_phrase_with_whitespace(self): - self._test_get_x(self._get_unst, 'foo \t bar bird', + test_get_unstructured_normal_phrase_with_whitespace = C( + 'foo \t bar bird', 'foo \t bar bird', 'foo bar bird', [], '') + , - def test_get_unstructured_leading_whitespace(self): - self._test_get_x(self._get_unst, ' foo bar', + test_get_unstructured_leading_whitespace = C( + ' foo bar', ' foo bar', ' foo bar', [], '') + , - def test_get_unstructured_trailing_whitespace(self): - self._test_get_x(self._get_unst, 'foo bar ', + test_get_unstructured_trailing_whitespace = C( + 'foo bar ', 'foo bar ', 'foo bar ', [], '') + , - def test_get_unstructured_leading_and_trailing_whitespace(self): - self._test_get_x(self._get_unst, ' foo bar ', + test_get_unstructured_leading_and_trailing_whitespace = C( + ' foo bar ', ' foo bar ', ' foo bar ', [], '') + , - def test_get_unstructured_one_valid_ew_no_ws(self): - self._test_get_x(self._get_unst, '=?us-ascii?q?bar?=', + test_get_unstructured_one_valid_ew_no_ws = C( + '=?us-ascii?q?bar?=', 'bar', 'bar', [], '') + , - def test_get_unstructured_one_ew_trailing_ws(self): - self._test_get_x(self._get_unst, '=?us-ascii?q?bar?= ', + test_get_unstructured_one_ew_trailing_ws = C( + '=?us-ascii?q?bar?= ', 'bar ', 'bar ', [], '') + , - def test_get_unstructured_one_valid_ew_trailing_text(self): - self._test_get_x(self._get_unst, '=?us-ascii?q?bar?= bird', + test_get_unstructured_one_valid_ew_trailing_text = C( + '=?us-ascii?q?bar?= bird', 'bar bird', 'bar bird', [], '') + , - def test_get_unstructured_phrase_with_ew_in_middle_of_text(self): - self._test_get_x(self._get_unst, 'foo =?us-ascii?q?bar?= bird', + test_get_unstructured_phrase_with_ew_in_middle_of_text = C( + 'foo =?us-ascii?q?bar?= bird', 'foo bar bird', 'foo bar bird', [], '') + , - def test_get_unstructured_phrase_with_two_ew(self): - 
self._test_get_x(self._get_unst, + test_get_unstructured_phrase_with_two_ew = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?=', 'foo barbird', 'foo barbird', [], '') + , - def test_get_unstructured_phrase_with_two_ew_trailing_ws(self): - self._test_get_x(self._get_unst, + test_get_unstructured_phrase_with_two_ew_trailing_ws = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?= ', 'foo barbird ', 'foo barbird ', [], '') + , - def test_get_unstructured_phrase_with_ew_with_leading_ws(self): - self._test_get_x(self._get_unst, + test_get_unstructured_phrase_with_ew_with_leading_ws = C( ' =?us-ascii?q?bar?=', ' bar', ' bar', [], '') + , - def test_get_unstructured_phrase_with_two_ew_extra_ws(self): - self._test_get_x(self._get_unst, + test_get_unstructured_phrase_with_two_ew_extra_ws = C( 'foo =?us-ascii?q?bar?= \t =?us-ascii?q?bird?=', 'foo barbird', 'foo barbird', [], '') + , - def test_get_unstructured_two_ew_extra_ws_trailing_text(self): - self._test_get_x(self._get_unst, + test_get_unstructured_two_ew_extra_ws_trailing_text = C( '=?us-ascii?q?test?= =?us-ascii?q?foo?= val', 'testfoo val', 'testfoo val', [], '') + , - def test_get_unstructured_ew_with_internal_ws(self): - self._test_get_x(self._get_unst, + test_get_unstructured_ew_with_internal_ws = C( '=?iso-8859-1?q?hello=20world?=', 'hello world', 'hello world', [], '') + , - def test_get_unstructured_ew_with_internal_leading_ws(self): - self._test_get_x(self._get_unst, + test_get_unstructured_ew_with_internal_leading_ws = C( ' =?us-ascii?q?=20test?= =?us-ascii?q?=20foo?= val', ' test foo val', ' test foo val', [], '') + , - def test_get_unstructured_invalid_ew(self): - self._test_get_x(self._get_unst, + test_get_unstructured_invalid_ew = C( '=?test val', '=?test val', '=?test val', [], '') + , - def test_get_unstructured_undecodable_bytes(self): - self._test_get_x(self._get_unst, + test_get_unstructured_undecodable_bytes = C( b'test \xACfoo val'.decode('ascii', 'surrogateescape'), 'test \uDCACfoo val', 'test \uDCACfoo 
val', [errors.UndecodableBytesDefect], '') + , - def test_get_unstructured_undecodable_bytes_in_EW(self): - self._test_get_x(self._get_unst, + test_get_unstructured_undecodable_bytes_in_EW = C( (b'=?us-ascii?q?=20test?= =?us-ascii?q?=20\xACfoo?=' b' val').decode('ascii', 'surrogateescape'), ' test \uDCACfoo val', ' test \uDCACfoo val', [errors.UndecodableBytesDefect]*2, '') + , - def test_get_unstructured_missing_base64_padding(self): - self._test_get_x(self._get_unst, + test_get_unstructured_missing_base64_padding = C( '=?utf-8?b?dmk?=', 'vi', 'vi', [errors.InvalidBase64PaddingDefect], '') + , - def test_get_unstructured_invalid_base64_character(self): - self._test_get_x(self._get_unst, + test_get_unstructured_invalid_base64_character = C( '=?utf-8?b?dm\x01k===?=', 'vi', 'vi', [errors.InvalidBase64CharactersDefect], '') + , - def test_get_unstructured_invalid_base64_character_and_bad_padding(self): - self._test_get_x(self._get_unst, + test_get_unstructured_invalid_base64_character_and_bad_padding = C( '=?utf-8?b?dm\x01k?=', 'vi', 'vi', [errors.InvalidBase64CharactersDefect, errors.InvalidBase64PaddingDefect], '') + , - def test_get_unstructured_invalid_base64_length(self): + test_get_unstructured_invalid_base64_length = C( # bpo-27397/gh-71584: there's no way to decode this. 
- self._test_get_x(self._get_unst, '=?utf-8?b?abcde?=', '=?utf-8?b?abcde?=', '=?utf-8?b?abcde?=', [], '') + , - def test_get_unstructured_no_whitespace_between_ews(self): - self._test_get_x(self._get_unst, + test_get_unstructured_no_whitespace_between_ews = C( '=?utf-8?q?foo?==?utf-8?q?bar?=', 'foobar', 'foobar', [errors.InvalidHeaderDefect, errors.InvalidHeaderDefect], '') + , - def test_get_unstructured_ew_without_leading_whitespace(self): - self._test_get_x( - self._get_unst, + test_get_unstructured_ew_without_leading_whitespace = C( 'nowhitespace=?utf-8?q?somevalue?=', 'nowhitespacesomevalue', 'nowhitespacesomevalue', [errors.InvalidHeaderDefect], '') + , - def test_get_unstructured_ew_without_trailing_whitespace(self): - self._test_get_x( - self._get_unst, + test_get_unstructured_ew_without_trailing_whitespace = C( '=?utf-8?q?somevalue?=nowhitespace', 'somevaluenowhitespace', 'somevaluenowhitespace', [errors.InvalidHeaderDefect], '') + , - def test_get_unstructured_without_trailing_whitespace_hang_case(self): - self._test_get_x(self._get_unst, + # bpo-37764 + test_get_unstructured_without_trailing_whitespace_hang_case = C( '=?utf-8?q?somevalue?=aa', 'somevalueaa', 'somevalueaa', [errors.InvalidHeaderDefect], '') + , - def test_get_unstructured_invalid_ew2(self): - self._test_get_x(self._get_unst, + test_get_unstructured_invalid_ew2 = C( '=?utf-8?q?=somevalue?=', '=?utf-8?q?=somevalue?=', '=?utf-8?q?=somevalue?=', [], '') + , - def test_get_unstructured_invalid_ew_cte(self): - self._test_get_x(self._get_unst, + test_get_unstructured_invalid_ew_cte = C( '=?utf-8?X?=somevalue?=', '=?utf-8?X?=somevalue?=', '=?utf-8?X?=somevalue?=', [], '') + , + + ) + # get_qp_ctext From 86f9dca6c5d235b478517fa0415bfa2b24893ab0 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 5 Jan 2026 13:14:39 -0500 Subject: [PATCH 035/152] Fix whitespace/test names in get_unstructured tests. 
--- .../test_email/test__header_value_parser.py | 302 +++++++++--------- 1 file changed, 157 insertions(+), 145 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 55c86c2895ddb61..acd4bdbb332eb7c 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -965,250 +965,262 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): ), ), - # XXX XXX TEMP test1 - test_get_unstructured_null = C( - '', '', '', [], ''), - - # XXX XXX TEMP test2 - test_get_unstructured_one_word = C( - 'foo', 'foo', 'foo', [], ''), - - test_get_unstructured_normal_phrase = C( - 'foo bar bird', - 'foo bar bird', - 'foo bar bird', - [], - '') - , - - test_get_unstructured_normal_phrase_with_whitespace = C( - 'foo \t bar bird', - 'foo \t bar bird', - 'foo bar bird', - [], - '') - , - - test_get_unstructured_leading_whitespace = C( - ' foo bar', - ' foo bar', - ' foo bar', - [], - '') - , - - test_get_unstructured_trailing_whitespace = C( - 'foo bar ', - 'foo bar ', - 'foo bar ', - [], - '') - , - - test_get_unstructured_leading_and_trailing_whitespace = C( - ' foo bar ', - ' foo bar ', - ' foo bar ', - [], - '') - , - - test_get_unstructured_one_valid_ew_no_ws = C( - '=?us-ascii?q?bar?=', - 'bar', - 'bar', - [], - '') - , - - test_get_unstructured_one_ew_trailing_ws = C( - '=?us-ascii?q?bar?= ', - 'bar ', - 'bar ', - [], - '') - , - - test_get_unstructured_one_valid_ew_trailing_text = C( - '=?us-ascii?q?bar?= bird', - 'bar bird', - 'bar bird', - [], - '') - , - - test_get_unstructured_phrase_with_ew_in_middle_of_text = C( - 'foo =?us-ascii?q?bar?= bird', - 'foo bar bird', - 'foo bar bird', - [], - '') - , - - test_get_unstructured_phrase_with_two_ew = C( + null = C( + '', + '', + '', + [], + '', + ), + + one_word = C( + 'foo', + 'foo', + 'foo', + [], + '', + ), + + normal_phrase = C( + 'foo bar bird', + 'foo bar bird', + 'foo bar bird', + [], + '', + ), 
+ + normal_phrase_with_whitespace = C( + 'foo \t bar bird', + 'foo \t bar bird', + 'foo bar bird', + [], + '', + ), + + leading_whitespace = C( + ' foo bar', + ' foo bar', + ' foo bar', + [], + '', + ), + + trailing_whitespace = C( + 'foo bar ', + 'foo bar ', + 'foo bar ', + [], + '', + ), + + leading_and_trailing_whitespace = C( + ' foo bar ', + ' foo bar ', + ' foo bar ', + [], + '', + ), + + one_valid_ew_no_ws = C( + '=?us-ascii?q?bar?=', + 'bar', + 'bar', + [], + '', + ), + + one_ew_trailing_ws = C( + '=?us-ascii?q?bar?= ', + 'bar ', + 'bar ', + [], + '', + ), + + one_valid_ew_trailing_text = C( + '=?us-ascii?q?bar?= bird', + 'bar bird', + 'bar bird', + [], + '', + ), + + phrase_with_ew_in_middle_of_text = C( + 'foo =?us-ascii?q?bar?= bird', + 'foo bar bird', + 'foo bar bird', + [], + '', + ), + + phrase_with_two_ew = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?=', 'foo barbird', 'foo barbird', [], - '') - , + '', + ), - test_get_unstructured_phrase_with_two_ew_trailing_ws = C( + phrase_with_two_ew_trailing_ws = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?= ', 'foo barbird ', 'foo barbird ', [], - '') - , + '', + ), - test_get_unstructured_phrase_with_ew_with_leading_ws = C( + phrase_with_ew_with_leading_ws = C( ' =?us-ascii?q?bar?=', ' bar', ' bar', [], - '') - , + '', + ), - test_get_unstructured_phrase_with_two_ew_extra_ws = C( + phrase_with_two_ew_extra_ws = C( 'foo =?us-ascii?q?bar?= \t =?us-ascii?q?bird?=', 'foo barbird', 'foo barbird', [], - '') - , + '', + ), - test_get_unstructured_two_ew_extra_ws_trailing_text = C( + two_ew_extra_ws_trailing_text = C( '=?us-ascii?q?test?= =?us-ascii?q?foo?= val', 'testfoo val', 'testfoo val', [], - '') - , + '', + ), - test_get_unstructured_ew_with_internal_ws = C( + ew_with_internal_ws = C( '=?iso-8859-1?q?hello=20world?=', 'hello world', 'hello world', [], - '') - , + '', + ), - test_get_unstructured_ew_with_internal_leading_ws = C( + ew_with_internal_leading_ws = C( ' =?us-ascii?q?=20test?= =?us-ascii?q?=20foo?= 
val', ' test foo val', ' test foo val', [], - '') - , + '', + ), - test_get_unstructured_invalid_ew = C( + invalid_ew = C( '=?test val', '=?test val', '=?test val', [], - '') - , + '', + ), - test_get_unstructured_undecodable_bytes = C( + undecodable_bytes = C( b'test \xACfoo val'.decode('ascii', 'surrogateescape'), 'test \uDCACfoo val', 'test \uDCACfoo val', [errors.UndecodableBytesDefect], - '') - , + '', + ), - test_get_unstructured_undecodable_bytes_in_EW = C( + undecodable_bytes_in_EW = C( (b'=?us-ascii?q?=20test?= =?us-ascii?q?=20\xACfoo?=' b' val').decode('ascii', 'surrogateescape'), ' test \uDCACfoo val', ' test \uDCACfoo val', [errors.UndecodableBytesDefect]*2, - '') - , + '', + ), - test_get_unstructured_missing_base64_padding = C( + missing_base64_padding = C( '=?utf-8?b?dmk?=', 'vi', 'vi', [errors.InvalidBase64PaddingDefect], - '') - , + '', + ), - test_get_unstructured_invalid_base64_character = C( + invalid_base64_character = C( '=?utf-8?b?dm\x01k===?=', 'vi', 'vi', [errors.InvalidBase64CharactersDefect], - '') - , + '', + ), - test_get_unstructured_invalid_base64_character_and_bad_padding = C( + invalid_base64_character_and_bad_padding = C( '=?utf-8?b?dm\x01k?=', 'vi', 'vi', - [errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect], - '') - , + [ + errors.InvalidBase64CharactersDefect, + errors.InvalidBase64PaddingDefect, + ], + '', + ), - test_get_unstructured_invalid_base64_length = C( # bpo-27397/gh-71584: there's no way to decode this. 
+ invalid_base64_length = C( '=?utf-8?b?abcde?=', '=?utf-8?b?abcde?=', '=?utf-8?b?abcde?=', [], - '') - , + '' + ), - test_get_unstructured_no_whitespace_between_ews = C( + no_whitespace_between_ews = C( '=?utf-8?q?foo?==?utf-8?q?bar?=', 'foobar', 'foobar', - [errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect], - '') - , + [ + errors.InvalidHeaderDefect, + errors.InvalidHeaderDefect, + ], + '', + ), - test_get_unstructured_ew_without_leading_whitespace = C( + ew_without_leading_whitespace = C( 'nowhitespace=?utf-8?q?somevalue?=', 'nowhitespacesomevalue', 'nowhitespacesomevalue', [errors.InvalidHeaderDefect], - '') - , + '', + ), - test_get_unstructured_ew_without_trailing_whitespace = C( + ew_without_trailing_whitespace = C( '=?utf-8?q?somevalue?=nowhitespace', 'somevaluenowhitespace', 'somevaluenowhitespace', [errors.InvalidHeaderDefect], - '') - , + '', + ), - # bpo-37764 - test_get_unstructured_without_trailing_whitespace_hang_case = C( + # bpo-37764 + without_trailing_whitespace_hang_case = C( '=?utf-8?q?somevalue?=aa', 'somevalueaa', 'somevalueaa', [errors.InvalidHeaderDefect], - '') - , + '', + ), - test_get_unstructured_invalid_ew2 = C( + invalid_ew2 = C( '=?utf-8?q?=somevalue?=', '=?utf-8?q?=somevalue?=', '=?utf-8?q?=somevalue?=', [], - '') - , + '', + ), - test_get_unstructured_invalid_ew_cte = C( + invalid_ew_cte = C( '=?utf-8?X?=somevalue?=', '=?utf-8?X?=somevalue?=', '=?utf-8?X?=somevalue?=', [], - '') - , + '', + ), ) From 39494e0e65079f2ef99dd111c19de141141e5963 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 5 Jan 2026 13:28:44 -0500 Subject: [PATCH 036/152] Convert get_unstructured tests to keyword form. 
--- .../test_email/test__header_value_parser.py | 161 +++++------------- 1 file changed, 41 insertions(+), 120 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index acd4bdbb332eb7c..7f87227983bc229 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -966,199 +966,136 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): ), null = C( - '', - '', - '', - [], '', ), one_word = C( 'foo', - 'foo', - 'foo', - [], - '', ), normal_phrase = C( 'foo bar bird', - 'foo bar bird', - 'foo bar bird', - [], - '', ), normal_phrase_with_whitespace = C( 'foo \t bar bird', - 'foo \t bar bird', - 'foo bar bird', - [], - '', + value='foo bar bird', ), leading_whitespace = C( ' foo bar', - ' foo bar', - ' foo bar', - [], - '', + value=' foo bar', ), trailing_whitespace = C( 'foo bar ', - 'foo bar ', - 'foo bar ', - [], - '', + value='foo bar ', ), leading_and_trailing_whitespace = C( ' foo bar ', - ' foo bar ', - ' foo bar ', - [], - '', + value=' foo bar ', ), one_valid_ew_no_ws = C( '=?us-ascii?q?bar?=', - 'bar', - 'bar', - [], - '', + stringified='bar', + value='bar', ), one_ew_trailing_ws = C( '=?us-ascii?q?bar?= ', - 'bar ', - 'bar ', - [], - '', + stringified='bar ', + value='bar ', ), one_valid_ew_trailing_text = C( '=?us-ascii?q?bar?= bird', - 'bar bird', - 'bar bird', - [], - '', + stringified='bar bird', ), phrase_with_ew_in_middle_of_text = C( 'foo =?us-ascii?q?bar?= bird', - 'foo bar bird', - 'foo bar bird', - [], - '', + stringified='foo bar bird', ), phrase_with_two_ew = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?=', - 'foo barbird', - 'foo barbird', - [], - '', + stringified='foo barbird', ), phrase_with_two_ew_trailing_ws = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?= ', - 'foo barbird ', - 'foo barbird ', - [], - '', + stringified='foo barbird ', + value='foo barbird ', ), phrase_with_ew_with_leading_ws = C( ' 
=?us-ascii?q?bar?=', - ' bar', - ' bar', - [], - '', + stringified=' bar', + value=' bar', ), phrase_with_two_ew_extra_ws = C( 'foo =?us-ascii?q?bar?= \t =?us-ascii?q?bird?=', - 'foo barbird', - 'foo barbird', - [], - '', + stringified='foo barbird', ), two_ew_extra_ws_trailing_text = C( '=?us-ascii?q?test?= =?us-ascii?q?foo?= val', - 'testfoo val', - 'testfoo val', - [], - '', + stringified='testfoo val', + value='testfoo val', ), ew_with_internal_ws = C( '=?iso-8859-1?q?hello=20world?=', - 'hello world', - 'hello world', - [], - '', + stringified='hello world', ), ew_with_internal_leading_ws = C( ' =?us-ascii?q?=20test?= =?us-ascii?q?=20foo?= val', - ' test foo val', - ' test foo val', - [], - '', + stringified=' test foo val', + value=' test foo val', ), invalid_ew = C( '=?test val', - '=?test val', - '=?test val', - [], - '', ), undecodable_bytes = C( b'test \xACfoo val'.decode('ascii', 'surrogateescape'), - 'test \uDCACfoo val', - 'test \uDCACfoo val', - [errors.UndecodableBytesDefect], - '', + stringified='test \uDCACfoo val', + value='test \uDCACfoo val', + defects=[errors.UndecodableBytesDefect], ), undecodable_bytes_in_EW = C( (b'=?us-ascii?q?=20test?= =?us-ascii?q?=20\xACfoo?=' b' val').decode('ascii', 'surrogateescape'), - ' test \uDCACfoo val', - ' test \uDCACfoo val', - [errors.UndecodableBytesDefect]*2, - '', + stringified=' test \uDCACfoo val', + value=' test \uDCACfoo val', + defects=[errors.UndecodableBytesDefect]*2, ), missing_base64_padding = C( '=?utf-8?b?dmk?=', - 'vi', - 'vi', - [errors.InvalidBase64PaddingDefect], - '', + stringified='vi', + defects=[errors.InvalidBase64PaddingDefect], ), invalid_base64_character = C( '=?utf-8?b?dm\x01k===?=', - 'vi', - 'vi', - [errors.InvalidBase64CharactersDefect], - '', + stringified='vi', + defects=[errors.InvalidBase64CharactersDefect], ), invalid_base64_character_and_bad_padding = C( '=?utf-8?b?dm\x01k?=', - 'vi', - 'vi', - [ + stringified='vi', + defects=[ errors.InvalidBase64CharactersDefect, 
errors.InvalidBase64PaddingDefect, ], - '', ), # bpo-27397/gh-71584: there's no way to decode this. @@ -1172,54 +1109,38 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): no_whitespace_between_ews = C( '=?utf-8?q?foo?==?utf-8?q?bar?=', - 'foobar', - 'foobar', - [ + stringified='foobar', + defects=[ errors.InvalidHeaderDefect, errors.InvalidHeaderDefect, ], - '', ), ew_without_leading_whitespace = C( 'nowhitespace=?utf-8?q?somevalue?=', - 'nowhitespacesomevalue', - 'nowhitespacesomevalue', - [errors.InvalidHeaderDefect], - '', + stringified='nowhitespacesomevalue', + defects=[errors.InvalidHeaderDefect], ), ew_without_trailing_whitespace = C( '=?utf-8?q?somevalue?=nowhitespace', - 'somevaluenowhitespace', - 'somevaluenowhitespace', - [errors.InvalidHeaderDefect], - '', + stringified='somevaluenowhitespace', + defects=[errors.InvalidHeaderDefect], ), # bpo-37764 without_trailing_whitespace_hang_case = C( '=?utf-8?q?somevalue?=aa', - 'somevalueaa', - 'somevalueaa', - [errors.InvalidHeaderDefect], - '', + stringified='somevalueaa', + defects=[errors.InvalidHeaderDefect], ), invalid_ew2 = C( '=?utf-8?q?=somevalue?=', - '=?utf-8?q?=somevalue?=', - '=?utf-8?q?=somevalue?=', - [], - '', ), invalid_ew_cte = C( '=?utf-8?X?=somevalue?=', - '=?utf-8?X?=somevalue?=', - '=?utf-8?X?=somevalue?=', - [], - '', ), ) From 69c93ea3c20c60054e73e3441d1ccd8a3a851a78 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 17 May 2026 10:09:42 -0400 Subject: [PATCH 037/152] Remove now redundant get_unstructured tests. These tests were copied in to the get_encoded_word test set earlier, and now get_unstructured is running them by reusing those parameters, making these parameter sets redundant. invalid_ew_cte is the only one whose name is different; its name is unknown_cte in the get_encoded_word parameter set. 
--- .../test_email/test__header_value_parser.py | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 7f87227983bc229..90efa0ae051d662 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1077,36 +1077,6 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): defects=[errors.UndecodableBytesDefect]*2, ), - missing_base64_padding = C( - '=?utf-8?b?dmk?=', - stringified='vi', - defects=[errors.InvalidBase64PaddingDefect], - ), - - invalid_base64_character = C( - '=?utf-8?b?dm\x01k===?=', - stringified='vi', - defects=[errors.InvalidBase64CharactersDefect], - ), - - invalid_base64_character_and_bad_padding = C( - '=?utf-8?b?dm\x01k?=', - stringified='vi', - defects=[ - errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect, - ], - ), - - # bpo-27397/gh-71584: there's no way to decode this. - invalid_base64_length = C( - '=?utf-8?b?abcde?=', - '=?utf-8?b?abcde?=', - '=?utf-8?b?abcde?=', - [], - '' - ), - no_whitespace_between_ews = C( '=?utf-8?q?foo?==?utf-8?q?bar?=', stringified='foobar', @@ -1139,10 +1109,6 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): '=?utf-8?q?=somevalue?=', ), - invalid_ew_cte = C( - '=?utf-8?X?=somevalue?=', - ), - ) From 19187ebe646472ff327720e2e20ff83e458b48fa Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 5 Jan 2026 13:43:02 -0500 Subject: [PATCH 038/152] Add more get_unstructured tests, specific defects. 
--- .../test_email/test__header_value_parser.py | 74 +++++++++++++++++-- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 90efa0ae051d662..8515618e188e906 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1066,7 +1066,7 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): b'test \xACfoo val'.decode('ascii', 'surrogateescape'), stringified='test \uDCACfoo val', value='test \uDCACfoo val', - defects=[errors.UndecodableBytesDefect], + defects=[undecodable_bytes_defect], ), undecodable_bytes_in_EW = C( @@ -1074,41 +1074,101 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): b' val').decode('ascii', 'surrogateescape'), stringified=' test \uDCACfoo val', value=' test \uDCACfoo val', - defects=[errors.UndecodableBytesDefect]*2, + defects=[ + undecodable_bytes_defect, + (undecodable_bytes_in_ew_defect, 'us-ascii'), + ], ), + no_whitespace_between_ews = C( '=?utf-8?q?foo?==?utf-8?q?bar?=', stringified='foobar', defects=[ - errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, ], ), ew_without_leading_whitespace = C( 'nowhitespace=?utf-8?q?somevalue?=', stringified='nowhitespacesomevalue', - defects=[errors.InvalidHeaderDefect], + defects=[missing_whitespace_before_ew_defect], ), ew_without_trailing_whitespace = C( '=?utf-8?q?somevalue?=nowhitespace', stringified='somevaluenowhitespace', - defects=[errors.InvalidHeaderDefect], + defects=[missing_whitespace_after_ew_defect], ), # bpo-37764 without_trailing_whitespace_hang_case = C( '=?utf-8?q?somevalue?=aa', stringified='somevalueaa', - defects=[errors.InvalidHeaderDefect], + defects=[missing_whitespace_after_ew_defect], ), invalid_ew2 = C( '=?utf-8?q?=somevalue?=', ), + **for_each_character(RFC_PRINTABLES)( + printable_around_and_between_ews = 
C( + '{char} =?utf-8?q?foo?= {char} =?utf-8?q?bar?= {char}', + stringified='{char} foo {char} bar {char}', + ), + ), + + # XXX XXX the '?=' skip is a sort-of bug the refactoring will fix. + **for_each_character(RFC_PRINTABLES, skip='_?=')( + printable_inside_ews = C( + '=?utf-8?q?rock{char}?= =?utf-8?q?{char}hard_place?=', + stringified='rock{char}{char}hard place', + ), + ), + + **for_each_character( + RFC_NONPRINTABLES, + # XXX XXX skip things split considers whitespace. This is buggy. + # US RS GS FS + skip=RFC_WSP + '\r\n\v\f\x1f\x1e\x1d\x1c', + )( + non_wsp_non_printable = C( + 'some {char} text', + stringified='some {char} text', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_wsp_non_printable_inside_ew = C( + '=?utf-8?q?some{char}?= text', + stringified='some{char} text', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + unicode = C( + '📦', + ), + + non_ascii_bytes = C( + '📦'.encode().decode('ascii', 'surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + invalid_ew_charset = C( + 'a =?invalid?q?=C3=89ric?= b', + stringified='a \udcc3\udc89ric b', + defects=[charset_defect('invalid'), undecodable_bytes_defect], + ), + + ew_start_chrome_before_real_ew = C( + 'z=?xx =?UTF-8?Q?foo?=', + stringified='z=?xx foo', + ), + ) From a496348a56fe7097e5f7bd81c3895f4a27b7e671 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 7 Jan 2026 14:22:00 -0500 Subject: [PATCH 039/152] Begin conversion of get_qp_ctext tests. 
--- Lib/test/test_email/test__header_value_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 8515618e188e906..1a046f3b1d746c2 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1174,6 +1174,15 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): # get_qp_ctext + @params + def test_get_qp_ctext(self, s, *args, **kw): + ptext = self._test_parse(parser.get_qp_ctext, C(s), *args, **kw) + self.assertIsInstance(ptext, parser.Terminal) + self.assertEqual(ptext.token_type, 'ptext') + + params_test_get_qp_ctext = old_api_only( + ) + def test_get_qp_ctext_only(self): ptext = self._test_get_x(parser.get_qp_ctext, 'foobar', 'foobar', ' ', [], '') From 0499a3c3ee79c21f6f8afef57ef71315359d8f75 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 7 Jan 2026 14:47:08 -0500 Subject: [PATCH 040/152] Convert get_qp_ctext tests. --- .../test_email/test__header_value_parser.py | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1a046f3b1d746c2..295503d0afdb641 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1181,69 +1181,69 @@ def test_get_qp_ctext(self, s, *args, **kw): self.assertEqual(ptext.token_type, 'ptext') params_test_get_qp_ctext = old_api_only( - ) - def test_get_qp_ctext_only(self): - ptext = self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_only = C( 'foobar', 'foobar', ' ', [], '') - self.assertEqual(ptext.token_type, 'ptext') + , - def test_get_qp_ctext_all_printables(self): - with_qp = self.rfc_printable_ascii.replace('\\', '\\\\') - with_qp = with_qp. 
replace('(', r'\(') - with_qp = with_qp.replace(')', r'\)') - ptext = self._test_get_x(parser.get_qp_ctext, - with_qp, self.rfc_printable_ascii, ' ', [], '') + test_get_qp_ctext_all_printables = C( + RFC_PRINTABLES. + replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)'), + RFC_PRINTABLES, + ' ', [], '', + ), - def test_get_qp_ctext_two_words_gets_first(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_two_words_gets_first = C( 'foo de', 'foo', ' ', [], ' de') + , - def test_get_qp_ctext_following_wsp_preserved(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_following_wsp_preserved = C( 'foo \t\tde', 'foo', ' ', [], ' \t\tde') + , - def test_get_qp_ctext_up_to_close_paren_only(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_up_to_close_paren_only = C( 'foo)', 'foo', ' ', [], ')') + , - def test_get_qp_ctext_wsp_before_close_paren_preserved(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_wsp_before_close_paren_preserved = C( 'foo )', 'foo', ' ', [], ' )') + , - def test_get_qp_ctext_close_paren_mid_word(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_close_paren_mid_word = C( 'foo)bar', 'foo', ' ', [], ')bar') + , - def test_get_qp_ctext_up_to_open_paren_only(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_up_to_open_paren_only = C( 'foo(', 'foo', ' ', [], '(') + , - def test_get_qp_ctext_wsp_before_open_paren_preserved(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_wsp_before_open_paren_preserved = C( 'foo (', 'foo', ' ', [], ' (') + , - def test_get_qp_ctext_open_paren_mid_word(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_open_paren_mid_word = C( 'foo(bar', 'foo', ' ', [], '(bar') + , - def test_get_qp_ctext_non_printables(self): - ptext = self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_non_printables = C( 'foo\x00bar)', 'foo\x00bar', ' ', [errors.NonPrintableDefect], ')') - 
self.assertEqual(ptext.defects[0].non_printables[0], '\x00') + , + #self.assertEqual(ptext.defects[0].non_printables[0], '\x00') - def test_get_qp_ctext_close_paren_only(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_close_paren_only = C( ')', '', ' ', [], ')') + , - def test_get_qp_ctext_open_paren_only(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_open_paren_only = C( '(', '', ' ', [], '(') + , - def test_get_qp_ctext_no_end_char(self): - self._test_get_x(parser.get_qp_ctext, + test_get_qp_ctext_no_end_char = C( '', '', ' ', [], '') + , + + ) # get_qcontent From 2a3538848a2fea5e898f87cabf33decf4c21ddfd Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 7 Jan 2026 15:07:40 -0500 Subject: [PATCH 041/152] Fix whitespace and test names in get_qp_ctext tests. --- .../test_email/test__header_value_parser.py | 165 ++++++++++++------ 1 file changed, 109 insertions(+), 56 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 295503d0afdb641..f9f65781fafe1de 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1182,66 +1182,119 @@ def test_get_qp_ctext(self, s, *args, **kw): params_test_get_qp_ctext = old_api_only( - test_get_qp_ctext_only = C( - 'foobar', 'foobar', ' ', [], '') - , + only = C( + 'foobar', + 'foobar', + ' ', + [], + '', + ), - test_get_qp_ctext_all_printables = C( + all_printables = C( RFC_PRINTABLES. 
replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)'), RFC_PRINTABLES, - ' ', [], '', - ), - - test_get_qp_ctext_two_words_gets_first = C( - 'foo de', 'foo', ' ', [], ' de') - , - - test_get_qp_ctext_following_wsp_preserved = C( - 'foo \t\tde', 'foo', ' ', [], ' \t\tde') - , - - test_get_qp_ctext_up_to_close_paren_only = C( - 'foo)', 'foo', ' ', [], ')') - , - - test_get_qp_ctext_wsp_before_close_paren_preserved = C( - 'foo )', 'foo', ' ', [], ' )') - , - - test_get_qp_ctext_close_paren_mid_word = C( - 'foo)bar', 'foo', ' ', [], ')bar') - , - - test_get_qp_ctext_up_to_open_paren_only = C( - 'foo(', 'foo', ' ', [], '(') - , - - test_get_qp_ctext_wsp_before_open_paren_preserved = C( - 'foo (', 'foo', ' ', [], ' (') - , - - test_get_qp_ctext_open_paren_mid_word = C( - 'foo(bar', 'foo', ' ', [], '(bar') - , - - test_get_qp_ctext_non_printables = C( - 'foo\x00bar)', 'foo\x00bar', ' ', - [errors.NonPrintableDefect], ')') - , - #self.assertEqual(ptext.defects[0].non_printables[0], '\x00') - - test_get_qp_ctext_close_paren_only = C( - ')', '', ' ', [], ')') - , - - test_get_qp_ctext_open_paren_only = C( - '(', '', ' ', [], '(') - , - - test_get_qp_ctext_no_end_char = C( - '', '', ' ', [], '') - , + ' ', + [], + '', + ), + + two_words_gets_first = C( + 'foo de', + 'foo', + ' ', + [], + ' de', + ), + + following_wsp_preserved = C( + 'foo \t\tde', + 'foo', + ' ', + [], + ' \t\tde', + ), + + up_to_close_paren_only = C( + 'foo)', + 'foo', + ' ', + [], + ')', + ), + + wsp_before_close_paren_preserved = C( + 'foo )', + 'foo', + ' ', + [], + ' )', + ), + + close_paren_mid_word = C( + 'foo)bar', + 'foo', + ' ', + [], + ')bar', + ), + + up_to_open_paren_only = C( + 'foo(', + 'foo', + ' ', + [], + '(', + ), + + wsp_before_open_paren_preserved = C( + 'foo (', + 'foo', + ' ', + [], + ' (', + ), + + open_paren_mid_word = C( + 'foo(bar', + 'foo', + ' ', + [], + '(bar', + ), + + non_printables = C( + 'foo\x00bar)', + 'foo\x00bar', + ' ', + [errors.NonPrintableDefect], + ')', + ), + 
#self.assertEqual(ptext.defects[0].non_printables[0], '\x00') + + close_paren_only = C( + ')', + '', + ' ', + [], + ')', + ), + + open_paren_only = C( + '(', + '', + ' ', + [], + '(', + ), + + no_end_char = C( + '', + '', + ' ', + [], + '', + ), ) From 407040923f5004736577737acbcbd9209a8113ed Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 7 Jan 2026 15:14:21 -0500 Subject: [PATCH 042/152] Convert get_qp_ctext tests to keyword form. --- .../test_email/test__header_value_parser.py | 80 +++++-------------- 1 file changed, 21 insertions(+), 59 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f9f65781fafe1de..9c8b8b869ad6a1c 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1175,8 +1175,14 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): # get_qp_ctext @params - def test_get_qp_ctext(self, s, *args, **kw): - ptext = self._test_parse(parser.get_qp_ctext, C(s), *args, **kw) + def test_get_qp_ctext(self, s, *args, value=' ', **kw): + ptext = self._test_parse( + parser.get_qp_ctext, + C(s), + *args, + value=value, + **kw, + ) self.assertIsInstance(ptext, parser.Terminal) self.assertEqual(ptext.token_type, 'ptext') @@ -1184,115 +1190,71 @@ def test_get_qp_ctext(self, s, *args, **kw): only = C( 'foobar', - 'foobar', - ' ', - [], - '', ), all_printables = C( RFC_PRINTABLES. 
replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)'), - RFC_PRINTABLES, - ' ', - [], - '', + stringified=RFC_PRINTABLES, ), two_words_gets_first = C( 'foo de', - 'foo', - ' ', - [], - ' de', + remainder=' de', ), following_wsp_preserved = C( 'foo \t\tde', - 'foo', - ' ', - [], - ' \t\tde', + remainder=' \t\tde', ), up_to_close_paren_only = C( 'foo)', - 'foo', - ' ', - [], - ')', + remainder=')', ), wsp_before_close_paren_preserved = C( 'foo )', - 'foo', - ' ', - [], - ' )', + remainder=' )', ), close_paren_mid_word = C( 'foo)bar', - 'foo', - ' ', - [], - ')bar', + remainder=')bar', ), up_to_open_paren_only = C( 'foo(', - 'foo', - ' ', - [], - '(', + remainder='(', ), wsp_before_open_paren_preserved = C( 'foo (', - 'foo', - ' ', - [], - ' (', + remainder=' (', ), open_paren_mid_word = C( 'foo(bar', - 'foo', - ' ', - [], - '(bar', + remainder='(bar', ), non_printables = C( 'foo\x00bar)', - 'foo\x00bar', - ' ', - [errors.NonPrintableDefect], - ')', + defects=[nonprintable_defect('\x00')], + remainder=')', ), - #self.assertEqual(ptext.defects[0].non_printables[0], '\x00') close_paren_only = C( ')', - '', - ' ', - [], - ')', + remainder=')', ), open_paren_only = C( '(', - '', - ' ', - [], - '(', + remainder='(', ), no_end_char = C( - '', - '', - ' ', - [], '', ), From 59d908866e9568ce16c5c660e100fbbcd90f345d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 7 Jan 2026 16:32:20 -0500 Subject: [PATCH 043/152] get_qp_ctext test improvements. 
--- .../test_email/test__header_value_parser.py | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 9c8b8b869ad6a1c..e23b602a5d3b269 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1188,7 +1188,7 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): params_test_get_qp_ctext = old_api_only( - only = C( + value_ends_at_input_end = C( 'foobar', ), @@ -1238,10 +1238,12 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): remainder='(bar', ), - non_printables = C( - 'foo\x00bar)', - defects=[nonprintable_defect('\x00')], - remainder=')', + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printables = C( + 'foo{char}bar)', + defects=[(nonprintable_defect, '{char}')], + remainder=')', + ), ), close_paren_only = C( @@ -1254,10 +1256,38 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): remainder='(', ), - no_end_char = C( + no_content = C( '', ), + parens_are_content_if_quoted = C( + r'\(bar\)\)bird\(', + stringified='(bar))bird(', + ), + + escapes_are_removed_in_str = C( + r'fairly\&\boring\W\@\!ks', + stringified='fairly&boringW@!ks', + ), + + any_printable_may_be_escaped = C( + ''.join(rf'\{c}' for c in RFC_PRINTABLES), + RFC_PRINTABLES, + ), + + unicode_content = C( + '⛔❌❗', + ), + + mixed_unicode_and_ascii = C( + 'ministry✌of⛔silly❌walks❗', + ), + + unicode_can_be_quoted = C( + r'sillier\❌walks\❗', + stringified='sillier❌walks❗', + ), + ) From b560cc6b120d443a41c84fa91c273633fb21e949 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 11 Jan 2026 12:14:13 -0500 Subject: [PATCH 044/152] Begin converting get_qcontent tests to new framework. 
--- Lib/test/test_email/test__header_value_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e23b602a5d3b269..5d413ad1e3cbf07 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1293,6 +1293,15 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): # get_qcontent + @params + def test_get_qcontent(self, s, *args, **kw): + ptext = self._test_parse(parser.get_qcontent, C(s), *args, **kw) + self.assertIsInstance(ptext, parser.Terminal) + self.assertEqual(ptext.token_type, 'ptext') + + params_test_get_qcontent = old_api_only( + ) + def test_get_qcontent_only(self): ptext = self._test_get_x(parser.get_qcontent, 'foobar', 'foobar', 'foobar', [], '') From b31a66e5dcd717a028a8264db1ba05ca1c03d3af Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 11 Jan 2026 17:05:31 -0500 Subject: [PATCH 045/152] Rough conversion of get_qcontent tests. --- .../test_email/test__header_value_parser.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 5d413ad1e3cbf07..3913e1187279ee2 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1300,53 +1300,53 @@ def test_get_qcontent(self, s, *args, **kw): self.assertEqual(ptext.token_type, 'ptext') params_test_get_qcontent = old_api_only( - ) - def test_get_qcontent_only(self): - ptext = self._test_get_x(parser.get_qcontent, + test_get_qcontent_only = C( 'foobar', 'foobar', 'foobar', [], '') - self.assertEqual(ptext.token_type, 'ptext') + , - def test_get_qcontent_all_printables(self): - with_qp = self.rfc_printable_ascii.replace('\\', '\\\\') - with_qp = with_qp. 
replace('"', r'\"') - ptext = self._test_get_x(parser.get_qcontent, with_qp, - self.rfc_printable_ascii, - self.rfc_printable_ascii, [], '') + test_get_qcontent_all_printables = C( + RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"'), + RFC_PRINTABLES, + RFC_PRINTABLES, [], '', + ), - def test_get_qcontent_two_words_gets_first(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_two_words_gets_first = C( 'foo de', 'foo', 'foo', [], ' de') + , - def test_get_qcontent_following_wsp_preserved(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_following_wsp_preserved = C( 'foo \t\tde', 'foo', 'foo', [], ' \t\tde') + , - def test_get_qcontent_up_to_dquote_only(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_up_to_dquote_only = C( 'foo"', 'foo', 'foo', [], '"') + , - def test_get_qcontent_wsp_before_close_paren_preserved(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_wsp_before_close_paren_preserved = C( 'foo "', 'foo', 'foo', [], ' "') + , - def test_get_qcontent_close_paren_mid_word(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_close_paren_mid_word = C( 'foo"bar', 'foo', 'foo', [], '"bar') + , - def test_get_qcontent_non_printables(self): - ptext = self._test_get_x(parser.get_qcontent, + test_get_qcontent_non_printables = C( 'foo\x00fg"', 'foo\x00fg', 'foo\x00fg', [errors.NonPrintableDefect], '"') - self.assertEqual(ptext.defects[0].non_printables[0], '\x00') + , + #self.assertEqual(ptext.defects[0].non_printables[0], '\x00' - def test_get_qcontent_empty(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_empty = C( '"', '', '', [], '"') + , - def test_get_qcontent_no_end_char(self): - self._test_get_x(parser.get_qcontent, + test_get_qcontent_no_end_char = C( '', '', '', [], '') + , + + ) + # get_atext From 92941638c9a234dcaab9297a528795966572d6a6 Mon Sep 17 00:00:00 2001 From: "R. 
David Murray" Date: Sun, 11 Jan 2026 17:36:01 -0500 Subject: [PATCH 046/152] Fix whitespace and test names in get_qcontent tests. --- .../test_email/test__header_value_parser.py | 101 ++++++++++++------ 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 3913e1187279ee2..b4d504aca5a5f94 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1301,49 +1301,86 @@ def test_get_qcontent(self, s, *args, **kw): params_test_get_qcontent = old_api_only( - test_get_qcontent_only = C( - 'foobar', 'foobar', 'foobar', [], '') - , + only = C( + 'foobar', + 'foobar', + 'foobar', + [], + '', + ), - test_get_qcontent_all_printables = C( + all_printables = C( RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"'), - RFC_PRINTABLES, - RFC_PRINTABLES, [], '', + RFC_PRINTABLES, + RFC_PRINTABLES, + [], + '', ), - test_get_qcontent_two_words_gets_first = C( - 'foo de', 'foo', 'foo', [], ' de') - , + two_words_gets_first = C( + 'foo de', + 'foo', + 'foo', + [], + ' de', + ), - test_get_qcontent_following_wsp_preserved = C( - 'foo \t\tde', 'foo', 'foo', [], ' \t\tde') - , + following_wsp_preserved = C( + 'foo \t\tde', + 'foo', + 'foo', + [], + ' \t\tde', + ), - test_get_qcontent_up_to_dquote_only = C( - 'foo"', 'foo', 'foo', [], '"') - , + up_to_dquote_only = C( + 'foo"', + 'foo', + 'foo', + [], + '"', + ), - test_get_qcontent_wsp_before_close_paren_preserved = C( - 'foo "', 'foo', 'foo', [], ' "') - , + wsp_before_close_paren_preserved = C( + 'foo "', + 'foo', + 'foo', + [], + ' "', + ), - test_get_qcontent_close_paren_mid_word = C( - 'foo"bar', 'foo', 'foo', [], '"bar') - , + close_paren_mid_word = C( + 'foo"bar', + 'foo', + 'foo', + [], + '"bar', + ), - test_get_qcontent_non_printables = C( - 'foo\x00fg"', 'foo\x00fg', 'foo\x00fg', - [errors.NonPrintableDefect], '"') - , - 
#self.assertEqual(ptext.defects[0].non_printables[0], '\x00' + non_printables = C( + 'foo\x00fg"', + 'foo\x00fg', + 'foo\x00fg', + [errors.NonPrintableDefect], + '"', + ), + #self.assertEqual(ptext.defects[0].non_printables[0], '\x00' - test_get_qcontent_empty = C( - '"', '', '', [], '"') - , + empty = C( + '"', + '', + '', + [], + '"', + ), - test_get_qcontent_no_end_char = C( - '', '', '', [], '') - , + no_end_char = C( + '', + '', + '', + [], + '', + ), ) From 4adc7df51e762f7132f012454e10ddf2ada8b51b Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 11 Jan 2026 17:41:25 -0500 Subject: [PATCH 047/152] Put get_qcontent tests in keyword form. --- .../test_email/test__header_value_parser.py | 49 ++++--------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b4d504aca5a5f94..1e4a7229b9db393 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1303,82 +1303,51 @@ def test_get_qcontent(self, s, *args, **kw): only = C( 'foobar', - 'foobar', - 'foobar', - [], - '', ), all_printables = C( RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"'), - RFC_PRINTABLES, - RFC_PRINTABLES, - [], - '', + stringified=RFC_PRINTABLES, ), two_words_gets_first = C( 'foo de', - 'foo', - 'foo', - [], - ' de', + remainder=' de', ), following_wsp_preserved = C( 'foo \t\tde', - 'foo', - 'foo', - [], - ' \t\tde', + remainder=' \t\tde', ), up_to_dquote_only = C( 'foo"', - 'foo', - 'foo', - [], - '"', + remainder='"', ), wsp_before_close_paren_preserved = C( 'foo "', - 'foo', - 'foo', - [], - ' "', + remainder=' "', ), close_paren_mid_word = C( 'foo"bar', - 'foo', - 'foo', - [], - '"bar', + remainder='"bar', ), non_printables = C( 'foo\x00fg"', - 'foo\x00fg', - 'foo\x00fg', - [errors.NonPrintableDefect], - '"', + defects=[errors.NonPrintableDefect], + remainder='"', ), 
#self.assertEqual(ptext.defects[0].non_printables[0], '\x00' empty = C( '"', - '', - '', - [], - '"', + remainder='"', ), no_end_char = C( - '', - '', - '', - [], '', ), From 2ba9eab3799df4ab4f4e69816c193a91e59200db Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 11 Jan 2026 18:00:36 -0500 Subject: [PATCH 048/152] Improve get_qcontent tests. --- .../test_email/test__header_value_parser.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1e4a7229b9db393..070e3b6669e5383 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1301,7 +1301,7 @@ def test_get_qcontent(self, s, *args, **kw): params_test_get_qcontent = old_api_only( - only = C( + no_qp_no_end_char = C( 'foobar', ), @@ -1310,10 +1310,12 @@ def test_get_qcontent(self, s, *args, **kw): stringified=RFC_PRINTABLES, ), - two_words_gets_first = C( - 'foo de', - remainder=' de', - ), + **for_each_character(RFC_WSP)( + two_words_gets_first = C( + 'foo{char}de', + remainder='{char}de', + ), + ), following_wsp_preserved = C( 'foo \t\tde', @@ -1325,29 +1327,30 @@ def test_get_qcontent(self, s, *args, **kw): remainder='"', ), - wsp_before_close_paren_preserved = C( + wsp_before_dquote_preserved = C( 'foo "', remainder=' "', ), - close_paren_mid_word = C( + dquote_mid_word = C( 'foo"bar', remainder='"bar', ), - non_printables = C( - 'foo\x00fg"', - defects=[errors.NonPrintableDefect], - remainder='"', + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable = C( + 'foo{char}bar"', + defects=[(nonprintable_defect, '{char}')], + remainder='"', + ), ), - #self.assertEqual(ptext.defects[0].non_printables[0], '\x00' - empty = C( + no_content_before_dquote = C( '"', remainder='"', ), - no_end_char = C( + empty_value = C( '', ), From 0c6a8ca22510caa83804bdb6b7eaafc7f605d773 Mon Sep 17 
00:00:00 2001 From: "R. David Murray" Date: Mon, 12 Jan 2026 19:23:38 -0500 Subject: [PATCH 049/152] Start conversion of get_atext tests. --- Lib/test/test_email/test__header_value_parser.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 070e3b6669e5383..087724d5da25028 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1359,6 +1359,17 @@ def test_get_qcontent(self, s, *args, **kw): # get_atext + @params + def test_get_atext(self, s, *args, **kw): + atext = self._test_parse(parser.get_atext, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atext, parser.Terminal) + self.assertEqual(atext.token_type, 'atext') + + params_test_get_atext = old_api_only( + ) + def test_get_atext_only(self): atext = self._test_get_x(parser.get_atext, 'foobar', 'foobar', 'foobar', [], '') From efa1b72f994a958c1013a2559a61bb42b72f284e Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 12 Jan 2026 19:39:48 -0500 Subject: [PATCH 050/152] Rough conversion of get_atext tests. Switch to a module global constant for the RFC atext character list. 
--- .../test_email/test__header_value_parser.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 087724d5da25028..b975e024b984b49 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -41,6 +41,13 @@ "!#$%&+-^_`{}~", )) +# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 +RFC_ATEXT = ''.join(( + string.ascii_letters, + string.digits, + "!#$%&'*+-/=?^_`{|}~", + )) + ALL_ASCII = bytes(range(0, 128)).decode('ascii') @@ -394,8 +401,6 @@ def for_each_api(nl, *args, **kw): class TestParser(TestParserMixin, TestEmailBase): rfc_printable_ascii = bytes(range(33, 127)).decode('ascii') - rfc_atext_chars = (string.ascii_letters + string.digits + - "!#$%&\'*+-/=?^_`{}|~") rfc_dtext_chars = rfc_printable_ascii.translate(str.maketrans('','',r'\[]')) # _wsp_splitter @@ -1368,35 +1373,37 @@ def test_get_atext(self, s, *args, **kw): self.assertEqual(atext.token_type, 'atext') params_test_get_atext = old_api_only( - ) - def test_get_atext_only(self): - atext = self._test_get_x(parser.get_atext, + test_get_atext_only = C( 'foobar', 'foobar', 'foobar', [], '') - self.assertEqual(atext.token_type, 'atext') + , - def test_get_atext_all_atext(self): - atext = self._test_get_x(parser.get_atext, self.rfc_atext_chars, - self.rfc_atext_chars, - self.rfc_atext_chars, [], '') + test_get_atext_all_atext = C( + RFC_ATEXT, + RFC_ATEXT, + RFC_ATEXT, [], '') + , - def test_get_atext_two_words_gets_first(self): - self._test_get_x(parser.get_atext, + test_get_atext_two_words_gets_first = C( 'foo bar', 'foo', 'foo', [], ' bar') + , - def test_get_atext_following_wsp_preserved(self): - self._test_get_x(parser.get_atext, + test_get_atext_following_wsp_preserved = C( 'foo \t\tbar', 'foo', 'foo', [], ' \t\tbar') + , - def test_get_atext_up_to_special(self): - 
self._test_get_x(parser.get_atext, + test_get_atext_up_to_special = C( 'foo@bar', 'foo', 'foo', [], '@bar') + , - def test_get_atext_non_printables(self): - atext = self._test_get_x(parser.get_atext, + test_get_atext_non_printables = C( 'foo\x00bar(', 'foo\x00bar', 'foo\x00bar', [errors.NonPrintableDefect], '(') - self.assertEqual(atext.defects[0].non_printables[0], '\x00') + , + #self.assertEqual(atext.defects[0].non_printables[0], '\x00') + + ) + # get_bare_quoted_string From 6f29cf5b37c5039c282e06806a3b66844f8bd771 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 12 Jan 2026 19:47:45 -0500 Subject: [PATCH 051/152] Fix whitespace and test names in get_atext tests. --- .../test_email/test__header_value_parser.py | 65 ++++++++++++------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b975e024b984b49..98b594aa33fc519 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1374,33 +1374,54 @@ def test_get_atext(self, s, *args, **kw): params_test_get_atext = old_api_only( - test_get_atext_only = C( - 'foobar', 'foobar', 'foobar', [], '') - , + only = C( + 'foobar', + 'foobar', + 'foobar', + [], + '', + ), - test_get_atext_all_atext = C( - RFC_ATEXT, - RFC_ATEXT, - RFC_ATEXT, [], '') - , + all_atext = C( + RFC_ATEXT, + RFC_ATEXT, + RFC_ATEXT, + [], + '', + ), - test_get_atext_two_words_gets_first = C( - 'foo bar', 'foo', 'foo', [], ' bar') - , + two_words_gets_first = C( + 'foo bar', + 'foo', + 'foo', + [], + ' bar', + ), - test_get_atext_following_wsp_preserved = C( - 'foo \t\tbar', 'foo', 'foo', [], ' \t\tbar') - , + following_wsp_preserved = C( + 'foo \t\tbar', + 'foo', + 'foo', + [], + ' \t\tbar', + ), - test_get_atext_up_to_special = C( - 'foo@bar', 'foo', 'foo', [], '@bar') - , + up_to_special = C( + 'foo@bar', + 'foo', + 'foo', + [], + '@bar', + ), - 
test_get_atext_non_printables = C( - 'foo\x00bar(', 'foo\x00bar', 'foo\x00bar', - [errors.NonPrintableDefect], '(') - , - #self.assertEqual(atext.defects[0].non_printables[0], '\x00') + non_printables = C( + 'foo\x00bar(', + 'foo\x00bar', + 'foo\x00bar', + [errors.NonPrintableDefect], + '(', + ), + #self.assertEqual(atext.defects[0].non_printables[0], '\x00') ) From bfa0f8a37094d084fce0d7c589c9587e761781f2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 12 Jan 2026 19:48:54 -0500 Subject: [PATCH 052/152] Convert get_atext tests to keyword form. --- .../test_email/test__header_value_parser.py | 29 ++++--------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 98b594aa33fc519..1bc5c1bc9802494 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1376,50 +1376,31 @@ def test_get_atext(self, s, *args, **kw): only = C( 'foobar', - 'foobar', - 'foobar', - [], - '', ), all_atext = C( RFC_ATEXT, - RFC_ATEXT, - RFC_ATEXT, - [], - '', ), two_words_gets_first = C( 'foo bar', - 'foo', - 'foo', - [], - ' bar', + remainder=' bar', ), following_wsp_preserved = C( 'foo \t\tbar', - 'foo', - 'foo', - [], - ' \t\tbar', + remainder=' \t\tbar', ), up_to_special = C( 'foo@bar', - 'foo', - 'foo', - [], - '@bar', + remainder='@bar', ), non_printables = C( 'foo\x00bar(', - 'foo\x00bar', - 'foo\x00bar', - [errors.NonPrintableDefect], - '(', + defects=[errors.NonPrintableDefect], + remainder='(', ), #self.assertEqual(atext.defects[0].non_printables[0], '\x00') From 095c94de8da052c34074e55b666df931419b8476 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 12 Jan 2026 20:08:56 -0500 Subject: [PATCH 053/152] Improve the get_atext tests.
--- .../test_email/test__header_value_parser.py | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1bc5c1bc9802494..eee51a270b6b9da 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -48,6 +48,9 @@ "!#$%&'*+-/=?^_`{|}~", )) +# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 +RFC_SPECIALS = r'()<>[]:;@\,."' + ALL_ASCII = bytes(range(0, 128)).decode('ascii') @@ -1392,17 +1395,38 @@ def test_get_atext(self, s, *args, **kw): remainder=' \t\tbar', ), - up_to_special = C( - 'foo@bar', - remainder='@bar', + **for_each_character(RFC_SPECIALS)( + up_to_special = C( + RFC_ATEXT. + replace('{', '{{').replace('}', '}}') + '{char}' + 'bar', + remainder='{char}bar', + ), ), - non_printables = C( - 'foo\x00bar(', - defects=[errors.NonPrintableDefect], - remainder='(', + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printables = C( + 'foo{char}bar(', + defects=[(nonprintable_defect, '{char}')], + remainder='(', + ), + ), + + **for_each_character(RFC_SPECIALS + RFC_WSP)( + no_atext_before_special_or_wsp = C( + '{char}foo', + exception=(errors.HeaderParseError, '{echar}foo'), + ), + ), + + undecodable_characters = C( + 'foo🎁bar'.encode().decode('us-ascii', errors='surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), ), - #self.assertEqual(atext.defects[0].non_printables[0], '\x00') ) From 771ee90e0cc0140d6e3db57cbb314bad401ff83d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 14 Jan 2026 16:51:16 -0500 Subject: [PATCH 054/152] Begin refactor of get_bare_quoted_string tests. 
--- .../test_email/test__header_value_parser.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index eee51a270b6b9da..4c7ac08c6582290 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1433,14 +1433,32 @@ def test_get_atext(self, s, *args, **kw): # get_bare_quoted_string + @params + def test_get_bare_quoted_string(self, s, *args, **kw): + bqs = self._test_parse( + parser.get_bare_quoted_string, + C(s), + *args, + **kw, + ) + if 'exception' in kw: + return + self.assertIsInstance(bqs, parser.BareQuotedString) + self.assertEqual(bqs.token_type, 'bare-quoted-string') + + params_test_get_bare_quoted_string = old_api_only( + ) + def test_get_bare_quoted_string_only(self): bqs = self._test_get_x(parser.get_bare_quoted_string, '"foo"', '"foo"', 'foo', [], '') self.assertEqual(bqs.token_type, 'bare-quoted-string') - def test_get_bare_quoted_string_must_start_with_dquote(self): + def test_get_bare_quoted_string_must_start_with_dquote_non_ws(self): with self.assertRaises(errors.HeaderParseError): parser.get_bare_quoted_string('foo"') + + def test_get_bare_quoted_string_must_start_with_dquote_ws(self): with self.assertRaises(errors.HeaderParseError): parser.get_bare_quoted_string(' "foo"') @@ -1481,6 +1499,8 @@ def test_get_bare_quoted_string_no_end_dquote(self): self._test_get_x(parser.get_bare_quoted_string, '"foo', '"foo"', 'foo', [errors.InvalidHeaderDefect], '') + + def test_get_bare_quoted_string_no_end_dquote_ws(self): self._test_get_x(parser.get_bare_quoted_string, '"foo ', '"foo "', 'foo ', [errors.InvalidHeaderDefect], '') From d34b36f2720807e7991a8ba6c0a5a202239a9e11 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 14 Jan 2026 17:06:03 -0500 Subject: [PATCH 055/152] Rough refactoring of bare_quoted_string tests. 
--- .../test_email/test__header_value_parser.py | 71 ++++++++++--------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 4c7ac08c6582290..1b804a443681fff 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1447,77 +1447,80 @@ def test_get_bare_quoted_string(self, s, *args, **kw): self.assertEqual(bqs.token_type, 'bare-quoted-string') params_test_get_bare_quoted_string = old_api_only( - ) - def test_get_bare_quoted_string_only(self): - bqs = self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_only = C( '"foo"', '"foo"', 'foo', [], '') - self.assertEqual(bqs.token_type, 'bare-quoted-string') + , - def test_get_bare_quoted_string_must_start_with_dquote_non_ws(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_bare_quoted_string('foo"') + test_get_bare_quoted_string_must_start_with_dquote_non_ws = C( + 'foo"', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_bare_quoted_string_must_start_with_dquote_ws(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_bare_quoted_string(' "foo"') + test_get_bare_quoted_string_must_start_with_dquote_ws = C( + ' "foo"', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_bare_quoted_string_only_quotes(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_only_quotes = C( '""', '""', '', [], '') + , - def test_get_bare_quoted_string_missing_endquotes(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_missing_endquotes = C( '"', '""', '', [errors.InvalidHeaderDefect], '') + , - def test_get_bare_quoted_string_following_wsp_preserved(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_following_wsp_preserved = C( '"foo"\t bar', '"foo"', 'foo', [], '\t bar') 
+ , - def test_get_bare_quoted_string_multiple_words(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_multiple_words = C( '"foo bar moo"', '"foo bar moo"', 'foo bar moo', [], '') + , - def test_get_bare_quoted_string_multiple_words_wsp_preserved(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_multiple_words_wsp_preserved = C( '" foo moo\t"', '" foo moo\t"', ' foo moo\t', [], '') + , - def test_get_bare_quoted_string_end_dquote_mid_word(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_end_dquote_mid_word = C( '"foo"bar', '"foo"', 'foo', [], 'bar') + , - def test_get_bare_quoted_string_quoted_dquote(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_quoted_dquote = C( r'"foo\"in"a', r'"foo\"in"', 'foo"in', [], 'a') + , - def test_get_bare_quoted_string_non_printables(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_non_printables = C( '"a\x01a"', '"a\x01a"', 'a\x01a', [errors.NonPrintableDefect], '') + , - def test_get_bare_quoted_string_no_end_dquote(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_no_end_dquote = C( '"foo', '"foo"', 'foo', [errors.InvalidHeaderDefect], '') + , - def test_get_bare_quoted_string_no_end_dquote_ws(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_no_end_dquote_ws = C( '"foo ', '"foo "', 'foo ', [errors.InvalidHeaderDefect], '') + , - def test_get_bare_quoted_string_empty_quotes(self): - self._test_get_x(parser.get_bare_quoted_string, + test_get_bare_quoted_string_empty_quotes = C( '""', '""', '', [], '') + , # Issue 16983: apply postel's law to some bad encoding. 
- def test_encoded_word_inside_quotes(self): - self._test_get_x(parser.get_bare_quoted_string, + test_encoded_word_inside_quotes = C( '"=?utf-8?Q?not_really_valid?="', '"not really valid"', 'not really valid', [errors.InvalidHeaderDefect, errors.InvalidHeaderDefect], '') + , + + ) + # get_comment From 6cda6521ddc29a126418941b9c3a8d8b108af750 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 14 Jan 2026 17:55:51 -0500 Subject: [PATCH 056/152] Fix whitespace and test names in bare_quoted_string tests. --- .../test_email/test__header_value_parser.py | 145 ++++++++++++------ 1 file changed, 96 insertions(+), 49 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1b804a443681fff..f9c2534cd625189 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1448,74 +1448,121 @@ def test_get_bare_quoted_string(self, s, *args, **kw): params_test_get_bare_quoted_string = old_api_only( - test_get_bare_quoted_string_only = C( - '"foo"', '"foo"', 'foo', [], '') - , + only = C( + '"foo"', + '"foo"', + 'foo', + [], + '', + ), - test_get_bare_quoted_string_must_start_with_dquote_non_ws = C( - 'foo"', - exception=(errors.HeaderParseError, '.*'), + must_start_with_dquote_non_ws = C( + 'foo"', + exception=(errors.HeaderParseError, '.*'), ), - test_get_bare_quoted_string_must_start_with_dquote_ws = C( - ' "foo"', - exception=(errors.HeaderParseError, '.*'), + must_start_with_dquote_ws = C( + ' "foo"', + exception=(errors.HeaderParseError, '.*'), ), - test_get_bare_quoted_string_only_quotes = C( - '""', '""', '', [], '') - , + only_quotes = C( + '""', + '""', + '', + [], + '', + ), - test_get_bare_quoted_string_missing_endquotes = C( - '"', '""', '', [errors.InvalidHeaderDefect], '') - , + missing_endquotes = C( + '"', + '""', + '', + [errors.InvalidHeaderDefect], + '', + ), - test_get_bare_quoted_string_following_wsp_preserved = C( - 
'"foo"\t bar', '"foo"', 'foo', [], '\t bar') - , + following_wsp_preserved = C( + '"foo"\t bar', + '"foo"', + 'foo', + [], + '\t bar', + ), - test_get_bare_quoted_string_multiple_words = C( - '"foo bar moo"', '"foo bar moo"', 'foo bar moo', [], '') - , + multiple_words = C( + '"foo bar moo"', + '"foo bar moo"', + 'foo bar moo', + [], + '', + ), - test_get_bare_quoted_string_multiple_words_wsp_preserved = C( - '" foo moo\t"', '" foo moo\t"', ' foo moo\t', [], '') - , + multiple_words_wsp_preserved = C( + '" foo moo\t"', + '" foo moo\t"', + ' foo moo\t', + [], + '', + ), - test_get_bare_quoted_string_end_dquote_mid_word = C( - '"foo"bar', '"foo"', 'foo', [], 'bar') - , + end_dquote_mid_word = C( + '"foo"bar', + '"foo"', + 'foo', + [], + 'bar', + ), - test_get_bare_quoted_string_quoted_dquote = C( - r'"foo\"in"a', r'"foo\"in"', 'foo"in', [], 'a') - , + quoted_dquote = C( + r'"foo\"in"a', + r'"foo\"in"', + 'foo"in', + [], + 'a', + ), - test_get_bare_quoted_string_non_printables = C( - '"a\x01a"', '"a\x01a"', 'a\x01a', - [errors.NonPrintableDefect], '') - , + non_printables = C( + '"a\x01a"', + '"a\x01a"', + 'a\x01a', + [errors.NonPrintableDefect], + '', + ), - test_get_bare_quoted_string_no_end_dquote = C( - '"foo', '"foo"', 'foo', - [errors.InvalidHeaderDefect], '') - , + no_end_dquote = C( + '"foo', + '"foo"', + 'foo', + [errors.InvalidHeaderDefect], + '', + ), - test_get_bare_quoted_string_no_end_dquote_ws = C( - '"foo ', '"foo "', 'foo ', - [errors.InvalidHeaderDefect], '') - , + no_end_dquote_ws = C( + '"foo ', + '"foo "', + 'foo ', + [errors.InvalidHeaderDefect], + '', + ), - test_get_bare_quoted_string_empty_quotes = C( - '""', '""', '', [], '') - , + empty_quotes = C( + '""', + '""', + '', + [], + '', + ), - # Issue 16983: apply postel's law to some bad encoding. - test_encoded_word_inside_quotes = C( + # Issue 16983: apply postel's law to some bad encoding. 
+ encoded_word_inside_quotes = C( '"=?utf-8?Q?not_really_valid?="', '"not really valid"', 'not really valid', - [errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect], + [ + errors.InvalidHeaderDefect, + errors.InvalidHeaderDefect, + ], '') , From 4c540b4cfc66961807d6abf0d5fd41c954ed63f0 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 14 Jan 2026 18:00:11 -0500 Subject: [PATCH 057/152] Convert bare_quoted_string tests to keyword form. --- .../test_email/test__header_value_parser.py | 79 ++++++------------- 1 file changed, 26 insertions(+), 53 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f9c2534cd625189..553c79f71c1fc64 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1450,10 +1450,7 @@ def test_get_bare_quoted_string(self, s, *args, **kw): only = C( '"foo"', - '"foo"', - 'foo', - [], - '', + value='foo', ), must_start_with_dquote_non_ws = C( @@ -1468,103 +1465,79 @@ def test_get_bare_quoted_string(self, s, *args, **kw): only_quotes = C( '""', - '""', - '', - [], - '', + value='', ), missing_endquotes = C( '"', - '""', - '', - [errors.InvalidHeaderDefect], - '', + stringified='""', + value='', + defects=[errors.InvalidHeaderDefect], ), following_wsp_preserved = C( '"foo"\t bar', - '"foo"', - 'foo', - [], - '\t bar', + value='foo', + remainder='\t bar', ), multiple_words = C( '"foo bar moo"', - '"foo bar moo"', - 'foo bar moo', - [], - '', + value='foo bar moo', ), multiple_words_wsp_preserved = C( '" foo moo\t"', - '" foo moo\t"', - ' foo moo\t', - [], - '', + value=' foo moo\t', ), end_dquote_mid_word = C( '"foo"bar', - '"foo"', - 'foo', - [], - 'bar', + value='foo', + remainder='bar', ), quoted_dquote = C( r'"foo\"in"a', - r'"foo\"in"', - 'foo"in', - [], - 'a', + value='foo"in', + remainder='a', ), non_printables = C( '"a\x01a"', - '"a\x01a"', - 'a\x01a', - [errors.NonPrintableDefect], - '', 
+ value='a\x01a', + defects=[errors.NonPrintableDefect], ), no_end_dquote = C( '"foo', - '"foo"', - 'foo', - [errors.InvalidHeaderDefect], - '', + stringified='"foo"', + value='foo', + defects=[errors.InvalidHeaderDefect], ), no_end_dquote_ws = C( '"foo ', - '"foo "', - 'foo ', - [errors.InvalidHeaderDefect], - '', + stringified='"foo "', + value='foo ', + defects=[errors.InvalidHeaderDefect], ), empty_quotes = C( '""', - '""', - '', - [], - '', + value='', ), # Issue 16983: apply postel's law to some bad encoding. encoded_word_inside_quotes = C( '"=?utf-8?Q?not_really_valid?="', - '"not really valid"', - 'not really valid', - [ + stringified='"not really valid"', + value='not really valid', + defects=[ errors.InvalidHeaderDefect, errors.InvalidHeaderDefect, ], - '') - , + ), ) From f896f04e1158c490a08b249f8b834666d2860355 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 15 Jan 2026 15:58:04 -0500 Subject: [PATCH 058/152] Clarify test names, remove redundant test. --- .../test_email/test__header_value_parser.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 553c79f71c1fc64..ba524f8f2f11b4d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1448,17 +1448,17 @@ def test_get_bare_quoted_string(self, s, *args, **kw): params_test_get_bare_quoted_string = old_api_only( - only = C( + non_ws = C( '"foo"', value='foo', ), - must_start_with_dquote_non_ws = C( + no_leading_dquote_before_non_ws = C( 'foo"', exception=(errors.HeaderParseError, '.*'), ), - must_start_with_dquote_ws = C( + no_leading_dquote_before_ws = C( ' "foo"', exception=(errors.HeaderParseError, '.*'), ), @@ -1468,7 +1468,7 @@ def test_get_bare_quoted_string(self, s, *args, **kw): value='', ), - missing_endquotes = C( + missing_endquote = C( '"', stringified='""', value='', @@ -1509,25 
+1509,20 @@ def test_get_bare_quoted_string(self, s, *args, **kw): defects=[errors.NonPrintableDefect], ), - no_end_dquote = C( + no_end_dquote_after_non_ws = C( '"foo', stringified='"foo"', value='foo', defects=[errors.InvalidHeaderDefect], ), - no_end_dquote_ws = C( + no_end_dquote_after_ws = C( '"foo ', stringified='"foo "', value='foo ', defects=[errors.InvalidHeaderDefect], ), - empty_quotes = C( - '""', - value='', - ), - # Issue 16983: apply postel's law to some bad encoding. encoded_word_inside_quotes = C( '"=?utf-8?Q?not_really_valid?="', From cfab711ca3104fa73c615dad61c39b307f05c79d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 15 Jan 2026 15:21:45 -0500 Subject: [PATCH 059/152] Improve bare_quoted_string tests. And a small bugfix: BUGFIX: gh-121284/GH-122754 introduced the ability to specify the terminal type in get_encoded_word, but only the get_unstructured calls were updated to use it. It remained the case that a quoted string ended up with a mixture of 'ptext' and 'vtext' tokens. The folder treats these the same, so that did not cause any problems. The parser now correctly makes all terminal tokens in quoted string content be of type 'ptext'. 
--- Lib/email/_header_value_parser.py | 2 +- .../test_email/test__header_value_parser.py | 70 ++++++++++++++++--- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 68fcf45d650e3fa..ab9231c12fbacf3 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1260,7 +1260,7 @@ def get_bare_quoted_string(value): elif value[:2] == '=?': valid_ew = False try: - token, value = get_encoded_word(value) + token, value = get_encoded_word(value, terminal_type='ptext') bare_quoted_string.defects.append(errors.InvalidHeaderDefect( "encoded word inside quoted string")) valid_ew = True diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ba524f8f2f11b4d..dacccce951c9d53 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1445,6 +1445,7 @@ def test_get_bare_quoted_string(self, s, *args, **kw): return self.assertIsInstance(bqs, parser.BareQuotedString) self.assertEqual(bqs.token_type, 'bare-quoted-string') + self.verify_terminal_types(bqs, 'ptext', 'fws') params_test_get_bare_quoted_string = old_api_only( @@ -1455,12 +1456,12 @@ def test_get_bare_quoted_string(self, s, *args, **kw): no_leading_dquote_before_non_ws = C( 'foo"', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, 'expected.*foo'), ), no_leading_dquote_before_ws = C( ' "foo"', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, 'expected.*"foo"'), ), only_quotes = C( @@ -1472,7 +1473,7 @@ def test_get_bare_quoted_string(self, s, *args, **kw): '"', stringified='""', value='', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_quoted_string_defect], ), following_wsp_preserved = C( @@ -1503,24 +1504,38 @@ def test_get_bare_quoted_string(self, s, *args, **kw): remainder='a', ), - non_printables = C( 
- '"a\x01a"', - value='a\x01a', - defects=[errors.NonPrintableDefect], + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printables = C( + '"a{char}a"', + value='a{char}a', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + all_printables_allowed = C( + f'"{RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"')}"', + value=RFC_PRINTABLES, + ), + + any_printable_may_be_escaped = C( + f'"{''.join(rf'\{c}' for c in RFC_PRINTABLES)}"', + stringified= + f'"{RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"')}"', + value=RFC_PRINTABLES, ), no_end_dquote_after_non_ws = C( '"foo', stringified='"foo"', value='foo', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_quoted_string_defect], ), no_end_dquote_after_ws = C( '"foo ', stringified='"foo "', value='foo ', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_quoted_string_defect], ), # Issue 16983: apply postel's law to some bad encoding. @@ -1529,11 +1544,44 @@ def test_get_bare_quoted_string(self, s, *args, **kw): stringified='"not really valid"', value='not really valid', defects=[ - errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect, + ew_inside_quoted_string_defect, + missing_whitespace_after_ew_defect, ], ), + # XXX XXX The decode failure here will be fixed in the refactor. 
+ mixed_encoded_words_and_regular_text = C( + '"This has=?utf-8?Q?multiple?= =?utf-8?q?errors?=in it', + stringified='"This has=?utf-8?Q?multiple?= errorsin it"', + value='This has=?utf-8?Q?multiple?= errorsin it', + defects=[ + ew_inside_quoted_string_defect, + missing_whitespace_after_ew_defect, + end_inside_quoted_string_defect, + ], + ), + + encoded_word_after_dquote_with_no_ws = C( + '"test"of=?UTF-8?q?bad?=data', + value='test', + remainder='of=?UTF-8?q?bad?=data', + ), + + invalid_charset = C( + '"=?foo?Q?not_really_valid?= at all"', + stringified='"not really valid at all"', + value='not really valid at all', + defects=[ + ew_inside_quoted_string_defect, + charset_defect('foo'), + ], + ), + + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + ) From 237fa39b2531c9af6db56912ac215f45a59ed007 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 16 Jan 2026 14:51:01 -0500 Subject: [PATCH 060/152] Begin refactoring get_comment tests. --- .../test_email/test__header_value_parser.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index dacccce951c9d53..d3ab244bcf6b3a4 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1587,14 +1587,33 @@ def test_get_bare_quoted_string(self, s, *args, **kw): # get_comment + @params + def test_get_comment(self, s, *args, **kw): + cmt = self._test_parse( + parser.get_comment, + C(s), + *args, + **kw, + ) + if 'exception' in kw: + return + self.assertIsInstance(cmt, parser.Comment) + self.assertEqual(cmt.token_type, 'comment') + self.verify_terminals_type(cmt, 'vtext') + + params_test_get_comment = old_api_only( + ) + def test_get_comment_only(self): comment = self._test_get_x(parser.get_comment, '(comment)', '(comment)', ' ', [], '', ['comment']) self.assertEqual(comment.token_type, 'comment') 
- def test_get_comment_must_start_with_paren(self): + def test_get_comment_must_start_with_paren_no_ws(self): with self.assertRaises(errors.HeaderParseError): parser.get_comment('foo"') + + def test_get_comment_must_start_with_paren_ws(self): with self.assertRaises(errors.HeaderParseError): parser.get_comment(' (foo"') @@ -1624,10 +1643,12 @@ def test_get_comment_non_printable(self): '(foo\x7Fbar)', '(foo\x7Fbar)', ' ', [errors.NonPrintableDefect], '', ['foo\x7Fbar']) - def test_get_comment_no_end_paren(self): + def test_get_comment_no_end_paren_after_non_ws(self): self._test_get_x(parser.get_comment, '(foo bar', '(foo bar)', ' ', [errors.InvalidHeaderDefect], '', ['foo bar']) + + def test_get_comment_no_end_paren_after_ws(self): self._test_get_x(parser.get_comment, '(foo bar ', '(foo bar )', ' ', [errors.InvalidHeaderDefect], '', ['foo bar ']) From c2491b8d99f455d932442a04ceee1b54067fe065 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 17 Jan 2026 14:14:01 -0500 Subject: [PATCH 061/152] Rough conversion of get_comment tests. 
--- .../test_email/test__header_value_parser.py | 95 ++++++++++--------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index d3ab244bcf6b3a4..422dd7ec9d59c9a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1599,91 +1599,94 @@ def test_get_comment(self, s, *args, **kw): return self.assertIsInstance(cmt, parser.Comment) self.assertEqual(cmt.token_type, 'comment') - self.verify_terminals_type(cmt, 'vtext') + self.verify_terminal_types(cmt, 'ptext', 'fws') params_test_get_comment = old_api_only( - ) - def test_get_comment_only(self): - comment = self._test_get_x(parser.get_comment, + test_get_comment_only = C( '(comment)', '(comment)', ' ', [], '', ['comment']) - self.assertEqual(comment.token_type, 'comment') + , - def test_get_comment_must_start_with_paren_no_ws(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_comment('foo"') + test_get_comment_must_start_with_paren_no_ws = C( + 'foo"', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_comment_must_start_with_paren_ws(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_comment(' (foo"') + test_get_comment_must_start_with_paren_ws = C( + ' (foo"', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_comment_following_wsp_preserved(self): - self._test_get_x(parser.get_comment, + test_get_comment_following_wsp_preserved = C( '(comment) \t', '(comment)', ' ', [], ' \t', ['comment']) + , - def test_get_comment_multiple_words(self): - self._test_get_x(parser.get_comment, + test_get_comment_multiple_words = C( '(foo bar) \t', '(foo bar)', ' ', [], ' \t', ['foo bar']) + , - def test_get_comment_multiple_words_wsp_preserved(self): - self._test_get_x(parser.get_comment, + test_get_comment_multiple_words_wsp_preserved = C( '( foo bar\t ) \t', '( foo bar\t )', ' ', [], ' \t', 
[' foo bar\t ']) + , - def test_get_comment_end_paren_mid_word(self): - self._test_get_x(parser.get_comment, + test_get_comment_end_paren_mid_word = C( '(foo)bar', '(foo)', ' ', [], 'bar', ['foo']) + , - def test_get_comment_quoted_parens(self): - self._test_get_x(parser.get_comment, + test_get_comment_quoted_parens = C( r'(foo\) \(\)bar)', r'(foo\) \(\)bar)', ' ', [], '', ['foo) ()bar']) + , - def test_get_comment_non_printable(self): - self._test_get_x(parser.get_comment, + test_get_comment_non_printable = C( '(foo\x7Fbar)', '(foo\x7Fbar)', ' ', [errors.NonPrintableDefect], '', ['foo\x7Fbar']) + , - def test_get_comment_no_end_paren_after_non_ws(self): - self._test_get_x(parser.get_comment, + test_get_comment_no_end_paren_after_non_ws = C( '(foo bar', '(foo bar)', ' ', [errors.InvalidHeaderDefect], '', ['foo bar']) + , - def test_get_comment_no_end_paren_after_ws(self): - self._test_get_x(parser.get_comment, + test_get_comment_no_end_paren_after_ws = C( '(foo bar ', '(foo bar )', ' ', [errors.InvalidHeaderDefect], '', ['foo bar ']) + , - def test_get_comment_nested_comment(self): - comment = self._test_get_x(parser.get_comment, + test_get_comment_nested_comment = C( '(foo(bar))', '(foo(bar))', ' ', [], '', ['foo(bar)']) - self.assertEqual(comment[1].content, 'bar') + , + #self.assertEqual(comment[1].content, 'bar') - def test_get_comment_nested_comment_wsp(self): - comment = self._test_get_x(parser.get_comment, + test_get_comment_nested_comment_wsp = C( '(foo ( bar ) )', '(foo ( bar ) )', ' ', [], '', ['foo ( bar ) ']) - self.assertEqual(comment[2].content, ' bar ') + , + #self.assertEqual(comment[2].content, ' bar ') - def test_get_comment_empty_comment(self): - self._test_get_x(parser.get_comment, + test_get_comment_empty_comment = C( '()', '()', ' ', [], '', ['']) + , - def test_get_comment_multiple_nesting(self): - comment = self._test_get_x(parser.get_comment, + test_get_comment_multiple_nesting = C( '(((((foo)))))', '(((((foo)))))', ' ', [], '', 
['((((foo))))']) - for i in range(4, 0, -1): - self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) - comment = comment[0] - self.assertEqual(comment.content, 'foo') + , + #for i in range(4, 0, -1): + # self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) + # comment = comment[0] + #self.assertEqual(comment.content, 'foo') - def test_get_comment_missing_end_of_nesting(self): - self._test_get_x(parser.get_comment, + test_get_comment_missing_end_of_nesting = C( '(((((foo)))', '(((((foo)))))', ' ', [errors.InvalidHeaderDefect]*2, '', ['((((foo))))']) + , - def test_get_comment_qs_in_nested_comment(self): - comment = self._test_get_x(parser.get_comment, + test_get_comment_qs_in_nested_comment = C( r'(foo (b\)))', r'(foo (b\)))', ' ', [], '', [r'foo (b\))']) - self.assertEqual(comment[2].content, 'b)') + , + #self.assertEqual(comment[2].content, 'b)') + + ) + # get_cfws From f3433f239d6a5e19088ec8dbec3998d6eba5eb27 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 18 Jan 2026 13:54:57 -0500 Subject: [PATCH 062/152] Fix whitespace and test names in get_comment tests. 
--- .../test_email/test__header_value_parser.py | 232 ++++++++++++------ 1 file changed, 151 insertions(+), 81 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 422dd7ec9d59c9a..c218db82601f549 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1603,87 +1603,157 @@ def test_get_comment(self, s, *args, **kw): params_test_get_comment = old_api_only( - test_get_comment_only = C( - '(comment)', '(comment)', ' ', [], '', ['comment']) - , - - test_get_comment_must_start_with_paren_no_ws = C( - 'foo"', - exception=(errors.HeaderParseError, '.*'), - ), - - test_get_comment_must_start_with_paren_ws = C( - ' (foo"', - exception=(errors.HeaderParseError, '.*'), - ), - - test_get_comment_following_wsp_preserved = C( - '(comment) \t', '(comment)', ' ', [], ' \t', ['comment']) - , - - test_get_comment_multiple_words = C( - '(foo bar) \t', '(foo bar)', ' ', [], ' \t', ['foo bar']) - , - - test_get_comment_multiple_words_wsp_preserved = C( - '( foo bar\t ) \t', '( foo bar\t )', ' ', [], ' \t', - [' foo bar\t ']) - , - - test_get_comment_end_paren_mid_word = C( - '(foo)bar', '(foo)', ' ', [], 'bar', ['foo']) - , - - test_get_comment_quoted_parens = C( - r'(foo\) \(\)bar)', r'(foo\) \(\)bar)', ' ', [], '', ['foo) ()bar']) - , - - test_get_comment_non_printable = C( - '(foo\x7Fbar)', '(foo\x7Fbar)', ' ', - [errors.NonPrintableDefect], '', ['foo\x7Fbar']) - , - - test_get_comment_no_end_paren_after_non_ws = C( - '(foo bar', '(foo bar)', ' ', - [errors.InvalidHeaderDefect], '', ['foo bar']) - , - - test_get_comment_no_end_paren_after_ws = C( - '(foo bar ', '(foo bar )', ' ', - [errors.InvalidHeaderDefect], '', ['foo bar ']) - , - - test_get_comment_nested_comment = C( - '(foo(bar))', '(foo(bar))', ' ', [], '', ['foo(bar)']) - , - #self.assertEqual(comment[1].content, 'bar') - - test_get_comment_nested_comment_wsp = C( - '(foo ( bar ) 
)', '(foo ( bar ) )', ' ', [], '', ['foo ( bar ) ']) - , - #self.assertEqual(comment[2].content, ' bar ') - - test_get_comment_empty_comment = C( - '()', '()', ' ', [], '', ['']) - , - - test_get_comment_multiple_nesting = C( - '(((((foo)))))', '(((((foo)))))', ' ', [], '', ['((((foo))))']) - , - #for i in range(4, 0, -1): - # self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) - # comment = comment[0] - #self.assertEqual(comment.content, 'foo') - - test_get_comment_missing_end_of_nesting = C( - '(((((foo)))', '(((((foo)))))', ' ', - [errors.InvalidHeaderDefect]*2, '', ['((((foo))))']) - , - - test_get_comment_qs_in_nested_comment = C( - r'(foo (b\)))', r'(foo (b\)))', ' ', [], '', [r'foo (b\))']) - , - #self.assertEqual(comment[2].content, 'b)') + only = C( + '(comment)', + '(comment)', + ' ', + [], + '', + ['comment'], + ), + + must_start_with_paren_no_ws = C( + 'foo"', + exception=(errors.HeaderParseError, '.*'), + ), + + must_start_with_paren_ws = C( + ' (foo"', + exception=(errors.HeaderParseError, '.*'), + ), + + following_wsp_preserved = C( + '(comment) \t', + '(comment)', + ' ', + [], + ' \t', + ['comment'], + ), + + multiple_words = C( + '(foo bar) \t', + '(foo bar)', + ' ', + [], + ' \t', + ['foo bar'], + ), + + multiple_words_wsp_preserved = C( + '( foo bar\t ) \t', + '( foo bar\t )', + ' ', + [], + ' \t', + [' foo bar\t '], + ), + + end_paren_mid_word = C( + '(foo)bar', + '(foo)', + ' ', + [], + 'bar', + ['foo'], + ), + + quoted_parens = C( + r'(foo\) \(\)bar)', + r'(foo\) \(\)bar)', + ' ', + [], + '', + ['foo) ()bar'], + ), + + non_printable = C( + '(foo\x7Fbar)', + '(foo\x7Fbar)', + ' ', + [errors.NonPrintableDefect], + '', + ['foo\x7Fbar'], + ), + + no_end_paren_after_non_ws = C( + '(foo bar', + '(foo bar)', + ' ', + [errors.InvalidHeaderDefect], + '', + ['foo bar'], + ), + + no_end_paren_after_ws = C( + '(foo bar ', + '(foo bar )', + ' ', + [errors.InvalidHeaderDefect], + '', + ['foo bar '], + ), + + nested_comment = C( + '(foo(bar))', + 
'(foo(bar))', + ' ', + [], + '', + ['foo(bar)'], + ), + #self.assertEqual(comment[1].content, 'bar') + + nested_comment_wsp = C( + '(foo ( bar ) )', + '(foo ( bar ) )', + ' ', + [], + '', + ['foo ( bar ) '], + ), + #self.assertEqual(comment[2].content, ' bar ') + + empty_comment = C( + '()', + '()', + ' ', + [], + '', + [''], + ), + + multiple_nesting = C( + '(((((foo)))))', + '(((((foo)))))', + ' ', + [], + '', + ['((((foo))))'], + ), + #for i in range(4, 0, -1): + # self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) + # comment = comment[0] + #self.assertEqual(comment.content, 'foo') + + missing_end_of_nesting = C( + '(((((foo)))', + '(((((foo)))))', + ' ', + [errors.InvalidHeaderDefect]*2, + '', + ['((((foo))))'], + ), + + qs_in_nested_comment = C( + r'(foo (b\)))', + r'(foo (b\)))', + ' ', + [], + '', + [r'foo (b\))'] + ), + #self.assertEqual(comment[2].content, 'b)') ) From 5cf8b69cc943bf91a21fc625b9c4070cd56f10a3 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 18 Jan 2026 14:13:19 -0500 Subject: [PATCH 063/152] Convert get_comment tests to keyword form. 
--- .../test_email/test__header_value_parser.py | 104 +++++------------- 1 file changed, 28 insertions(+), 76 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index c218db82601f549..ca96d59e174ac53 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1588,11 +1588,12 @@ def test_get_bare_quoted_string(self, s, *args, **kw): # get_comment @params - def test_get_comment(self, s, *args, **kw): + def test_get_comment(self, s, *args, value=' ', **kw): cmt = self._test_parse( parser.get_comment, C(s), *args, + value=value, **kw, ) if 'exception' in kw: @@ -1605,11 +1606,7 @@ def test_get_comment(self, s, *args, **kw): only = C( '(comment)', - '(comment)', - ' ', - [], - '', - ['comment'], + comments=['comment'], ), must_start_with_paren_no_ws = C( @@ -1624,112 +1621,73 @@ def test_get_comment(self, s, *args, **kw): following_wsp_preserved = C( '(comment) \t', - '(comment)', - ' ', - [], - ' \t', - ['comment'], + remainder=' \t', + comments=['comment'], ), multiple_words = C( '(foo bar) \t', - '(foo bar)', - ' ', - [], - ' \t', - ['foo bar'], + remainder=' \t', + comments=['foo bar'], ), multiple_words_wsp_preserved = C( '( foo bar\t ) \t', - '( foo bar\t )', - ' ', - [], - ' \t', - [' foo bar\t '], + remainder=' \t', + comments=[' foo bar\t '], ), end_paren_mid_word = C( '(foo)bar', - '(foo)', - ' ', - [], - 'bar', - ['foo'], + remainder='bar', + comments=['foo'], ), quoted_parens = C( r'(foo\) \(\)bar)', - r'(foo\) \(\)bar)', - ' ', - [], - '', - ['foo) ()bar'], + comments=['foo) ()bar'], ), non_printable = C( '(foo\x7Fbar)', - '(foo\x7Fbar)', - ' ', - [errors.NonPrintableDefect], - '', - ['foo\x7Fbar'], + defects=[errors.NonPrintableDefect], + comments=['foo\x7Fbar'], ), no_end_paren_after_non_ws = C( '(foo bar', - '(foo bar)', - ' ', - [errors.InvalidHeaderDefect], - '', - ['foo bar'], + stringified='(foo bar)', + 
defects=[errors.InvalidHeaderDefect], + comments=['foo bar'], ), no_end_paren_after_ws = C( '(foo bar ', - '(foo bar )', - ' ', - [errors.InvalidHeaderDefect], - '', - ['foo bar '], + stringified='(foo bar )', + defects=[errors.InvalidHeaderDefect], + comments=['foo bar '], ), nested_comment = C( '(foo(bar))', - '(foo(bar))', - ' ', - [], - '', - ['foo(bar)'], + comments=['foo(bar)'], ), #self.assertEqual(comment[1].content, 'bar') nested_comment_wsp = C( '(foo ( bar ) )', - '(foo ( bar ) )', - ' ', - [], - '', - ['foo ( bar ) '], + comments=['foo ( bar ) '], ), #self.assertEqual(comment[2].content, ' bar ') empty_comment = C( '()', - '()', - ' ', - [], - '', - [''], + comments=[''], ), multiple_nesting = C( '(((((foo)))))', - '(((((foo)))))', - ' ', - [], - '', - ['((((foo))))'], + comments=['((((foo))))'], ), #for i in range(4, 0, -1): # self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) @@ -1738,20 +1696,14 @@ def test_get_comment(self, s, *args, **kw): missing_end_of_nesting = C( '(((((foo)))', - '(((((foo)))))', - ' ', - [errors.InvalidHeaderDefect]*2, - '', - ['((((foo))))'], + stringified='(((((foo)))))', + defects=[errors.InvalidHeaderDefect]*2, + comments=['((((foo))))'], ), qs_in_nested_comment = C( r'(foo (b\)))', - r'(foo (b\)))', - ' ', - [], - '', - [r'foo (b\))'] + comments=[r'foo (b\))'] ), #self.assertEqual(comment[2].content, 'b)') From a5077eec862f83fa95e200e4b760d8d35857a669 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 28 Feb 2026 11:17:27 -0500 Subject: [PATCH 064/152] Improve get_comment test names. I changed the '"' to a '(' in non_wsp_before_left_paren_is_error. That it was a '"' was a copy and paste error, it should have been changed to a '(' when I originally copied the test from the quoted_string tests. 
--- .../test_email/test__header_value_parser.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ca96d59e174ac53..2475fccec9224ef 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1604,22 +1604,22 @@ def test_get_comment(self, s, *args, value=' ', **kw): params_test_get_comment = old_api_only( - only = C( + simple_comment_only = C( '(comment)', comments=['comment'], ), - must_start_with_paren_no_ws = C( - 'foo"', + non_wsp_before_left_paren_is_error = C( + 'foo(', exception=(errors.HeaderParseError, '.*'), ), - must_start_with_paren_ws = C( + wsp_before_left_paren_is_error = C( ' (foo"', exception=(errors.HeaderParseError, '.*'), ), - following_wsp_preserved = C( + wsp_after_right_paren = C( '(comment) \t', remainder=' \t', comments=['comment'], @@ -1631,13 +1631,13 @@ def test_get_comment(self, s, *args, value=' ', **kw): comments=['foo bar'], ), - multiple_words_wsp_preserved = C( + wsp_runs_inside_comment = C( '( foo bar\t ) \t', remainder=' \t', comments=[' foo bar\t '], ), - end_paren_mid_word = C( + non_wsp_after_right_paren = C( '(foo)bar', remainder='bar', comments=['foo'], @@ -1654,14 +1654,14 @@ def test_get_comment(self, s, *args, value=' ', **kw): comments=['foo\x7Fbar'], ), - no_end_paren_after_non_ws = C( + no_right_paren_after_non_ws = C( '(foo bar', stringified='(foo bar)', defects=[errors.InvalidHeaderDefect], comments=['foo bar'], ), - no_end_paren_after_ws = C( + no_right_paren_after_ws = C( '(foo bar ', stringified='(foo bar )', defects=[errors.InvalidHeaderDefect], @@ -1694,14 +1694,14 @@ def test_get_comment(self, s, *args, value=' ', **kw): # comment = comment[0] #self.assertEqual(comment.content, 'foo') - missing_end_of_nesting = C( + multiple_mesting_missing_two_right_parens = C( '(((((foo)))', stringified='(((((foo)))))', 
defects=[errors.InvalidHeaderDefect]*2, comments=['((((foo))))'], ), - qs_in_nested_comment = C( + quoted_paren_in_nested_comment = C( r'(foo (b\)))', comments=[r'foo (b\))'] ), From 7d340427b17f5d6e61d5749526ed83f697887101 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 19 Jan 2026 16:43:04 -0500 Subject: [PATCH 065/152] Improve the get_comment tests. This adds a bit to the testing infrastructure so we can reproduce the extra assertions about the nested comment structure. As the comments say, there's no good nested comment API, so we're just hacking up a test that the comment nesting in the parsed data structure is correct. I've tweaked two of the tests to remove a test of the remainder that is redundant with the test before them, so that the tests can be reused later by get_cfws. There are also a few tests demonstrating that encoded words are incorrectly not decoded in comments. I'll fix this in the refactor and add more tests. Finally, one of the new tests demonstrates that get_comment has a small bug: BUGFIX: Previously get_comment would return an empty Comment object if passed an empty input. It now correctly raises a HeaderParseError just as it always has if there is no leading '(' in input. --- Lib/email/_header_value_parser.py | 2 +- .../test_email/test__header_value_parser.py | 168 +++++++++++++++--- 2 files changed, 146 insertions(+), 24 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ab9231c12fbacf3..b9b42639fdee64e 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1288,7 +1288,7 @@ def get_comment(value): We handle nested comments here, and quoted-pair in our qp-ctext routine. 
""" - if value and value[0] != '(': + if not value or value[0] != '(': raise errors.HeaderParseError( "expected '(' but found '{}'".format(value)) comment = Comment() diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 2475fccec9224ef..adae2e500f2419e 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -255,6 +255,7 @@ def _test_parse( remainder='', comments=None, *, + commenttree=None, exception=None, warnings=None, test_start=True, @@ -306,6 +307,10 @@ def _test_parse( Assert that the comments attribute of the returned object matches comments. + If commenttree is not None, assert that the comment tree of the + returned object matches it. XXX commenttree is an internal testing + hack, a real API is needed some day. + Assert that the defects attribute of the returned object matches defects. @@ -363,6 +368,8 @@ def _test_parse( self.assertEqual(result.value, value) self.assertDefectsMatch(result.all_defects, defects) self.assertEqual(result.comments, comments) + if commenttree is not None: + self.assertEqual(self.ctree(result), commenttree) return (result, *other) if other else result def verify_terminal_types(self, tl, *text_types): @@ -378,6 +385,36 @@ def verify_terminal_types(self, tl, *text_types): self.assertIsNotNone(t.token_type, t) self.verify_terminal_types(t, *text_types) + def ctree(self, tl, cnt=0): + """Return a testing-adequate depiction of the nested comments""" + if isinstance(tl, parser.Comment): + return self._ctree(tl) + comments = [] + for t in tl: + if isinstance(t, parser.Comment): + comments.append(self._ctree(t)) + elif isinstance(t, parser.TokenList): + comments.extend(self.ctree(t)) + return comments + + def _ctree(self, tl): + comments = [] + empty = True + text = '' + for t in tl: + if isinstance(t, parser.Comment): + if text: + comments.append(text) + text = '' + comments.append(self._ctree(t)) + empty = 
False + else: + text += str(t) + if text or empty: + comments.append(text) + return comments + + # XXX XXX temporary step-wise refactoring tool, goes away at end of refactor. @params_map(with_namelist=True) def old_api_only(nl, *args, **kw): @@ -1588,16 +1625,30 @@ def test_get_bare_quoted_string(self, s, *args, **kw): # get_comment @params - def test_get_comment(self, s, *args, value=' ', **kw): + def test_get_comment(self, + s, + *args, + value=' ', + comments=None, + content=None, + commenttree=None, + **kw): + if content is None: + content = comments[0] if comments else None + if commenttree is None: + commenttree = [content] cmt = self._test_parse( parser.get_comment, C(s), *args, value=value, + comments=comments, + commenttree=commenttree, **kw, ) if 'exception' in kw: return + self.assertEqual(cmt.content, content) self.assertIsInstance(cmt, parser.Comment) self.assertEqual(cmt.token_type, 'comment') self.verify_terminal_types(cmt, 'ptext', 'fws') @@ -1610,13 +1661,13 @@ def test_get_comment(self, s, *args, value=' ', **kw): ), non_wsp_before_left_paren_is_error = C( - 'foo(', - exception=(errors.HeaderParseError, '.*'), + 'foo"', + exception=(errors.HeaderParseError, r'(?=.*expected)(?=.*foo)'), ), wsp_before_left_paren_is_error = C( ' (foo"', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, r'(?=.*expected)(?=.* \(foo)'), ), wsp_after_right_paren = C( @@ -1626,14 +1677,12 @@ def test_get_comment(self, s, *args, value=' ', **kw): ), multiple_words = C( - '(foo bar) \t', - remainder=' \t', + '(foo bar)', comments=['foo bar'], ), wsp_runs_inside_comment = C( - '( foo bar\t ) \t', - remainder=' \t', + '( foo bar\t )', comments=[' foo bar\t '], ), @@ -1648,64 +1697,137 @@ def test_get_comment(self, s, *args, value=' ', **kw): comments=['foo) ()bar'], ), - non_printable = C( - '(foo\x7Fbar)', - defects=[errors.NonPrintableDefect], - comments=['foo\x7Fbar'], + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable 
= C( + '(foo{char}bar)', + defects=[(nonprintable_defect, '{char}')], + comments=['foo{char}bar'], + ), ), no_right_paren_after_non_ws = C( '(foo bar', stringified='(foo bar)', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_comment_defect], comments=['foo bar'], ), no_right_paren_after_ws = C( '(foo bar ', stringified='(foo bar )', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_comment_defect], comments=['foo bar '], ), nested_comment = C( '(foo(bar))', comments=['foo(bar)'], + commenttree=['foo', ['bar']], ), - #self.assertEqual(comment[1].content, 'bar') nested_comment_wsp = C( '(foo ( bar ) )', comments=['foo ( bar ) '], + commenttree=['foo ', [' bar '], ' '], ), - #self.assertEqual(comment[2].content, ' bar ') empty_comment = C( '()', comments=[''], + commenttree=[''], ), multiple_nesting = C( '(((((foo)))))', comments=['((((foo))))'], + commenttree=[[[[['foo']]]]], ), - #for i in range(4, 0, -1): - # self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) - # comment = comment[0] - #self.assertEqual(comment.content, 'foo') multiple_mesting_missing_two_right_parens = C( '(((((foo)))', stringified='(((((foo)))))', - defects=[errors.InvalidHeaderDefect]*2, + defects=[*[end_inside_comment_defect]*2], comments=['((((foo))))'], + commenttree=[[[[['foo']]]]], ), quoted_paren_in_nested_comment = C( r'(foo (b\)))', - comments=[r'foo (b\))'] + comments=[r'foo (b\))'], + commenttree=['foo ', ['b)']], + ), + + any_printable_may_be_escaped = C( + f"({''.join(fr'\{c}' for c in RFC_PRINTABLES)})", + stringified= + f"({RFC_PRINTABLES + .replace('\\', r'\\') + .replace('(', r'\(') + .replace(')', r'\)') + })", + comments=[RFC_PRINTABLES], + ), + + all_printables = C( + f"({RFC_PRINTABLES. 
+ replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)')})", + comments=[RFC_PRINTABLES], + ), + + multiple_nested_comments = C( + '(foo (nest 1) (nest 2 (nest 3)))', + comments=['foo (nest 1) (nest 2 (nest 3))'], + commenttree=['foo ', ['nest 1'], ' ', ['nest 2 ', ['nest 3']]], + ), + + nested_empty_comments = C( + '( () ( ( ) ) )', + comments=[' () ( ( ) ) '], + commenttree=[' ', [''], ' ', [' ', [' '], ' '], ' '], + ), + + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + ew_after_comment_no_ws = C( + '(foo)=?UTF-8?q?foo?=', + stringified='(foo)', + comments=['foo'], + remainder='=?UTF-8?q?foo?=', + ), + + # XXX XXX comments may contain EWs, but the current code is buggy. + # These will get decoded after the refactor is done. We'll add some + # some more test then, this is a target sample. + + ws_around_ew = C( + '( =?utf-8?q?test?= )', + #stringified='( test )', + comments=[' =?utf-8?q?test?= '], + #comments=[' test '], + ), + + ew_in_nested_comment = C( + '(foo (=?UTF-8?q?bar?=))', + #stringified='(foo (bar))', + comments=['foo (=?UTF-8?q?bar?=)'], + #comments=['foo (bar)'], + commenttree=['foo ', ['=?UTF-8?q?bar?=']], + #commenttree=['foo ', ['bar']], + ), + + ew_missing_whitespace = C( + '(=?UTF-8?q?foo?==?UTF-8?q?bar?=)', + #stringified='(foobar)', + comments=['=?UTF-8?q?foo?==?UTF-8?q?bar?='], + #comments=['foobar'], + #defects=[ + # missing_whitespace_after_ew_defect, + # missing_whitespace_before_ew_defect, + # ], ), - #self.assertEqual(comment[2].content, 'b)') ) From e8711086df72005028d0a453f074c781990d089d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 12:10:28 -0500 Subject: [PATCH 066/152] Begin conversion of get_cfws tests. We can re-use get_fws and get_comment tests to prove that get_cfws is correctly handling those subcases. Which it isn't, it turns out. Like get_fws, it fails to raise an error when pointed at a place in value that has no cfws. 
We'll handle it the same way, adding a deprecation warning during the refactor. --- .../test_email/test__header_value_parser.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index adae2e500f2419e..a31939346032a3f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -15,6 +15,7 @@ ) from test.test_email.params import ( C, + include_unless, params, Params, params_map, @@ -1834,6 +1835,54 @@ def test_get_comment(self, # get_cfws + @params + def test_get_cfws(self, s, *args, **kw): + kw.setdefault('value', ' ') + cfws = self._test_parse(parser.get_cfws, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(cfws, parser.CFWSList) + self.assertEqual(cfws.token_type, 'cfws') + self.verify_terminal_types(cfws, 'ptext', 'fws') + + # get_cfws should behave exactly the same as get_comment when parsing + # values containing just a comment. + @params_map(with_namelist=True) + def adapt_comment_tests_for_cfws(nl, s, *args, **kw): + # Our 'ctree' nested comment check returns a list of comments instead + # of just the single nested comment it does for Comment. + if 'commenttree' in kw: + kw['commenttree'] = [kw['commenttree']] + # XXX: get_cfws has the same bug that get_fws has: it does *not* raise + # an error if there is no cfws, and it should. + # XXX XXX Like get_fws, we'll deprecate this in the refactor. + if nl.has_any('empty', 'non_wsp_before_left_paren_is_error'): + kw.pop('exception') + kw['remainder'] = s + yield 'from_test_get_comment', C(s, *args, **kw) + + params_test_get_cfws = old_api_only( + + # get_cfws should behave exactly the same as get_fws when parsing + # whitespace only strings, except for the case of ending at a '(' + # because cfws *doesn't* end there. 
+ include_unless( + lambda n, *a, **k: 'left_parenthesis' in n, + label="from_test_get_fws", + )(params_test_get_fws), + + # get_cfws should behave exactly the same as get_comment when parsing + # values containing just a comment. Even the tests with remainders + # should pass if the remainder doesn't start with whitespace. + include_unless( + lambda n, *a, remainder=..., **k: + remainder is not ... + and remainder.startswith(tuple(RFC_WSP)) + or 'wsp_before_left_paren_is_error' in n + )(adapt_comment_tests_for_cfws(params_test_get_comment)), + + ) + def test_get_cfws_only_ws(self): cfws = self._test_get_x(parser.get_cfws, ' \t \t', ' \t \t', ' ', [], '', []) From a1778747a5efe1ef5e3798ab04ef726e3e6d8ccc Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 28 Feb 2026 14:17:45 -0500 Subject: [PATCH 067/152] Remove now-redundant tests. only_ws is covered by the fws test wsp_run. only_comment is covered by the comment test simple_comment_only. non_printable_in_comment is covered by the comment test non_printable. 
--- Lib/test/test_email/test__header_value_parser.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index a31939346032a3f..4513904dacb3a72 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1883,16 +1883,6 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): ) - def test_get_cfws_only_ws(self): - cfws = self._test_get_x(parser.get_cfws, - ' \t \t', ' \t \t', ' ', [], '', []) - self.assertEqual(cfws.token_type, 'cfws') - - def test_get_cfws_only_comment(self): - cfws = self._test_get_x(parser.get_cfws, - '(foo)', '(foo)', ' ', [], '', ['foo']) - self.assertEqual(cfws[0].content, 'foo') - def test_get_cfws_only_mixed(self): cfws = self._test_get_x(parser.get_cfws, ' (foo ) ( bar) ', ' (foo ) ( bar) ', ' ', [], '', @@ -1910,12 +1900,6 @@ def test_get_cfws_ends_at_non_printable(self): '(foo) \x07', '(foo) ', ' ', [], '\x07', ['foo']) self.assertEqual(cfws[0].content, 'foo') - def test_get_cfws_non_printable_in_comment(self): - cfws = self._test_get_x(parser.get_cfws, - '(foo \x07) "test"', '(foo \x07) ', ' ', - [errors.NonPrintableDefect], '"test"', ['foo \x07']) - self.assertEqual(cfws[0].content, 'foo \x07') - def test_get_cfws_header_ends_in_comment(self): cfws = self._test_get_x(parser.get_cfws, ' (foo ', ' (foo )', ' ', From 12dda7cb391df5089e90f0192b6a3d9cf1e710ff Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 12:19:39 -0500 Subject: [PATCH 068/152] Rough conversion of get_cfws tests. 
--- .../test_email/test__header_value_parser.py | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 4513904dacb3a72..591b9321b226423 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1837,7 +1837,8 @@ def test_get_comment(self, @params def test_get_cfws(self, s, *args, **kw): - kw.setdefault('value', ' ') + if len(args) < 2: + kw.setdefault('value', ' ') cfws = self._test_parse(parser.get_cfws, C(s), *args, **kw) if 'exception' in kw: return @@ -1881,37 +1882,38 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): or 'wsp_before_left_paren_is_error' in n )(adapt_comment_tests_for_cfws(params_test_get_comment)), - ) - - def test_get_cfws_only_mixed(self): - cfws = self._test_get_x(parser.get_cfws, + test_get_cfws_only_mixed = C( ' (foo ) ( bar) ', ' (foo ) ( bar) ', ' ', [], '', ['foo ', ' bar']) - self.assertEqual(cfws[1].content, 'foo ') - self.assertEqual(cfws[3].content, ' bar') + , + #self.assertEqual(cfws[1].content, 'foo ') + #self.assertEqual(cfws[3].content, ' bar') - def test_get_cfws_ends_at_non_leader(self): - cfws = self._test_get_x(parser.get_cfws, + test_get_cfws_ends_at_non_leader = C( '(foo) bar', '(foo) ', ' ', [], 'bar', ['foo']) - self.assertEqual(cfws[0].content, 'foo') + , + #self.assertEqual(cfws[0].content, 'foo') - def test_get_cfws_ends_at_non_printable(self): - cfws = self._test_get_x(parser.get_cfws, + test_get_cfws_ends_at_non_printable = C( '(foo) \x07', '(foo) ', ' ', [], '\x07', ['foo']) - self.assertEqual(cfws[0].content, 'foo') + , + #self.assertEqual(cfws[0].content, 'foo') - def test_get_cfws_header_ends_in_comment(self): - cfws = self._test_get_x(parser.get_cfws, + test_get_cfws_header_ends_in_comment = C( ' (foo ', ' (foo )', ' ', [errors.InvalidHeaderDefect], '', ['foo ']) - self.assertEqual(cfws[1].content, 'foo 
') + , + #self.assertEqual(cfws[1].content, 'foo ') - def test_get_cfws_multiple_nested_comments(self): - cfws = self._test_get_x(parser.get_cfws, + test_get_cfws_multiple_nested_comments = C( '(foo (bar)) ((a)(a))', '(foo (bar)) ((a)(a))', ' ', [], '', ['foo (bar)', '(a)(a)']) - self.assertEqual(cfws[0].comments, ['foo (bar)']) - self.assertEqual(cfws[2].comments, ['(a)(a)']) + , + #self.assertEqual(cfws[0].comments, ['foo (bar)']) + #self.assertEqual(cfws[2].comments, ['(a)(a)']) + + + ) # get_quoted_string From 2e0d1b51d4e32f4216cbbd093fa9f3d9eb59dc6e Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 13:58:35 -0500 Subject: [PATCH 069/152] Fix whitespace in get_cfws tests. The --color-words diff is useless if I also change the test names in this commit, so this step gets split. --- .../test_email/test__header_value_parser.py | 63 ++++++++++++------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 591b9321b226423..43be65ed4e2e6e8 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1883,35 +1883,56 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): )(adapt_comment_tests_for_cfws(params_test_get_comment)), test_get_cfws_only_mixed = C( - ' (foo ) ( bar) ', ' (foo ) ( bar) ', ' ', [], '', - ['foo ', ' bar']) - , - #self.assertEqual(cfws[1].content, 'foo ') - #self.assertEqual(cfws[3].content, ' bar') + ' (foo ) ( bar) ', + ' (foo ) ( bar) ', + ' ', + [], + '', + ['foo ', ' bar'], + ), + #self.assertEqual(cfws[1].content, 'foo ') + #self.assertEqual(cfws[3].content, ' bar') test_get_cfws_ends_at_non_leader = C( - '(foo) bar', '(foo) ', ' ', [], 'bar', ['foo']) - , - #self.assertEqual(cfws[0].content, 'foo') + '(foo) bar', + '(foo) ', + ' ', + [], + 'bar', + ['foo'], + ), + #self.assertEqual(cfws[0].content, 'foo') test_get_cfws_ends_at_non_printable = 
C( - '(foo) \x07', '(foo) ', ' ', [], '\x07', ['foo']) - , - #self.assertEqual(cfws[0].content, 'foo') + '(foo) \x07', + '(foo) ', + ' ', + [], + '\x07', + ['foo'], + ), + #self.assertEqual(cfws[0].content, 'foo') test_get_cfws_header_ends_in_comment = C( - ' (foo ', ' (foo )', ' ', - [errors.InvalidHeaderDefect], '', ['foo ']) - , - #self.assertEqual(cfws[1].content, 'foo ') + ' (foo ', + ' (foo )', + ' ', + [errors.InvalidHeaderDefect], + '', + ['foo '], + ), + #self.assertEqual(cfws[1].content, 'foo ') test_get_cfws_multiple_nested_comments = C( - '(foo (bar)) ((a)(a))', '(foo (bar)) ((a)(a))', ' ', [], - '', ['foo (bar)', '(a)(a)']) - , - #self.assertEqual(cfws[0].comments, ['foo (bar)']) - #self.assertEqual(cfws[2].comments, ['(a)(a)']) - + '(foo (bar)) ((a)(a))', + '(foo (bar)) ((a)(a))', + ' ', + [], + '', + ['foo (bar)', '(a)(a)'], + ), + #self.assertEqual(cfws[0].comments, ['foo (bar)']) + #self.assertEqual(cfws[2].comments, ['(a)(a)']) ) From d3f9ad3b951b81737e3fec682f32ca554f55281b Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 13:59:50 -0500 Subject: [PATCH 070/152] Shorten test names in get_cfws tests. 
--- Lib/test/test_email/test__header_value_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 43be65ed4e2e6e8..8524d3ce39229d6 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1882,7 +1882,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): or 'wsp_before_left_paren_is_error' in n )(adapt_comment_tests_for_cfws(params_test_get_comment)), - test_get_cfws_only_mixed = C( + only_mixed = C( ' (foo ) ( bar) ', ' (foo ) ( bar) ', ' ', @@ -1893,7 +1893,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): #self.assertEqual(cfws[1].content, 'foo ') #self.assertEqual(cfws[3].content, ' bar') - test_get_cfws_ends_at_non_leader = C( + ends_at_non_leader = C( '(foo) bar', '(foo) ', ' ', @@ -1903,7 +1903,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): ), #self.assertEqual(cfws[0].content, 'foo') - test_get_cfws_ends_at_non_printable = C( + ends_at_non_printable = C( '(foo) \x07', '(foo) ', ' ', @@ -1913,7 +1913,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): ), #self.assertEqual(cfws[0].content, 'foo') - test_get_cfws_header_ends_in_comment = C( + header_ends_in_comment = C( ' (foo ', ' (foo )', ' ', @@ -1923,7 +1923,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): ), #self.assertEqual(cfws[1].content, 'foo ') - test_get_cfws_multiple_nested_comments = C( + multiple_nested_comments = C( '(foo (bar)) ((a)(a))', '(foo (bar)) ((a)(a))', ' ', From 654d28622cdcaf5603696d3de8c516e0643895e6 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 14:24:54 -0500 Subject: [PATCH 071/152] Convert get_cfws tests to keyword form. 
--- .../test_email/test__header_value_parser.py | 37 +++++-------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 8524d3ce39229d6..2a105f816843d7b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1837,8 +1837,7 @@ def test_get_comment(self, @params def test_get_cfws(self, s, *args, **kw): - if len(args) < 2: - kw.setdefault('value', ' ') + kw.setdefault('value', ' ') cfws = self._test_parse(parser.get_cfws, C(s), *args, **kw) if 'exception' in kw: return @@ -1884,52 +1883,36 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): only_mixed = C( ' (foo ) ( bar) ', - ' (foo ) ( bar) ', - ' ', - [], - '', - ['foo ', ' bar'], + comments=['foo ', ' bar'], ), #self.assertEqual(cfws[1].content, 'foo ') #self.assertEqual(cfws[3].content, ' bar') ends_at_non_leader = C( '(foo) bar', - '(foo) ', - ' ', - [], - 'bar', - ['foo'], + remainder='bar', + comments=['foo'], ), #self.assertEqual(cfws[0].content, 'foo') ends_at_non_printable = C( '(foo) \x07', - '(foo) ', - ' ', - [], - '\x07', - ['foo'], + remainder='\x07', + comments=['foo'], ), #self.assertEqual(cfws[0].content, 'foo') header_ends_in_comment = C( ' (foo ', - ' (foo )', - ' ', - [errors.InvalidHeaderDefect], - '', - ['foo '], + stringified=' (foo )', + defects=[errors.InvalidHeaderDefect], + comments=['foo '], ), #self.assertEqual(cfws[1].content, 'foo ') multiple_nested_comments = C( '(foo (bar)) ((a)(a))', - '(foo (bar)) ((a)(a))', - ' ', - [], - '', - ['foo (bar)', '(a)(a)'], + comments=['foo (bar)', '(a)(a)'], ), #self.assertEqual(cfws[0].comments, ['foo (bar)']) #self.assertEqual(cfws[2].comments, ['(a)(a)']) From f0861d283b5a3a70a02cbf8f71d6098bd8b44326 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 14:25:36 -0500 Subject: [PATCH 072/152] Add commenttree checks to get_cfws tests. 
--- Lib/test/test_email/test__header_value_parser.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 2a105f816843d7b..d74fc8a2c23ff9b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1884,38 +1884,36 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): only_mixed = C( ' (foo ) ( bar) ', comments=['foo ', ' bar'], + commenttree=[['foo '], [' bar']], ), - #self.assertEqual(cfws[1].content, 'foo ') - #self.assertEqual(cfws[3].content, ' bar') ends_at_non_leader = C( '(foo) bar', remainder='bar', comments=['foo'], + commenttree=[['foo']], ), - #self.assertEqual(cfws[0].content, 'foo') ends_at_non_printable = C( '(foo) \x07', remainder='\x07', comments=['foo'], + commenttree=[['foo']], ), - #self.assertEqual(cfws[0].content, 'foo') header_ends_in_comment = C( ' (foo ', stringified=' (foo )', defects=[errors.InvalidHeaderDefect], comments=['foo '], + commenttree=[['foo ']], ), - #self.assertEqual(cfws[1].content, 'foo ') multiple_nested_comments = C( '(foo (bar)) ((a)(a))', comments=['foo (bar)', '(a)(a)'], + commenttree=[['foo ', ['bar']], [['a'], ['a']]], ), - #self.assertEqual(cfws[0].comments, ['foo (bar)']) - #self.assertEqual(cfws[2].comments, ['(a)(a)']) ) From be16155f5a18d48b0398f595bf71b084fc6852b5 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 16:08:00 -0500 Subject: [PATCH 073/152] Improve get_cfws tests. Two of the existing tests get combined into a single more comprehensive test. 
--- .../test_email/test__header_value_parser.py | 64 +++++++++++++++---- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index d74fc8a2c23ff9b..6773cfa01029068 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -52,6 +52,9 @@ # https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 RFC_SPECIALS = r'()<>[]:;@\,."' +# This isn't an RFC concept, but it is as useful in tests as it is in the code. +CFWS_LEADER = RFC_WSP + '(' + ALL_ASCII = bytes(range(0, 128)).decode('ascii') @@ -1881,30 +1884,30 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): or 'wsp_before_left_paren_is_error' in n )(adapt_comment_tests_for_cfws(params_test_get_comment)), - only_mixed = C( + mixed_comments_and_wsp = C( ' (foo ) ( bar) ', comments=['foo ', ' bar'], commenttree=[['foo '], [' bar']], ), - ends_at_non_leader = C( - '(foo) bar', - remainder='bar', - comments=['foo'], - commenttree=[['foo']], - ), - - ends_at_non_printable = C( - '(foo) \x07', - remainder='\x07', - comments=['foo'], - commenttree=[['foo']], + **for_each_character( + ALL_ASCII, + # XXX XXX skip things split considers whitespace. This is buggy. + # US RS GS FS + skip=CFWS_LEADER + '\r\n\v\f\x1f\x1e\x1d\x1c', + )( + ends_at_non_comment_non_ws = C( + '(foo) {char}', + remainder='{char}', + comments=['foo'], + commenttree=[['foo']], + ), ), header_ends_in_comment = C( ' (foo ', stringified=' (foo )', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_comment_defect], comments=['foo '], commenttree=[['foo ']], ), @@ -1915,6 +1918,39 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): commenttree=[['foo ', ['bar']], [['a'], ['a']]], ), + ew_after_comment_no_ws = C( + ' (bar) (foo)=?UTF-8?q?foo?=', + comments=['bar', 'foo'], + remainder='=?UTF-8?q?foo?=', + ), + + # XXX XXX these will get decoded after refactor is done. 
+ + ew_in_nested_comment = C( + ' (a) (foo (=?UTF-8?q?bar?=))', + #stringified=' (a) (foo (bar))', + comments=['a', 'foo (=?UTF-8?q?bar?=)'], + #comments=['a', 'foo (bar)'], + #commenttree=[('a', []), ('foo (bar)', [('bar', [])])], + ), + + ew_missing_whitespace = C( + '(=?UTF-8?q?foo?==?UTF-8?q?bar?=) (b)', + #stringified='(foobar) (b)', + comments=['=?UTF-8?q?foo?==?UTF-8?q?bar?=', 'b'], + #comments=['foobar', 'b'], + #defects=[ + # missing_whitespace_after_ew_defect, + # missing_whitespace_before_ew_defect, + # ], + ), + + nested_and_unnested_empty_comments = C( + '() (()) ( () ) ( ( ) )', + comments=['', '()', ' () ', ' ( ) '], + commenttree=[[''], [['']], [' ', [''], ' '], [' ', [' '], ' ']], + ), + ) # get_quoted_string From 53a03c2be0e30af276b8794a333a37574170a3a5 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 18:22:14 -0500 Subject: [PATCH 074/152] Start conversion of get_quoted_string tests. --- .../test_email/test__header_value_parser.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 6773cfa01029068..912e6c50ac1713d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1955,6 +1955,65 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): # get_quoted_string + @params + def test_get_quoted_string(self, s, *args, quoted_value, **kw): + qs = self._test_parse(parser.get_quoted_string, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertEqual(qs.quoted_value, quoted_value) + self.assertIsInstance(qs, parser.QuotedString) + self.assertEqual(qs.token_type, 'quoted-string') + self.verify_terminal_types(qs, 'ptext', 'fws') + + # get_quoted_string should pass any get_bare_quoted_string test that + # doesn't involve leading or trailing whitespace. 
+ @params_map + def adapt_bare_quoted_string_tests_for_get_quoted_string(s, *args, **kw): + r = kw.get('remainder', '') + if s.startswith(tuple(RFC_WSP)) or r.startswith(tuple(RFC_WSP)): + return + kw['quoted_value'] = kw.get('stringified', s[:-len(r)] if r else s) + yield 'from_test_bare_quoted_string', C(s, *args, **kw) + + # If there is no remainder a cfws test string should be valid as a quoted + # string prefix or suffix, with a few exceptions that test for what happens + # if closing parens are missing. + @params_map(with_namelist=True) + def adapt_get_cfws_tests_for_get_quoted_string( + nl, + s, + *args, + stringified=None, + remainder=None, + **kw, + ): + if remainder or nl.has_any( + 'multiple_mesting_missing_two_right_parens', + 'no_right_paren_after_non_ws', + 'no_right_paren_after_ws', + 'header_ends_in_comment', + ): + return + new_s = f'{s} "foo" {s}' + if stringified: + kw['stringified'] = f'{stringified} "foo" {stringified}' + kw['value'] = ' foo ' + kw['quoted_value'] = ' "foo" ' + for k in ('comments', 'commenttree', 'defects'): + if (v := kw.get(k)): + kw[k] = v * 2 + yield 'adapted_from_get_cfws', C(new_s, **kw) + + params_test_get_quoted_string = old_api_only( + + adapt_bare_quoted_string_tests_for_get_quoted_string( + params_test_get_bare_quoted_string, + ), + + adapt_get_cfws_tests_for_get_quoted_string(params_test_get_cfws), + + ) + def test_get_quoted_string_only(self): qs = self._test_get_x(parser.get_quoted_string, '"bob"', '"bob"', 'bob', [], '') From 099654a5034e73ad844abdb6545ee180bcb2649c Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 6 Mar 2026 14:12:43 -0500 Subject: [PATCH 075/152] Remove now redundant get_quoted_string test. quoted_string_only is covered by the first get_bare_quoted_string test. 
--- Lib/test/test_email/test__header_value_parser.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 912e6c50ac1713d..0403d7ead03fe00 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2014,13 +2014,6 @@ def adapt_get_cfws_tests_for_get_quoted_string( ) - def test_get_quoted_string_only(self): - qs = self._test_get_x(parser.get_quoted_string, - '"bob"', '"bob"', 'bob', [], '') - self.assertEqual(qs.token_type, 'quoted-string') - self.assertEqual(qs.quoted_value, '"bob"') - self.assertEqual(qs.content, 'bob') - def test_get_quoted_string_with_wsp(self): qs = self._test_get_x(parser.get_quoted_string, '\t "bob" ', '\t "bob" ', ' bob ', [], '') From 496cebfd5cd295e26be502b3a51107cde329901b Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 24 Jan 2026 09:22:10 -0500 Subject: [PATCH 076/152] Rough conversion of get_quoted_string tests. This set of tests doesn't check the comments even though it should have. Temporarily we'll bypass that check and restore it momentarily after specifying actual test values. The quoted_value checks commented out will get converted to keyword form shortly. --- .../test_email/test__header_value_parser.py | 123 ++++++++++-------- 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 0403d7ead03fe00..1a16c8c6ec636be 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -371,7 +371,9 @@ def _test_parse( if isinstance(result, parser.TokenList): self.assertEqual(result.value, value) self.assertDefectsMatch(result.all_defects, defects) - self.assertEqual(result.comments, comments) + # XXX XXX at the end of the refactor get rid of this conditional. 
+ if comments != ...: + self.assertEqual(result.comments, comments) if commenttree is not None: self.assertEqual(self.ctree(result), commenttree) return (result, *other) if other else result @@ -1956,11 +1958,15 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): # get_quoted_string @params - def test_get_quoted_string(self, s, *args, quoted_value, **kw): + def test_get_quoted_string(self, s, *args, quoted_value=None, **kw): + # XXX XXX temporary bypass of something not currently tested here + kw['comments'] = ... qs = self._test_parse(parser.get_quoted_string, C(s), *args, **kw) if 'exception' in kw: return - self.assertEqual(qs.quoted_value, quoted_value) + # XXX XXX temporary bypass pending test conversion to keyword form. + if quoted_value is not None: + self.assertEqual(qs.quoted_value, quoted_value) self.assertIsInstance(qs, parser.QuotedString) self.assertEqual(qs.token_type, 'quoted-string') self.verify_terminal_types(qs, 'ptext', 'fws') @@ -2012,85 +2018,88 @@ def adapt_get_cfws_tests_for_get_quoted_string( adapt_get_cfws_tests_for_get_quoted_string(params_test_get_cfws), - ) - - def test_get_quoted_string_with_wsp(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_with_wsp = C( '\t "bob" ', '\t "bob" ', ' bob ', [], '') - self.assertEqual(qs.quoted_value, ' "bob" ') - self.assertEqual(qs.content, 'bob') + , + #self.assertEqual(qs.quoted_value, ' "bob" ') + #self.assertEqual(qs.content, 'bob') - def test_get_quoted_string_with_comments_and_wsp(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_with_comments_and_wsp = C( ' (foo) "bob"(bar)', ' (foo) "bob"(bar)', ' bob ', [], '') - self.assertEqual(qs[0][1].content, 'foo') - self.assertEqual(qs[2][0].content, 'bar') - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') + , + #self.assertEqual(qs[0][1].content, 'foo') + #self.assertEqual(qs[2][0].content, 'bar') + #self.assertEqual(qs.content, 'bob') + 
#self.assertEqual(qs.quoted_value, ' "bob" ') - def test_get_quoted_string_with_multiple_comments(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_with_multiple_comments = C( ' (foo) (bar) "bob"(bird)', ' (foo) (bar) "bob"(bird)', ' bob ', [], '') - self.assertEqual(qs[0].comments, ['foo', 'bar']) - self.assertEqual(qs[2].comments, ['bird']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') + , + #self.assertEqual(qs[0].comments, ['foo', 'bar']) + #self.assertEqual(qs[2].comments, ['bird']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') - def test_get_quoted_string_non_printable_in_comment(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_non_printable_in_comment = C( ' (\x0A) "bob"', ' (\x0A) "bob"', ' bob', [errors.NonPrintableDefect], '') - self.assertEqual(qs[0].comments, ['\x0A']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob"') + , + #self.assertEqual(qs[0].comments, ['\x0A']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob"') - def test_get_quoted_string_non_printable_in_qcontent(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_non_printable_in_qcontent = C( ' (a) "a\x0B"', ' (a) "a\x0B"', ' a\x0B', [errors.NonPrintableDefect], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs.content, 'a\x0B') - self.assertEqual(qs.quoted_value, ' "a\x0B"') + , + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs.content, 'a\x0B') + #self.assertEqual(qs.quoted_value, ' "a\x0B"') - def test_get_quoted_string_internal_ws(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_internal_ws = C( ' (a) "foo bar "', ' (a) "foo bar "', ' foo bar ', [], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs.content, 'foo bar ') - self.assertEqual(qs.quoted_value, ' "foo bar 
"') + , + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs.content, 'foo bar ') + #self.assertEqual(qs.quoted_value, ' "foo bar "') - def test_get_quoted_string_header_ends_in_comment(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_header_ends_in_comment = C( ' (a) "bob" (a', ' (a) "bob" (a)', ' bob ', [errors.InvalidHeaderDefect], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs[2].comments, ['a']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') + , + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs[2].comments, ['a']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') - def test_get_quoted_string_header_ends_in_qcontent(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_header_ends_in_qcontent = C( ' (a) "bob', ' (a) "bob"', ' bob', [errors.InvalidHeaderDefect], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob"') + , + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob"') - def test_get_quoted_string_cfws_only_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_quoted_string(' (foo) ') + test_get_quoted_string_cfws_only_raises = C( + '(foo) ', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_quoted_string_no_quoted_string(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_quoted_string(' (ab) xyz') + test_get_quoted_string_no_quoted_string = C( + '(ab) xyz', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_quoted_string_qs_ends_at_noncfws(self): - qs = self._test_get_x(parser.get_quoted_string, + test_get_quoted_string_qs_ends_at_noncfws = C( '\t "bob" fee', '\t "bob" ', ' bob ', [], 'fee') - self.assertEqual(qs.content, 'bob') - 
self.assertEqual(qs.quoted_value, ' "bob" ') + , + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') + + ) + # get_atom From 8d7e3dfa132bef0e70fc72e52b8019b1228e48ab Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 24 Jan 2026 09:41:41 -0500 Subject: [PATCH 077/152] Fix whitespace and test names in get_quoted_string tests. Comment data added, comment check re-enabled. --- .../test_email/test__header_value_parser.py | 178 +++++++++++------- 1 file changed, 107 insertions(+), 71 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1a16c8c6ec636be..417e0bd1fb53f63 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1959,8 +1959,6 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): @params def test_get_quoted_string(self, s, *args, quoted_value=None, **kw): - # XXX XXX temporary bypass of something not currently tested here - kw['comments'] = ... qs = self._test_parse(parser.get_quoted_string, C(s), *args, **kw) if 'exception' in kw: return @@ -1971,6 +1969,7 @@ def test_get_quoted_string(self, s, *args, quoted_value=None, **kw): self.assertEqual(qs.token_type, 'quoted-string') self.verify_terminal_types(qs, 'ptext', 'fws') + # get_quoted_string should pass any get_bare_quoted_string test that # doesn't involve leading or trailing whitespace. 
@params_map @@ -2018,85 +2017,122 @@ def adapt_get_cfws_tests_for_get_quoted_string( adapt_get_cfws_tests_for_get_quoted_string(params_test_get_cfws), - test_get_quoted_string_with_wsp = C( - '\t "bob" ', '\t "bob" ', ' bob ', [], '') - , - #self.assertEqual(qs.quoted_value, ' "bob" ') - #self.assertEqual(qs.content, 'bob') - - test_get_quoted_string_with_comments_and_wsp = C( - ' (foo) "bob"(bar)', ' (foo) "bob"(bar)', ' bob ', [], '') - , - #self.assertEqual(qs[0][1].content, 'foo') - #self.assertEqual(qs[2][0].content, 'bar') - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') - - test_get_quoted_string_with_multiple_comments = C( - ' (foo) (bar) "bob"(bird)', ' (foo) (bar) "bob"(bird)', ' bob ', - [], '') - , - #self.assertEqual(qs[0].comments, ['foo', 'bar']) - #self.assertEqual(qs[2].comments, ['bird']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') - - test_get_quoted_string_non_printable_in_comment = C( - ' (\x0A) "bob"', ' (\x0A) "bob"', ' bob', - [errors.NonPrintableDefect], '') - , - #self.assertEqual(qs[0].comments, ['\x0A']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob"') + with_wsp = C( + '\t "bob" ', + '\t "bob" ', + ' bob ', + [], + '', + ), + #self.assertEqual(qs.quoted_value, ' "bob" ') + #self.assertEqual(qs.content, 'bob') - test_get_quoted_string_non_printable_in_qcontent = C( - ' (a) "a\x0B"', ' (a) "a\x0B"', ' a\x0B', - [errors.NonPrintableDefect], '') - , - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs.content, 'a\x0B') - #self.assertEqual(qs.quoted_value, ' "a\x0B"') + with_comments_and_wsp = C( + ' (foo) "bob"(bar)', + ' (foo) "bob"(bar)', + ' bob ', + [], + '', + ['foo', 'bar'], + ), + #self.assertEqual(qs[0][1].content, 'foo') + #self.assertEqual(qs[2][0].content, 'bar') + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') - test_get_quoted_string_internal_ws = C( - ' (a) 
"foo bar "', ' (a) "foo bar "', ' foo bar ', - [], '') - , - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs.content, 'foo bar ') - #self.assertEqual(qs.quoted_value, ' "foo bar "') + with_multiple_comments = C( + ' (foo) (bar) "bob"(bird)', + ' (foo) (bar) "bob"(bird)', + ' bob ', + [], + '', + ['foo', 'bar', 'bird'], + ), + #self.assertEqual(qs[0].comments, ['foo', 'bar']) + #self.assertEqual(qs[2].comments, ['bird']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') + + non_printable_in_comment = C( + ' (\x0A) "bob"', + ' (\x0A) "bob"', + ' bob', + [errors.NonPrintableDefect], + '', + ['\x0a'], + ), + #self.assertEqual(qs[0].comments, ['\x0A']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob"') - test_get_quoted_string_header_ends_in_comment = C( - ' (a) "bob" (a', ' (a) "bob" (a)', ' bob ', - [errors.InvalidHeaderDefect], '') - , - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs[2].comments, ['a']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') - - test_get_quoted_string_header_ends_in_qcontent = C( - ' (a) "bob', ' (a) "bob"', ' bob', - [errors.InvalidHeaderDefect], '') - , - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob"') + non_printable_in_qcontent = C( + ' (a) "a\x0B"', + ' (a) "a\x0B"', + ' a\x0B', + [errors.NonPrintableDefect], + '', + ['a'], + ), + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs.content, 'a\x0B') + #self.assertEqual(qs.quoted_value, ' "a\x0B"') - test_get_quoted_string_cfws_only_raises = C( - '(foo) ', + internal_ws = C( + ' (a) "foo bar "', + ' (a) "foo bar "', + ' foo bar ', + [], + '', + ['a'], + ), + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs.content, 'foo bar ') + #self.assertEqual(qs.quoted_value, ' "foo bar "') + + header_ends_in_comment = C( + ' (a) "bob" (a', + ' (a) "bob" 
(a)', + ' bob ', + [errors.InvalidHeaderDefect], + '', + ['a', 'a'], + ), + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs[2].comments, ['a']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') + + header_ends_in_qcontent = C( + ' (a) "bob', + ' (a) "bob"', + ' bob', + [errors.InvalidHeaderDefect], + '', + ['a'], + ), + #self.assertEqual(qs[0].comments, ['a']) + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob"') + + cfws_only_raises = C( + '(foo) ', exception=(errors.HeaderParseError, '.*'), ), - test_get_quoted_string_no_quoted_string = C( - '(ab) xyz', + no_quoted_string = C( + '(ab) xyz', exception=(errors.HeaderParseError, '.*'), ), - test_get_quoted_string_qs_ends_at_noncfws = C( - '\t "bob" fee', '\t "bob" ', ' bob ', [], 'fee') - , - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') + qs_ends_at_noncfws = C( + '\t "bob" fee', + '\t "bob" ', + ' bob ', + [], + 'fee', + ), + #self.assertEqual(qs.content, 'bob') + #self.assertEqual(qs.quoted_value, ' "bob" ') ) From 13f1727315817a48b180a17481c1cef509e98cf1 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 11:18:52 -0500 Subject: [PATCH 078/152] Convert get_quoted_string tests to keyword form. Converting the commented checks into commenttree and quoted_value checks. 
--- .../test_email/test__header_value_parser.py | 110 ++++++------------ 1 file changed, 36 insertions(+), 74 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 417e0bd1fb53f63..25c93273b786d39 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1962,9 +1962,7 @@ def test_get_quoted_string(self, s, *args, quoted_value=None, **kw): qs = self._test_parse(parser.get_quoted_string, C(s), *args, **kw) if 'exception' in kw: return - # XXX XXX temporary bypass pending test conversion to keyword form. - if quoted_value is not None: - self.assertEqual(qs.quoted_value, quoted_value) + self.assertEqual(qs.quoted_value, quoted_value) self.assertIsInstance(qs, parser.QuotedString) self.assertEqual(qs.token_type, 'quoted-string') self.verify_terminal_types(qs, 'ptext', 'fws') @@ -2019,100 +2017,67 @@ def adapt_get_cfws_tests_for_get_quoted_string( with_wsp = C( '\t "bob" ', - '\t "bob" ', - ' bob ', - [], - '', + value=' bob ', + quoted_value=' "bob" ', ), - #self.assertEqual(qs.quoted_value, ' "bob" ') - #self.assertEqual(qs.content, 'bob') with_comments_and_wsp = C( ' (foo) "bob"(bar)', - ' (foo) "bob"(bar)', - ' bob ', - [], - '', - ['foo', 'bar'], + value=' bob ', + quoted_value=' "bob" ', + comments=['foo', 'bar'], + commenttree=[['foo'], ['bar']], ), - #self.assertEqual(qs[0][1].content, 'foo') - #self.assertEqual(qs[2][0].content, 'bar') - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') with_multiple_comments = C( ' (foo) (bar) "bob"(bird)', - ' (foo) (bar) "bob"(bird)', - ' bob ', - [], - '', - ['foo', 'bar', 'bird'], + value=' bob ', + quoted_value=' "bob" ', + comments=['foo', 'bar', 'bird'], + commenttree=[['foo'], ['bar'], ['bird']], ), - #self.assertEqual(qs[0].comments, ['foo', 'bar']) - #self.assertEqual(qs[2].comments, ['bird']) - #self.assertEqual(qs.content, 'bob') - 
#self.assertEqual(qs.quoted_value, ' "bob" ') non_printable_in_comment = C( ' (\x0A) "bob"', - ' (\x0A) "bob"', - ' bob', - [errors.NonPrintableDefect], - '', - ['\x0a'], + value=' bob', + quoted_value=' "bob"', + defects=[errors.NonPrintableDefect], + comments=['\x0a'], ), - #self.assertEqual(qs[0].comments, ['\x0A']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob"') non_printable_in_qcontent = C( ' (a) "a\x0B"', - ' (a) "a\x0B"', - ' a\x0B', - [errors.NonPrintableDefect], - '', - ['a'], + value=' a\x0B', + quoted_value=' "a\x0B"', + defects=[errors.NonPrintableDefect], + comments=['a'], ), - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs.content, 'a\x0B') - #self.assertEqual(qs.quoted_value, ' "a\x0B"') internal_ws = C( ' (a) "foo bar "', - ' (a) "foo bar "', - ' foo bar ', - [], - '', - ['a'], + value=' foo bar ', + quoted_value=' "foo bar "', + comments=['a'], ), - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs.content, 'foo bar ') - #self.assertEqual(qs.quoted_value, ' "foo bar "') header_ends_in_comment = C( ' (a) "bob" (a', - ' (a) "bob" (a)', - ' bob ', - [errors.InvalidHeaderDefect], - '', - ['a', 'a'], + stringified=' (a) "bob" (a)', + value=' bob ', + quoted_value=' "bob" ', + defects=[errors.InvalidHeaderDefect], + comments=['a', 'a'], + commenttree=[['a'], ['a']], ), - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs[2].comments, ['a']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') header_ends_in_qcontent = C( ' (a) "bob', - ' (a) "bob"', - ' bob', - [errors.InvalidHeaderDefect], - '', - ['a'], + stringified=' (a) "bob"', + value=' bob', + quoted_value=' "bob"', + defects=[errors.InvalidHeaderDefect], + comments=['a'], ), - #self.assertEqual(qs[0].comments, ['a']) - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob"') cfws_only_raises = C( '(foo) ', @@ -2126,13 +2091,10 @@ def 
adapt_get_cfws_tests_for_get_quoted_string( qs_ends_at_noncfws = C( '\t "bob" fee', - '\t "bob" ', - ' bob ', - [], - 'fee', + value=' bob ', + quoted_value=' "bob" ', + remainder='fee', ), - #self.assertEqual(qs.content, 'bob') - #self.assertEqual(qs.quoted_value, ' "bob" ') ) From 14a5651d2e84bf7403ee55357b3327cc8f591225 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 13:57:19 -0500 Subject: [PATCH 079/152] Improve get_quoted_string tests. I changed the remainder for the quoted_dquote test to something that will make the parameters reusable in the get_phrase tests. --- .../test_email/test__header_value_parser.py | 62 +++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 25c93273b786d39..26a876a843f3bfa 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1542,9 +1542,9 @@ def test_get_bare_quoted_string(self, s, *args, **kw): ), quoted_dquote = C( - r'"foo\"in"a', + r'"foo\"in"@', value='foo"in', - remainder='a', + remainder='@', ), **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( @@ -2037,19 +2037,24 @@ def adapt_get_cfws_tests_for_get_quoted_string( commenttree=[['foo'], ['bar'], ['bird']], ), - non_printable_in_comment = C( - ' (\x0A) "bob"', - value=' bob', - quoted_value=' "bob"', - defects=[errors.NonPrintableDefect], - comments=['\x0a'], + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_comment = C( + ' ({char}) "bob"', + value=' bob', + quoted_value=' "bob"', + defects=[(nonprintable_defect, '{char}')], + comments=['{char}'], + ), ), + # all the non printables in qcontent are checked by the included + # bare_quoted_string tests, this one proves that the defect is + # correctly copied up even if there is also comment text involved. 
non_printable_in_qcontent = C( ' (a) "a\x0B"', value=' a\x0B', quoted_value=' "a\x0B"', - defects=[errors.NonPrintableDefect], + defects=[nonprintable_defect('\x0b')], comments=['a'], ), @@ -2065,7 +2070,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( stringified=' (a) "bob" (a)', value=' bob ', quoted_value=' "bob" ', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_comment_defect], comments=['a', 'a'], commenttree=[['a'], ['a']], ), @@ -2075,25 +2080,46 @@ def adapt_get_cfws_tests_for_get_quoted_string( stringified=' (a) "bob"', value=' bob', quoted_value=' "bob"', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_quoted_string_defect], comments=['a'], ), cfws_only_raises = C( '(foo) ', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, '(?i)expected'), ), no_quoted_string = C( '(ab) xyz', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, '(?=.*expected.*")(?=.*xyz)'), ), - qs_ends_at_noncfws = C( - '\t "bob" fee', - value=' bob ', - quoted_value=' "bob" ', - remainder='fee', + **for_each_character(RFC_PRINTABLES, skip='(')( + qs_ends_at_noncfws = C( + '\t "bob" {char}', + value=' bob ', + quoted_value=' "bob" ', + remainder='{char}', + ), + ), + + ew_after_dquote = C( + '"bob"=?UTF-8?q?foo?=', + value='bob', + quoted_value='"bob"', + remainder='=?UTF-8?q?foo?=', + ), + + empty_quotes_between_comments = C( + ' (a) "" (foo)', + value=' ', + quoted_value=' "" ', + comments=['a', 'foo'], + ), + + empty_input = C( + '', + exception=(errors.HeaderParseError, r'(?i)expected'), ), ) From ab2316560dc1b97144dbe386083c04ae9e0d61ec Mon Sep 17 00:00:00 2001 From: R David Murray Date: Wed, 8 Apr 2026 14:33:44 -0400 Subject: [PATCH 080/152] Add content checking to get_quoted_string tests. 
--- .../test_email/test__header_value_parser.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 26a876a843f3bfa..90c8feb9991632f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1958,10 +1958,18 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): # get_quoted_string @params - def test_get_quoted_string(self, s, *args, quoted_value=None, **kw): + def test_get_quoted_string( + self, + s, + *args, + content=None, + quoted_value=None, + **kw, + ): qs = self._test_parse(parser.get_quoted_string, C(s), *args, **kw) if 'exception' in kw: return + self.assertEqual(qs.content, content) self.assertEqual(qs.quoted_value, quoted_value) self.assertIsInstance(qs, parser.QuotedString) self.assertEqual(qs.token_type, 'quoted-string') @@ -1975,6 +1983,9 @@ def adapt_bare_quoted_string_tests_for_get_quoted_string(s, *args, **kw): r = kw.get('remainder', '') if s.startswith(tuple(RFC_WSP)) or r.startswith(tuple(RFC_WSP)): return + if not 'exception' in kw: + kw['quoted_value'] = kw.get('stringified', s[:-len(r)] if r else s) + kw['content'] = kw['value'] kw['quoted_value'] = kw.get('stringified', s[:-len(r)] if r else s) yield 'from_test_bare_quoted_string', C(s, *args, **kw) @@ -2002,6 +2013,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( kw['stringified'] = f'{stringified} "foo" {stringified}' kw['value'] = ' foo ' kw['quoted_value'] = ' "foo" ' + kw['content'] = 'foo' for k in ('comments', 'commenttree', 'defects'): if (v := kw.get(k)): kw[k] = v * 2 @@ -2019,12 +2031,14 @@ def adapt_get_cfws_tests_for_get_quoted_string( '\t "bob" ', value=' bob ', quoted_value=' "bob" ', + content='bob', ), with_comments_and_wsp = C( ' (foo) "bob"(bar)', value=' bob ', quoted_value=' "bob" ', + content='bob', comments=['foo', 'bar'], commenttree=[['foo'], ['bar']], ), @@ 
-2033,6 +2047,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( ' (foo) (bar) "bob"(bird)', value=' bob ', quoted_value=' "bob" ', + content='bob', comments=['foo', 'bar', 'bird'], commenttree=[['foo'], ['bar'], ['bird']], ), @@ -2042,6 +2057,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( ' ({char}) "bob"', value=' bob', quoted_value=' "bob"', + content='bob', defects=[(nonprintable_defect, '{char}')], comments=['{char}'], ), @@ -2054,6 +2070,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( ' (a) "a\x0B"', value=' a\x0B', quoted_value=' "a\x0B"', + content='a\x0B', defects=[nonprintable_defect('\x0b')], comments=['a'], ), @@ -2062,6 +2079,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( ' (a) "foo bar "', value=' foo bar ', quoted_value=' "foo bar "', + content='foo bar ', comments=['a'], ), @@ -2070,6 +2088,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( stringified=' (a) "bob" (a)', value=' bob ', quoted_value=' "bob" ', + content='bob', defects=[end_inside_comment_defect], comments=['a', 'a'], commenttree=[['a'], ['a']], @@ -2080,6 +2099,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( stringified=' (a) "bob"', value=' bob', quoted_value=' "bob"', + content='bob', defects=[end_inside_quoted_string_defect], comments=['a'], ), @@ -2099,6 +2119,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( '\t "bob" {char}', value=' bob ', quoted_value=' "bob" ', + content='bob', remainder='{char}', ), ), @@ -2107,6 +2128,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( '"bob"=?UTF-8?q?foo?=', value='bob', quoted_value='"bob"', + content='bob', remainder='=?UTF-8?q?foo?=', ), @@ -2114,6 +2136,7 @@ def adapt_get_cfws_tests_for_get_quoted_string( ' (a) "" (foo)', value=' ', quoted_value=' "" ', + content='', comments=['a', 'foo'], ), From a886459c9f1605c9361d889c277ff4ad5c77e2b6 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 15:44:38 -0500 Subject: [PATCH 081/152] Start refactoring get_atom tests. 
test_get_atom_only is more than replaced by running all the get_atext tests. --- .../test_email/test__header_value_parser.py | 53 +++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 90c8feb9991632f..ea60eb244f383c0 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2150,10 +2150,57 @@ def adapt_get_cfws_tests_for_get_quoted_string( # get_atom - def test_get_atom_only(self): - atom = self._test_get_x(parser.get_atom, - 'bob', 'bob', 'bob', [], '') + @params + def test_get_atom(self, s, *args, **kw): + atom = self._test_parse(parser.get_atom, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atom, parser.Atom) self.assertEqual(atom.token_type, 'atom') + self.verify_terminal_types(atom, 'atext', 'vtext', 'ptext', 'fws') + + # If there is no remainder a cfws test string should be valid as a atom + # prefix or suffix, with a few exceptions that test for what happens + # if closing parens are missing. + @params_map(with_namelist=True) + def adapt_get_cfws_tests_for_get_atom( + nl, + s, + *args, + stringified=None, + remainder=None, + **kw, + ): + if remainder or nl.has_any( + 'multiple_mesting_missing_two_right_parens', + 'no_right_paren_after_non_ws', + 'no_right_paren_after_ws', + 'header_ends_in_comment', + ): + return + new_s = f'{s} foo {s}' + if stringified: + kw['stringified'] = f'{stringified} foo {stringified}' + kw['value'] = ' foo ' + for k in ('comments', 'commenttree', 'defects'): + if (v := kw.get(k)): + kw[k] = v * 2 + yield 'adapted_from_get_cfws', C(new_s, **kw) + + params_test_get_atom = old_api_only( + + adapt_get_cfws_tests_for_get_atom(params_test_get_cfws), + + # get_atom should pass all the get_atext tests except for those + # involving leading or trailing whitespace. 
+ include_unless( + lambda n, s, *a, remainder='', **k: + s.startswith(tuple(CFWS_LEADER)) + or remainder.startswith(tuple(CFWS_LEADER)), + label='from_test_get_atext', + )(params_test_get_atext), + + ) def test_get_atom_with_wsp(self): self._test_get_x(parser.get_atom, From a30ee302fc998150d28af5c3584484a35c32402d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 16:02:32 -0500 Subject: [PATCH 082/152] Rough conversion of get_atom tests. --- .../test_email/test__header_value_parser.py | 82 +++++++++---------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ea60eb244f383c0..1cd69514d229490 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2200,65 +2200,63 @@ def adapt_get_cfws_tests_for_get_atom( label='from_test_get_atext', )(params_test_get_atext), - ) - - def test_get_atom_with_wsp(self): - self._test_get_x(parser.get_atom, + test_get_atom_with_wsp = C( '\t bob ', '\t bob ', ' bob ', [], '') + , - def test_get_atom_with_comments_and_wsp(self): - atom = self._test_get_x(parser.get_atom, - ' (foo) bob(bar)', ' (foo) bob(bar)', ' bob ', [], '') - self.assertEqual(atom[0][1].content, 'foo') - self.assertEqual(atom[2][0].content, 'bar') + test_get_atom_with_comments_and_wsp = C( + ' (foo) bob(bar)', ' (foo) bob(bar)', ' bob ', [], '', + comments=['foo', 'bar'], + ), - def test_get_atom_with_multiple_comments(self): - atom = self._test_get_x(parser.get_atom, + test_get_atom_with_multiple_comments = C( ' (foo) (bar) bob(bird)', ' (foo) (bar) bob(bird)', ' bob ', - [], '') - self.assertEqual(atom[0].comments, ['foo', 'bar']) - self.assertEqual(atom[2].comments, ['bird']) + [], '', + comments=['foo', 'bar', 'bird'], + ), - def test_get_atom_non_printable_in_comment(self): - atom = self._test_get_x(parser.get_atom, + test_get_atom_non_printable_in_comment = C( ' 
(\x0A) bob', ' (\x0A) bob', ' bob', - [errors.NonPrintableDefect], '') - self.assertEqual(atom[0].comments, ['\x0A']) + [errors.NonPrintableDefect], '', + comments=['\x0A'], + ), - def test_get_atom_non_printable_in_atext(self): - atom = self._test_get_x(parser.get_atom, + test_get_atom_non_printable_in_atext = C( ' (a) a\x0B', ' (a) a\x0B', ' a\x0B', - [errors.NonPrintableDefect], '') - self.assertEqual(atom[0].comments, ['a']) + [errors.NonPrintableDefect], '', + comments=['a'], + ), - def test_get_atom_header_ends_in_comment(self): - atom = self._test_get_x(parser.get_atom, + test_get_atom_header_ends_in_comment = C( ' (a) bob (a', ' (a) bob (a)', ' bob ', - [errors.InvalidHeaderDefect], '') - self.assertEqual(atom[0].comments, ['a']) - self.assertEqual(atom[2].comments, ['a']) + [errors.InvalidHeaderDefect], '', + comments=['a', 'a'], + ), - def test_get_atom_no_atom(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_atom(' (ab) ') + test_get_atom_no_atom = C( + ' (ab) ', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_atom_no_atom_before_special(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_atom(' (ab) @') + test_get_atom_no_atom_before_special = C( + ' (ab) @', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_atom_atom_ends_at_special(self): - atom = self._test_get_x(parser.get_atom, - ' (foo) bob(bar) @bang', ' (foo) bob(bar) ', ' bob ', [], '@bang') - self.assertEqual(atom[0].comments, ['foo']) - self.assertEqual(atom[2].comments, ['bar']) + test_get_atom_atom_ends_at_special = C( + ' (foo) bob(bar) @bang', ' (foo) bob(bar) ', ' bob ', [], '@bang', + comments=['foo', 'bar'], + ), - def test_get_atom_atom_ends_at_noncfws(self): - self._test_get_x(parser.get_atom, + test_get_atom_atom_ends_at_noncfws = C( 'bob fred', 'bob ', 'bob ', [], 'fred') + , - def test_get_atom_rfc2047_atom(self): - self._test_get_x(parser.get_atom, + test_get_atom_rfc2047_atom = C( '=?utf-8?q?=20bob?=', ' 
bob', ' bob', [], '') + , + + ) # get_dot_atom_text From 6a5a45fc896be78a06109d0f314a49ced37b485f Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 16:16:18 -0500 Subject: [PATCH 083/152] Fix whitespace and test names in get_atom tests. --- .../test_email/test__header_value_parser.py | 94 +++++++++++++------ 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1cd69514d229490..b8b69b58285bc9b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2200,61 +2200,93 @@ def adapt_get_cfws_tests_for_get_atom( label='from_test_get_atext', )(params_test_get_atext), - test_get_atom_with_wsp = C( - '\t bob ', '\t bob ', ' bob ', [], '') - , + with_wsp = C( + '\t bob ', + '\t bob ', + ' bob ', + [], + '', + ), - test_get_atom_with_comments_and_wsp = C( - ' (foo) bob(bar)', ' (foo) bob(bar)', ' bob ', [], '', + with_comments_and_wsp = C( + ' (foo) bob(bar)', + ' (foo) bob(bar)', + ' bob ', + [], + '', comments=['foo', 'bar'], ), - test_get_atom_with_multiple_comments = C( - ' (foo) (bar) bob(bird)', ' (foo) (bar) bob(bird)', ' bob ', - [], '', + with_multiple_comments = C( + ' (foo) (bar) bob(bird)', + ' (foo) (bar) bob(bird)', + ' bob ', + [], + '', comments=['foo', 'bar', 'bird'], ), - test_get_atom_non_printable_in_comment = C( - ' (\x0A) bob', ' (\x0A) bob', ' bob', - [errors.NonPrintableDefect], '', + non_printable_in_comment = C( + ' (\x0A) bob', + ' (\x0A) bob', + ' bob', + [errors.NonPrintableDefect], + '', comments=['\x0A'], ), - test_get_atom_non_printable_in_atext = C( - ' (a) a\x0B', ' (a) a\x0B', ' a\x0B', - [errors.NonPrintableDefect], '', + non_printable_in_atext = C( + ' (a) a\x0B', + ' (a) a\x0B', + ' a\x0B', + [errors.NonPrintableDefect], + '', comments=['a'], ), - test_get_atom_header_ends_in_comment = C( - ' (a) bob (a', ' (a) bob (a)', ' bob ', - 
[errors.InvalidHeaderDefect], '', + header_ends_in_comment = C( + ' (a) bob (a', + ' (a) bob (a)', + ' bob ', + [errors.InvalidHeaderDefect], + '', comments=['a', 'a'], ), - test_get_atom_no_atom = C( - ' (ab) ', - exception=(errors.HeaderParseError, '.*'), + no_atom = C( + ' (ab) ', + exception=(errors.HeaderParseError, '.*'), ), - test_get_atom_no_atom_before_special = C( - ' (ab) @', - exception=(errors.HeaderParseError, '.*'), + no_atom_before_special = C( + ' (ab) @', + exception=(errors.HeaderParseError, '.*'), ), - test_get_atom_atom_ends_at_special = C( - ' (foo) bob(bar) @bang', ' (foo) bob(bar) ', ' bob ', [], '@bang', + atom_ends_at_special = C( + ' (foo) bob(bar) @bang', + ' (foo) bob(bar) ', + ' bob ', + [], + '@bang', comments=['foo', 'bar'], ), - test_get_atom_atom_ends_at_noncfws = C( - 'bob fred', 'bob ', 'bob ', [], 'fred') - , + atom_ends_at_noncfws = C( + 'bob fred', + 'bob ', + 'bob ', + [], + 'fred', + ), - test_get_atom_rfc2047_atom = C( - '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') - , + rfc2047_atom = C( + '=?utf-8?q?=20bob?=', + ' bob', + ' bob', + [], + '', + ), ) From 0132b1888dd1bd35cc7e2bf32cc987974e760a55 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 16:19:40 -0500 Subject: [PATCH 084/152] Convert get_atom tests to keyword form. 
--- .../test_email/test__header_value_parser.py | 51 ++++++------------- 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b8b69b58285bc9b..387c6539904923e 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2202,54 +2202,40 @@ def adapt_get_cfws_tests_for_get_atom( with_wsp = C( '\t bob ', - '\t bob ', - ' bob ', - [], - '', + value=' bob ', ), with_comments_and_wsp = C( ' (foo) bob(bar)', - ' (foo) bob(bar)', - ' bob ', - [], - '', + value=' bob ', comments=['foo', 'bar'], ), with_multiple_comments = C( ' (foo) (bar) bob(bird)', - ' (foo) (bar) bob(bird)', - ' bob ', - [], - '', + value=' bob ', comments=['foo', 'bar', 'bird'], ), non_printable_in_comment = C( ' (\x0A) bob', - ' (\x0A) bob', - ' bob', - [errors.NonPrintableDefect], - '', + value=' bob', + defects=[errors.NonPrintableDefect], comments=['\x0A'], ), non_printable_in_atext = C( ' (a) a\x0B', - ' (a) a\x0B', - ' a\x0B', - [errors.NonPrintableDefect], - '', + value=' a\x0B', + defects=[errors.NonPrintableDefect], comments=['a'], ), header_ends_in_comment = C( ' (a) bob (a', - ' (a) bob (a)', - ' bob ', - [errors.InvalidHeaderDefect], - '', + stringified=' (a) bob (a)', + value=' bob ', + defects=[errors.InvalidHeaderDefect], comments=['a', 'a'], ), @@ -2265,27 +2251,20 @@ def adapt_get_cfws_tests_for_get_atom( atom_ends_at_special = C( ' (foo) bob(bar) @bang', - ' (foo) bob(bar) ', - ' bob ', - [], - '@bang', + value=' bob ', + remainder='@bang', comments=['foo', 'bar'], ), atom_ends_at_noncfws = C( 'bob fred', - 'bob ', - 'bob ', - [], - 'fred', + value='bob ', + remainder='fred', ), rfc2047_atom = C( '=?utf-8?q?=20bob?=', - ' bob', - ' bob', - [], - '', + stringified=' bob', ), ) From f1da65214fb80523491f76003c05329fafdb6934 Mon Sep 17 00:00:00 2001 From: "R. 
David Murray" Date: Mon, 26 Jan 2026 17:56:49 -0500 Subject: [PATCH 085/152] Improve the get_atom tests. --- .../test_email/test__header_value_parser.py | 148 ++++++++++++++---- 1 file changed, 121 insertions(+), 27 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 387c6539904923e..30bd19e16e9308c 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2217,56 +2217,150 @@ def adapt_get_cfws_tests_for_get_atom( comments=['foo', 'bar', 'bird'], ), - non_printable_in_comment = C( - ' (\x0A) bob', - value=' bob', - defects=[errors.NonPrintableDefect], - comments=['\x0A'], - ), + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_comment = C( + ' ({char}) bob', + value=' bob', + defects=[(nonprintable_defect, '{char}')], + comments=['{char}'], + ), + + non_printable_in_atext = C( + ' (a) a{char}', + value=' a{char}', + defects=[(nonprintable_defect, '{char}')], + comments=['a'], + ), - non_printable_in_atext = C( - ' (a) a\x0B', - value=' a\x0B', - defects=[errors.NonPrintableDefect], - comments=['a'], ), header_ends_in_comment = C( ' (a) bob (a', stringified=' (a) bob (a)', value=' bob ', - defects=[errors.InvalidHeaderDefect], + defects=[end_inside_comment_defect], comments=['a', 'a'], ), no_atom = C( ' (ab) ', - exception=(errors.HeaderParseError, '.*'), + exception=(errors.HeaderParseError, '(?i)expected'), ), - no_atom_before_special = C( - ' (ab) @', - exception=(errors.HeaderParseError, '.*'), - ), + **for_each_character(RFC_SPECIALS, skip='(')( + + no_atom_before_special = C( + ' (ab) {char}', + exception=( + errors.HeaderParseError, + '(?i)(?=.*expected)(?=.*{echar})', + ), + ), + + atom_ends_at_special = C( + ' (foo) bob(bar) {char}bang', + value=' bob ', + remainder='{char}bang', + comments=['foo', 'bar'], + ), - atom_ends_at_special = C( - ' (foo) bob(bar) @bang', - value=' bob ', - 
remainder='@bang', - comments=['foo', 'bar'], ), - atom_ends_at_noncfws = C( - 'bob fred', - value='bob ', - remainder='fred', + **for_each_character(RFC_PRINTABLES, skip='(')( + atom_ends_at_noncfws = C( + 'bob {char}', + value='bob ', + remainder='{char}', + ), ), - rfc2047_atom = C( + ew_only = C( '=?utf-8?q?=20bob?=', stringified=' bob', ), + ew_and_comments = C( + '(a) =?UTF-8?q?bob?= (b)', + stringified='(a) bob (b)', + value=' bob ', + comments=['a', 'b'], + ), + + # XXX XXX this should actually be two missing whitespace defects. + ew_and_comments_no_ws = C( + '(a)=?UTF-8?q?bob?=(b)', + stringified='(a)bob(b)', + value=' bob ', + comments=['a', 'b'], + defects=[ + #missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ), + + # XXX XXX ditto + ew_and_empty_comments_no_ws = C( + '()=?UTF-8?q?bob?=()', + stringified='()bob()', + value=' bob ', + comments=['', ''], + defects=[ + #missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ), + + # XXX Ideally this should have a defect for the specials. + **for_each_character(RFC_SPECIALS)( + ew_with_unencoded_special = C( + '=?UTF-8?q?bob{char}?= @foo', + stringified='bob{char} ', + remainder='@foo', + ), + ), + + ew_after_atom_no_ws = C( + 'foo@=?UTF-8?q?bob?=', + value='foo', + remainder='@=?UTF-8?q?bob?=', + ), + + # XXX XXX Technically these are correct as is but we're going to fix it + # to always decode the ews anyway, because most email software does. 
+ + multiple_ew_no_ws = C( + '=?UTF-8?q?foo?==?UTF-8?q?bar?=', + stringified='foo', + #stringified='foobar', + remainder='=?UTF-8?q?bar?=', + defects=[ + missing_whitespace_after_ew_defect, + #missing_whitespace_before_ew_defect, + ], + ), + + ew_in_middle_of_atom_text = C( + 'foo{=?UTF-8?q?foo?=}{=?UTF-8?q?bar?=}bar', + #stringified='foo{foo}{bar}bar', + #defects=[ + # missing_whitespace_before_ew_defect, + # missing_whitespace_after_ew_defect, + # missing_whitespace_before_ew_defect, + # missing_whitespace_after_ew_defect, + # ], + ), + + empty_comments_no_ws = C( + ' ()bob() ', + value=' bob ', + comments=['', ''], + ), + + all_non_special_printables_are_allowed = C( + f'{"".join(set(RFC_PRINTABLES) - set(RFC_SPECIALS))}@', + remainder='@', + ), + ) # get_dot_atom_text From 3a18d75aaa31ff6b846e1e9245b89c8248cc05f2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 1 Feb 2026 14:06:13 -0500 Subject: [PATCH 086/152] Start refactoring get_dot_atom_text tests. --- .../test_email/test__header_value_parser.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 30bd19e16e9308c..17497881c11152f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2365,7 +2365,27 @@ def adapt_get_cfws_tests_for_get_atom( # get_dot_atom_text - def test_get_dot_atom_text(self): + @params + def test_get_dot_atom_text(self, s, *args, **kw): + atom = self._test_parse(parser.get_dot_atom_text, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atom, parser.DotAtomText) + self.assertEqual(atom.token_type, 'dot-atom-text') + self.verify_terminal_types(atom, 'dot', 'atext') + + params_test_get_dot_atom_text = old_api_only( + + # a bare atext is valid in a dot-atom, so we should pass all the + # get_atext tests except the ones involving the dot. 
+ include_unless( + lambda n, *a, **k: 'full_stop' in n, + label='from_test_get_atext', + )(params_test_get_atext), + + ) + + def test_get_dot_atom_text_only(self): dot_atom_text = self._test_get_x(parser.get_dot_atom_text, 'foo.bar.bang', 'foo.bar.bang', 'foo.bar.bang', [], '') self.assertEqual(dot_atom_text.token_type, 'dot-atom-text') @@ -2383,11 +2403,15 @@ def test_get_dot_atom_text_raises_on_trailing_dot(self): with self.assertRaises(errors.HeaderParseError): parser.get_dot_atom_text('foo.bar.') - def test_get_dot_atom_text_raises_on_leading_non_atext(self): + def test_get_dot_atom_text_raises_on_leading_wsp(self): with self.assertRaises(errors.HeaderParseError): parser.get_dot_atom_text(' foo.bar') + + def test_get_dot_atom_text_raises_on_leading_at(self): with self.assertRaises(errors.HeaderParseError): parser.get_dot_atom_text('@foo.bar') + + def test_get_dot_atom_text_raises_on_leading_dquote(self): with self.assertRaises(errors.HeaderParseError): parser.get_dot_atom_text('"foo.bar"') From 919853f84d7d3a92a18eceefb2524cd24ac12ce2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 9 Mar 2026 10:12:11 -0400 Subject: [PATCH 087/152] Delete now redundant get_dot_atom_text test. "lone_atom_is_valid" is more than replaced by running all the get_atext tests. 
--- Lib/test/test_email/test__header_value_parser.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 17497881c11152f..8017c82c95f6924 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2391,10 +2391,6 @@ def test_get_dot_atom_text_only(self): self.assertEqual(dot_atom_text.token_type, 'dot-atom-text') self.assertEqual(len(dot_atom_text), 5) - def test_get_dot_atom_text_lone_atom_is_valid(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, - 'foo', 'foo', 'foo', [], '') - def test_get_dot_atom_text_raises_on_leading_dot(self): with self.assertRaises(errors.HeaderParseError): parser.get_dot_atom_text('.foo.bar') From a9b6576890156df004c0a8d77fa42e755695aac1 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 2 Feb 2026 11:42:58 -0500 Subject: [PATCH 088/152] Rough conversion of get_dot_atom_text tests. 
--- .../test_email/test__header_value_parser.py | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 8017c82c95f6924..fe9b4992d6ededa 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2383,41 +2383,45 @@ def test_get_dot_atom_text(self, s, *args, **kw): label='from_test_get_atext', )(params_test_get_atext), - ) - - def test_get_dot_atom_text_only(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, + test_get_dot_atom_text_only = C( 'foo.bar.bang', 'foo.bar.bang', 'foo.bar.bang', [], '') - self.assertEqual(dot_atom_text.token_type, 'dot-atom-text') - self.assertEqual(len(dot_atom_text), 5) + , - def test_get_dot_atom_text_raises_on_leading_dot(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('.foo.bar') + test_get_dot_atom_text_raises_on_leading_dot = C( + '.foo.bar', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_dot_atom_text_raises_on_trailing_dot(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('foo.bar.') + test_get_dot_atom_text_raises_on_trailing_dot = C( + 'foo.bar.', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_dot_atom_text_raises_on_leading_wsp(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text(' foo.bar') + test_get_dot_atom_text_raises_on_leading_wsp = C( + ' foo.bar', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_dot_atom_text_raises_on_leading_at(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('@foo.bar') + test_get_dot_atom_text_raises_on_leading_at = C( + '@foo.bar', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_dot_atom_text_raises_on_leading_dquote(self): - with self.assertRaises(errors.HeaderParseError): - 
parser.get_dot_atom_text('"foo.bar"') + test_get_dot_atom_text_raises_on_leading_dquote = C( + '"foo.bar"', + exception=(errors.HeaderParseError, '.*'), + ), - def test_get_dot_atom_text_trailing_text_preserved(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, + test_get_dot_atom_text_trailing_text_preserved = C( 'foo@bar', 'foo', 'foo', [], '@bar') + , - def test_get_dot_atom_text_trailing_ws_preserved(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, + test_get_dot_atom_text_trailing_ws_preserved = C( 'foo .bar', 'foo', 'foo', [], ' .bar') + , + + ) + # get_dot_atom From a941170c0d49af30196ed63457db61272534fc02 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 2 Feb 2026 11:49:55 -0500 Subject: [PATCH 089/152] Fix whitespace and test names in get_dot_atom_text tests. --- .../test_email/test__header_value_parser.py | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index fe9b4992d6ededa..e817fc328ff497f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2383,42 +2383,54 @@ def test_get_dot_atom_text(self, s, *args, **kw): label='from_test_get_atext', )(params_test_get_atext), - test_get_dot_atom_text_only = C( - 'foo.bar.bang', 'foo.bar.bang', 'foo.bar.bang', [], '') - , + only = C( + 'foo.bar.bang', + 'foo.bar.bang', + 'foo.bar.bang', + [], + '', + ), - test_get_dot_atom_text_raises_on_leading_dot = C( - '.foo.bar', - exception=(errors.HeaderParseError, '.*'), + raises_on_leading_dot = C( + '.foo.bar', + exception=(errors.HeaderParseError, '.*'), ), - test_get_dot_atom_text_raises_on_trailing_dot = C( - 'foo.bar.', - exception=(errors.HeaderParseError, '.*'), + raises_on_trailing_dot = C( + 'foo.bar.', + exception=(errors.HeaderParseError, '.*'), ), - test_get_dot_atom_text_raises_on_leading_wsp = C( - ' foo.bar', 
- exception=(errors.HeaderParseError, '.*'), + raises_on_leading_wsp = C( + ' foo.bar', + exception=(errors.HeaderParseError, '.*'), ), - test_get_dot_atom_text_raises_on_leading_at = C( - '@foo.bar', - exception=(errors.HeaderParseError, '.*'), + raises_on_leading_at = C( + '@foo.bar', + exception=(errors.HeaderParseError, '.*'), ), - test_get_dot_atom_text_raises_on_leading_dquote = C( - '"foo.bar"', - exception=(errors.HeaderParseError, '.*'), + raises_on_leading_dquote = C( + '"foo.bar"', + exception=(errors.HeaderParseError, '.*'), ), - test_get_dot_atom_text_trailing_text_preserved = C( - 'foo@bar', 'foo', 'foo', [], '@bar') - , + trailing_text_preserved = C( + 'foo@bar', + 'foo', + 'foo', + [], + '@bar', + ), - test_get_dot_atom_text_trailing_ws_preserved = C( - 'foo .bar', 'foo', 'foo', [], ' .bar') - , + trailing_ws_preserved = C( + 'foo .bar', + 'foo', + 'foo', + [], + ' .bar', + ), ) From d6aa840de6db0882385899b9dce124b3fb53453c Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 5 Feb 2026 14:46:02 -0500 Subject: [PATCH 090/152] Convert get_dot_atom_text tests to keyword form. 
--- Lib/test/test_email/test__header_value_parser.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e817fc328ff497f..36d0b3eba50bb56 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2385,10 +2385,6 @@ def test_get_dot_atom_text(self, s, *args, **kw): only = C( 'foo.bar.bang', - 'foo.bar.bang', - 'foo.bar.bang', - [], - '', ), raises_on_leading_dot = C( @@ -2418,18 +2414,12 @@ def test_get_dot_atom_text(self, s, *args, **kw): trailing_text_preserved = C( 'foo@bar', - 'foo', - 'foo', - [], - '@bar', + remainder='@bar', ), trailing_ws_preserved = C( 'foo .bar', - 'foo', - 'foo', - [], - ' .bar', + remainder=' .bar', ), ) From c0e47b35ed133d004a0652efd05bfd83114ebb9d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 5 Feb 2026 14:59:04 -0500 Subject: [PATCH 091/152] Improve the get_dot_atom_text tests, step 1. We can replace three tests that check the behavior of leading invalid characters with a for_each_character expression. 
--- .../test_email/test__header_value_parser.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 36d0b3eba50bb56..8e77b94508f9eb5 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2397,19 +2397,11 @@ def test_get_dot_atom_text(self, s, *args, **kw): exception=(errors.HeaderParseError, '.*'), ), - raises_on_leading_wsp = C( - ' foo.bar', - exception=(errors.HeaderParseError, '.*'), - ), - - raises_on_leading_at = C( - '@foo.bar', - exception=(errors.HeaderParseError, '.*'), - ), - - raises_on_leading_dquote = C( - '"foo.bar"', - exception=(errors.HeaderParseError, '.*'), + **for_each_character(RFC_SPECIALS + RFC_WSP)( + raises_on_leading_special_or_wsp = C( + '{char}foo.bar', + exception=(errors.HeaderParseError, r'expected.*{echar}foo\.'), + ), ), trailing_text_preserved = C( From 71f9bf50631a277cac7a1d9aea61ca32c20600a6 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 9 Mar 2026 10:40:17 -0400 Subject: [PATCH 092/152] Improve the get_dot_atom_text tests step 2. Replace the trailing text tests, similarly to the leading text tests, with a for_each_character expression. Superficially this is redundant with the similar get_atext test, but here we're checking that the presence of the '.' doesn't affect the outcome. 
--- Lib/test/test_email/test__header_value_parser.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 8e77b94508f9eb5..a2a8ce77ff842a0 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2404,14 +2404,11 @@ def test_get_dot_atom_text(self, s, *args, **kw): ), ), - trailing_text_preserved = C( - 'foo@bar', - remainder='@bar', - ), - - trailing_ws_preserved = C( - 'foo .bar', - remainder=' .bar', + **for_each_character(RFC_SPECIALS + RFC_WSP, skip='.')( + ends_at_special_or_wsp = C( + 'foo.bird{char}bar', + remainder='{char}bar', + ), ), ) From 9601d7145839d11847258506c1ea8bc0949c27e2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 9 Mar 2026 10:47:24 -0400 Subject: [PATCH 093/152] Improve get_dot_atom_text tests step 3: more tests. --- .../test_email/test__header_value_parser.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index a2a8ce77ff842a0..23c37e39c4f111f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2411,6 +2411,31 @@ def test_get_dot_atom_text(self, s, *args, **kw): ), ), + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_atext = C( + 'foo.{char}.bar', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + undecodable_characters = C( + 'foo.🎁.bar'.encode().decode('us-ascii', errors='surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + all_atext_characters_allowed = C( + RFC_ATEXT + '.' 
+ RFC_ATEXT + '@foo', + remainder = '@foo', + ), + + raises_on_paired_dots = C( + 'foo..bar', + exception=( + errors.HeaderParseError, + r'(?=.*expected)(?=.*atom)(?=.*\.\.bar)', + ), + ), + ) From 125cb2551551930c35ce09127804a946d1e33b31 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 8 Feb 2026 10:05:38 -0500 Subject: [PATCH 094/152] Begin refactoring get_dot_atom tests. BUGFIX: Previously passing an empty string to get_dot_atom would result in an IndexError. It now correctly results in a HeaderParseError. The get_dot_atom_only test is more than replaced by running the get_atom tests. --- Lib/email/_header_value_parser.py | 2 +- .../test_email/test__header_value_parser.py | 24 +++++++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index b9b42639fdee64e..85da45e1b5348ea 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1391,7 +1391,7 @@ def get_dot_atom(value): word.
""" dot_atom = DotAtom() - if value[0] in CFWS_LEADER: + if value and value[0] in CFWS_LEADER: token, value = get_cfws(value) dot_atom.append(token) if value.startswith('=?'): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 23c37e39c4f111f..eeff26e58ac8b94 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2441,11 +2441,25 @@ def test_get_dot_atom_text(self, s, *args, **kw): # get_dot_atom - def test_get_dot_atom_only(self): - dot_atom = self._test_get_x(parser.get_dot_atom, - 'foo.bar.bing', 'foo.bar.bing', 'foo.bar.bing', [], '') - self.assertEqual(dot_atom.token_type, 'dot-atom') - self.assertEqual(len(dot_atom), 1) + @params + def test_get_dot_atom(self, s, *args, **kw): + atom = self._test_parse(parser.get_dot_atom, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atom, parser.DotAtom) + self.assertEqual(atom.token_type, 'dot-atom') + self.verify_terminal_types(atom, 'dot', 'atext', 'ptext', 'fws', 'vtext') + + params_test_get_dot_atom = old_api_only( + + # Atom is a subset of dot atom, so get_dot_atom should pass any + # get_atom test except those involving the dot (full_stop). + include_unless( + lambda n, *a, **k: 'full_stop' in n, + label='from_test_get_atom', + )(params_test_get_atom), + + ) def test_get_dot_atom_with_wsp(self): self._test_get_x(parser.get_dot_atom, From 7485b003d9659f723d7fa7e2e142d465844489ac Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 8 Feb 2026 10:17:10 -0500 Subject: [PATCH 095/152] Rough conversion of get_dot_atom tests. 
--- .../test_email/test__header_value_parser.py | 55 +++++++++++-------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index eeff26e58ac8b94..0e8fa3f53d917ad 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2459,41 +2459,48 @@ def test_get_dot_atom(self, s, *args, **kw): label='from_test_get_atom', )(params_test_get_atom), - ) - - def test_get_dot_atom_with_wsp(self): - self._test_get_x(parser.get_dot_atom, + test_get_dot_atom_with_wsp = C( '\t foo.bar.bing ', '\t foo.bar.bing ', ' foo.bar.bing ', [], '') + , - def test_get_dot_atom_with_comments_and_wsp(self): - self._test_get_x(parser.get_dot_atom, + test_get_dot_atom_with_comments_and_wsp = C( ' (sing) foo.bar.bing (here) ', ' (sing) foo.bar.bing (here) ', - ' foo.bar.bing ', [], '') + ' foo.bar.bing ', [], '', + comments=['sing', 'here'], + ), - def test_get_dot_atom_space_ends_dot_atom(self): - self._test_get_x(parser.get_dot_atom, + test_get_dot_atom_space_ends_dot_atom = C( ' (sing) foo.bar .bing (here) ', ' (sing) foo.bar ', - ' foo.bar ', [], '.bing (here) ') + ' foo.bar ', [], '.bing (here) ', + comments=['sing'], + ), - def test_get_dot_atom_no_atom_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom(' (foo) ') + test_get_dot_atom_no_atom_raises = C( + ' (foo) ', + exception=(errors.HeaderParseError, '.*') + ), - def test_get_dot_atom_leading_dot_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom(' (foo) .bar') + test_get_dot_atom_leading_dot_raises = C( + ' (foo) .bar', + exception=(errors.HeaderParseError, '.*') + ), - def test_get_dot_atom_two_dots_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom('bar..bang') + test_get_dot_atom_two_dots_raises = C( + 'bar..bang', + exception=(errors.HeaderParseError, '.*') + 
), - def test_get_dot_atom_trailing_dot_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom(' (foo) bar.bang. foo') + test_get_dot_atom_trailing_dot_raises = C( + ' (foo) bar.bang. foo', + exception=(errors.HeaderParseError, '.*') + ), - def test_get_dot_atom_rfc2047_atom(self): - self._test_get_x(parser.get_dot_atom, + test_get_dot_atom_rfc2047_atom = C( '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') + , + + ) + # get_word (if this were black box we'd repeat all the qs/atom tests) From ea5a5419f2a165a8fed5998e40516f1a26a51623 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 8 Feb 2026 10:19:42 -0500 Subject: [PATCH 096/152] Fix whitespace and test names in get_dot_atom tests. --- .../test_email/test__header_value_parser.py | 62 ++++++++++++------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 0e8fa3f53d917ad..567891c31d5992a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2459,45 +2459,59 @@ def test_get_dot_atom(self, s, *args, **kw): label='from_test_get_atom', )(params_test_get_atom), - test_get_dot_atom_with_wsp = C( - '\t foo.bar.bing ', '\t foo.bar.bing ', ' foo.bar.bing ', [], '') - , + with_wsp = C( + '\t foo.bar.bing ', + '\t foo.bar.bing ', + ' foo.bar.bing ', + [], + '', + ), - test_get_dot_atom_with_comments_and_wsp = C( - ' (sing) foo.bar.bing (here) ', ' (sing) foo.bar.bing (here) ', - ' foo.bar.bing ', [], '', + with_comments_and_wsp = C( + ' (sing) foo.bar.bing (here) ', + ' (sing) foo.bar.bing (here) ', + ' foo.bar.bing ', + [], + '', comments=['sing', 'here'], ), - test_get_dot_atom_space_ends_dot_atom = C( - ' (sing) foo.bar .bing (here) ', ' (sing) foo.bar ', - ' foo.bar ', [], '.bing (here) ', + space_ends_dot_atom = C( + ' (sing) foo.bar .bing (here) ', + ' (sing) foo.bar ', + ' foo.bar ', + [], + '.bing 
(here) ', comments=['sing'], ), - test_get_dot_atom_no_atom_raises = C( - ' (foo) ', - exception=(errors.HeaderParseError, '.*') + no_atom_raises = C( + ' (foo) ', + exception=(errors.HeaderParseError, '.*') ), - test_get_dot_atom_leading_dot_raises = C( - ' (foo) .bar', - exception=(errors.HeaderParseError, '.*') + leading_dot_raises = C( + ' (foo) .bar', + exception=(errors.HeaderParseError, '.*') ), - test_get_dot_atom_two_dots_raises = C( - 'bar..bang', - exception=(errors.HeaderParseError, '.*') + two_dots_raises = C( + 'bar..bang', + exception=(errors.HeaderParseError, '.*') ), - test_get_dot_atom_trailing_dot_raises = C( - ' (foo) bar.bang. foo', - exception=(errors.HeaderParseError, '.*') + trailing_dot_raises = C( + ' (foo) bar.bang. foo', + exception=(errors.HeaderParseError, '.*') ), - test_get_dot_atom_rfc2047_atom = C( - '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') - , + rfc2047_atom = C( + '=?utf-8?q?=20bob?=', + ' bob', + ' bob', + [], + '', + ), ) From b86d5e897526102f3c57933ffe8f9e4888e57cfa Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 8 Feb 2026 10:21:27 -0500 Subject: [PATCH 097/152] Convert get_dot_atom tests to keyword form. And add regexes for the exceptions. 
--- .../test_email/test__header_value_parser.py | 29 ++++++------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 567891c31d5992a..428fbe1ee0c31d7 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2461,56 +2461,45 @@ def test_get_dot_atom(self, s, *args, **kw): with_wsp = C( '\t foo.bar.bing ', - '\t foo.bar.bing ', - ' foo.bar.bing ', - [], - '', + value=' foo.bar.bing ', ), with_comments_and_wsp = C( ' (sing) foo.bar.bing (here) ', - ' (sing) foo.bar.bing (here) ', - ' foo.bar.bing ', - [], - '', + value=' foo.bar.bing ', comments=['sing', 'here'], ), space_ends_dot_atom = C( ' (sing) foo.bar .bing (here) ', - ' (sing) foo.bar ', - ' foo.bar ', - [], - '.bing (here) ', + value=' foo.bar ', + remainder='.bing (here) ', comments=['sing'], ), no_atom_raises = C( ' (foo) ', - exception=(errors.HeaderParseError, '.*') + exception=(errors.HeaderParseError, r'expected') ), leading_dot_raises = C( ' (foo) .bar', - exception=(errors.HeaderParseError, '.*') + exception=(errors.HeaderParseError, r'expected.*\.bar') ), two_dots_raises = C( 'bar..bang', - exception=(errors.HeaderParseError, '.*') + exception=(errors.HeaderParseError, r'expected.*\.\.bang') ), trailing_dot_raises = C( ' (foo) bar.bang. foo', - exception=(errors.HeaderParseError, '.*') + exception=(errors.HeaderParseError, r'expected.*\. foo') ), rfc2047_atom = C( '=?utf-8?q?=20bob?=', - ' bob', - ' bob', - [], - '', + stringified=' bob', ), ) From ebb8d3ff27b9e5bb06d15b257eb892462df8ea7e Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 8 Feb 2026 13:06:36 -0500 Subject: [PATCH 098/152] Improve the get_dot_atom tests. 
--- .../test_email/test__header_value_parser.py | 78 ++++++++++++++++++- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 428fbe1ee0c31d7..3402f5231e37161 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2482,9 +2482,11 @@ def test_get_dot_atom(self, s, *args, **kw): exception=(errors.HeaderParseError, r'expected') ), - leading_dot_raises = C( - ' (foo) .bar', - exception=(errors.HeaderParseError, r'expected.*\.bar') + **for_each_character(RFC_SPECIALS, skip='(')( + leading_special_raises = C( + ' (foo) {char}bar', + exception=(errors.HeaderParseError, r'(?i)expected.*{echar}bar') + ), ), two_dots_raises = C( @@ -2502,6 +2504,76 @@ def test_get_dot_atom(self, s, *args, **kw): stringified=' bob', ), + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_atext = C( + 'foo.{char}.bar', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + undecodable_characters = C( + 'foo.🎁.bar'.encode().decode('us-ascii', errors='surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + **for_each_character(RFC_SPECIALS, skip='.(')( + ends_at_special = C( + '(hey)foo.bar{char}.bird', + value=' foo.bar', + remainder='{char}.bird', + comments=['hey'], + ), + ), + + **for_each_character(RFC_SPECIALS, skip='(')( + ends_at_special_after_comment = C( + '(hey)foo.bar(hey){char} bird', + value=' foo.bar ', + remainder='{char} bird', + comments=['hey', 'hey'], + ), + ), + + two_ew_two_atoms = C( + '(hey) =?UTF-8?q?foo?= =?UTF-8?q?bar?=', + stringified='(hey) foo ', + value=' foo ', + remainder='=?UTF-8?q?bar?=', + comments=['hey'], + ), + + # XXX XXX These additional EW cases not already tested by the atom + # tests will be fully decoded after refactoring. 
+ + mixed_ews_and_atext = C( + '(hey)foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar (hey)', + #stringified='(hey)foo.bar foobar.foobar (hey)', + value=' foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar ', + #value=' foo.bar foobar.foobar ', + #defects=[ + # missing_whitespace_before_ew_defect, + # missing_whitespace_after_ew_defect, + # missing_whitespace_before_ew_defect, + # missing_whitespace_after_ew_defect, + # ], + comments=['hey', 'hey'], + ), + + two_ew_with_dot = C( + '=?UTF-8?q?foo?=.=?UTF-8?q?bar?=(hey)', + stringified='foo', + #stringified='foo.bar(hey)', + value='foo', + #value='foo.bar ', + remainder='.=?UTF-8?q?bar?=(hey)', + defects=[ + missing_whitespace_after_ew_defect, + # missing_whitespace_before_ew_defect, + # missing_whitespace_after_ew_defect, + ], + #comments=['hey'], + ), + ) From 19b89bdaafa7e087cd0dfdc4664692ee89a6b40d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 13 Feb 2026 15:19:53 -0500 Subject: [PATCH 099/152] Refactor get_word tests. Now we test it like a black box, and do run the atom and quoted string tests (most of them) against get_word. These tests more than replace the existing tests: atom_yields_atom is replaced by the whole set of get_atom tests, qs_yields_qs likewise by the get_quoted_string tests, ends_at_dot is tested by the get_atom atom_ends_at_special test, and all_CFWS is tested by both get_atom's 'no_atom' test and get_quoted_string's cfws_only_raises. The asserts are safe to delete because they are either now superfluous (checked in the test method) or they are a bit misguided. At the get_word level we don't really have any business constraining the quoted-string token list to contain a bare-quoted-string. Since get_word parses *either* an atom *or* a quoted string, there are no additional tests that need to be added. There's a fix for a bug revealed by re-using the test parameters: BUGFIX: previously get_word would raise an IndexError if passed an empty string; it now correctly raises a HeaderParseError.
--- Lib/email/_header_value_parser.py | 2 +- .../test_email/test__header_value_parser.py | 87 ++++++++++++++----- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 85da45e1b5348ea..afccea8fcb87735 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1425,7 +1425,7 @@ def get_word(value): parse tree is more confusing than it is helpful. """ - if value[0] in CFWS_LEADER: + if value and value[0] in CFWS_LEADER: leader, value = get_cfws(value) else: leader = None diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 3402f5231e37161..f7f4214817d9756 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2577,31 +2577,72 @@ def test_get_dot_atom(self, s, *args, **kw): ) - # get_word (if this were black box we'd repeat all the qs/atom tests) + # get_word - def test_get_word_atom_yields_atom(self): - word = self._test_get_x(parser.get_word, - ' (foo) bar (bang) :ah', ' (foo) bar (bang) ', ' bar ', [], ':ah') - self.assertEqual(word.token_type, 'atom') - self.assertEqual(word[0].token_type, 'cfws') + @params + def test_get_word( + self, + s, + *args, + quoted_value=None, + content=None, + tokenlisttype, + **kw, + ): + word = self._test_parse(parser.get_word, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(word, tokenlisttype) + if quoted_value is not None: + self.assertEqual(word.quoted_value, quoted_value) + if content is not None: + self.assertEqual(word.content, content) + self.verify_terminal_types(word, 'dot', 'atext', 'ptext', 'fws', 'vtext') + + @params_map + def adapt_get_atom_tests_for_get_word(*args, **kw): + kw['tokenlisttype'] = parser.TokenList + yield '', C(*args, **kw) + + @params_map + def adapt_get_quoted_string_tests_for_get_word(*args, **kw): + kw['tokenlisttype'] = 
parser.QuotedString + yield '', C(*args, **kw) + + params_test_get_word = old_api_only( + + # A word can be an atom, so get_word should pass many of the atom tests. + adapt_get_atom_tests_for_get_word( + include_unless( + lambda n, *a, **k: + # For get_atom a leading quotation mark means there is no + # atom and is therefor an error, but get_word will treat it + # as a quoted_string. Quoted strings are tested below. + n.has_any( + 'no_atom_before_special', + 'no_atext_before_special_or_wsp', + ) + and 'quotation_mark' in n, + label='from_test_get_atom', + )(params_test_get_atom), + ), + + # Or it can be a quoted string, so should pass most quoted_string tests. + adapt_get_quoted_string_tests_for_get_word( + include_unless( + lambda n, *a, **k: + # These tests have an atom first; get_quoted_string raises + # for that, but get_word parses it. Atoms are tested above. + n.has_any( + 'no_quoted_string', + 'no_leading_dquote_before_non_ws', + ), + label='from_test_get_quoted_string', + )(params_test_get_quoted_string), + ), + + ) - def test_get_word_all_CFWS(self): - # bpo-29412: Test that we don't raise IndexError when parsing CFWS only - # token. - with self.assertRaises(errors.HeaderParseError): - parser.get_word('(Recipients list suppressed') - - def test_get_word_qs_yields_qs(self): - word = self._test_get_x(parser.get_word, - '"bar " (bang) ah', '"bar " (bang) ', 'bar ', [], 'ah') - self.assertEqual(word.token_type, 'quoted-string') - self.assertEqual(word[0].token_type, 'bare-quoted-string') - self.assertEqual(word[0].value, 'bar ') - self.assertEqual(word.content, 'bar ') - - def test_get_word_ends_at_dot(self): - self._test_get_x(parser.get_word, - 'foo.', 'foo', 'foo', [], '.') # get_phrase From bce55f0a7dab882d8e2e0c5471dc722e877c4f7d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 20 Feb 2026 15:52:06 -0500 Subject: [PATCH 100/152] Begin refactoring get_phrase tests. 
Running the get_word tests reveals that get_phrase is buggy: it does not raise if there are no words, and per the RFC it should. get_display actually depends on this behavior. It's for an error case, though, so handling a raise from get_phrase would actually make more sense. We'll deprecate this behavior of get_phrase in the refactored code. 'get_phrase_cfws_only_raises', which was supposed to test the buggy behavior, is mis-named and never ran or revealed the bug. This changeset deletes it because it will fail currently, and the tests that are skipped by this changeset will test for the correct behavior after the refactoring. --- .../test_email/test__header_value_parser.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f7f4214817d9756..bd3bd1bb285bbf9 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2646,6 +2646,62 @@ def adapt_get_quoted_string_tests_for_get_word(*args, **kw): # get_phrase + @params + def test_get_phrase(self, s, *args, **kw): + phrase = self._test_parse(parser.get_phrase, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(phrase, parser.Phrase) + self.verify_terminal_types(phrase, 'dot', 'atext', 'ptext', 'fws', 'vtext') + + @params_map(with_namelist=True) + def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): + kw.pop('tokenlisttype') + kw.pop('quoted_value', None) + kw.pop('content', None) + # XXX XXX A phrase has to have at least one word, but the current code + # does not enforce this. We'll fix this in the refactor, but for now + # we skip the parameters that expect a raise on a value with no + # content. 
+ if nl.has_any( + 'empty', + 'no_atom_before_special', + 'no_atom', + 'no_atext_before_special_or_wsp', + 'cfws_only_raises', + 'empty_input', + ): + return + yield '', C(*args, **kw) + + params_test_get_phrase = old_api_only( + + # A phrase is a series of words, and single words are valid, + # so get_phrase should pass many of the get_word tests. + adapt_get_word_tests_for_get_phrase( + include_unless( + lambda n, *a, remainder=False, **k: + n.has_any( + # A phrase only ends at specials other than " and . + 'atom_ends_at_noncfws', + 'qs_ends_at_noncfws', + 'ew_after_dquote', + 'encoded_word_after_dquote_with_no_ws', + 'end_dquote_mid_word', + # XXX XXX This test should pass after refactoring. + 'multiple_ew_no_ws', + ) + # A phrase does *not* end at a period or a quotation mark. + or remainder and n.has_any( + 'full_stop', + 'quotation_mark', + ), + label='from_test_get_word', + )(params_test_get_word), + ), + + ) + def test_get_phrase_simple(self): phrase = self._test_get_x(parser.get_phrase, '"Fred A. Johnson" is his name, oh.', From bb51f8b638aa1e3e7e8359b1280247c30bf7aa78 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 11:51:25 -0400 Subject: [PATCH 101/152] Rough conversion of get_phrase tests. --- .../test_email/test__header_value_parser.py | 97 ++++++++----------- 1 file changed, 39 insertions(+), 58 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index bd3bd1bb285bbf9..5ac4a03b5981daf 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2700,133 +2700,114 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): )(params_test_get_word), ), - ) - - def test_get_phrase_simple(self): - phrase = self._test_get_x(parser.get_phrase, + test_get_phrase_simple = C( '"Fred A. Johnson" is his name, oh.', '"Fred A. Johnson" is his name', 'Fred A. 
Johnson is his name', [], ', oh.') - self.assertEqual(phrase.token_type, 'phrase') + , - def test_get_phrase_complex(self): - phrase = self._test_get_x(parser.get_phrase, + test_get_phrase_complex = C( ' (A) bird (in (my|your)) "hand " is messy\t<>\t', ' (A) bird (in (my|your)) "hand " is messy\t', ' bird hand is messy ', [], - '<>\t') - self.assertEqual(phrase[0][0].comments, ['A']) - self.assertEqual(phrase[0][2].comments, ['in (my|your)']) + '<>\t', + ['A', 'in (my|your)'], + ), - def test_get_phrase_obsolete(self): - phrase = self._test_get_x(parser.get_phrase, + test_get_phrase_obsolete = C( 'Fred A.(weird).O Johnson', 'Fred A.(weird).O Johnson', 'Fred A. .O Johnson', [errors.ObsoleteHeaderDefect]*3, - '') - self.assertEqual(len(phrase), 7) - self.assertEqual(phrase[3].comments, ['weird']) + '', + ['weird'], + ), + #self.assertEqual(len(phrase), 7) - def test_get_phrase_pharse_must_start_with_word(self): - phrase = self._test_get_x(parser.get_phrase, + test_get_phrase_pharse_must_start_with_word = C( '(even weirder).name', '(even weirder).name', ' .name', [errors.InvalidHeaderDefect] + [errors.ObsoleteHeaderDefect]*2, - '') - self.assertEqual(len(phrase), 3) - self.assertEqual(phrase[0].comments, ['even weirder']) + '', + ['even weirder'], + ), + #self.assertEqual(len(phrase), 3) - def test_get_phrase_ending_with_obsolete(self): - phrase = self._test_get_x(parser.get_phrase, + test_get_phrase_ending_with_obsolete = C( 'simple phrase.(with trailing comment):boo', 'simple phrase.(with trailing comment)', 'simple phrase. 
', [errors.ObsoleteHeaderDefect]*2, - ':boo') - self.assertEqual(len(phrase), 4) - self.assertEqual(phrase[3].comments, ['with trailing comment']) - - def get_phrase_cfws_only_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_phrase(' (foo) ') + ':boo', + ['with trailing comment'], + ), + #self.assertEqual(len(phrase), 4) - def test_get_phrase_adjacent_ew(self): + test_get_phrase_adjacent_ew = C( # "'linear-white-space' that separates a pair of adjacent # 'encoded-word's is ignored" (rfc2047 section 6.2) - self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '') + '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '') + , - def test_get_phrase_adjacent_ew_different_encodings(self): - self._test_get_x( - parser.get_phrase, + test_get_phrase_adjacent_ew_different_encodings = C( '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], '' - ) + ), - def test_get_phrase_adjacent_ew_encoded_spaces(self): - self._test_get_x( - parser.get_phrase, + test_get_phrase_adjacent_ew_encoded_spaces = C( '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', 'Encoded spaces preserved', 'Encoded spaces preserved', [], '' - ) + ), - def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self): - self._test_get_x( - parser.get_phrase, + test_get_phrase_adjacent_ew_comment_is_not_linear_white_space = C( '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=', 'Comment (is not) linear-white-space', 'Comment linear-white-space', [], '', comments=['is not'], - ) + ), - def test_get_phrase_adjacent_ew_no_error_on_defects(self): - self._test_get_x( - parser.get_phrase, + test_get_phrase_adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', 'Defect still joins', 'Defect still joins', [errors.InvalidHeaderDefect], # whitespace inside encoded word '' - ) + ), - def test_get_phrase_adjacent_ew_ignore_non_ew(self): - self._test_get_x( - 
parser.get_phrase, + test_get_phrase_adjacent_ew_ignore_non_ew = C( '=?ascii?q?No?= =?join?= for non-ew', 'No =?join?= for non-ew', 'No =?join?= for non-ew', [], '' - ) + ), - def test_get_phrase_adjacent_ew_ignore_invalid_ew(self): - self._test_get_x( - parser.get_phrase, + test_get_phrase_adjacent_ew_ignore_invalid_ew = C( '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew', 'No =?ascii?rot13?wbva= for invalid ew', 'No =?ascii?rot13?wbva= for invalid ew', [], '' - ) + ), - def test_get_phrase_adjacent_ew_missing_space(self): - self._test_get_x( - parser.get_phrase, + test_get_phrase_adjacent_ew_missing_space = C( '=?ascii?q?Joi?==?ascii?q?ned?=', 'Joined', 'Joined', [errors.InvalidHeaderDefect], # missing trailing whitespace '' + ), ) + # get_local_part def test_get_local_part_simple(self): From 3bd7f121c69bad146635ef1bac3feb8fb7b1f2d5 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 12:34:24 -0400 Subject: [PATCH 102/152] Fix whitespace and test names in get_phrase tests. --- .../test_email/test__header_value_parser.py | 59 +++++++++++-------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 5ac4a03b5981daf..2787046fb09a623 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2700,15 +2700,15 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): )(params_test_get_word), ), - test_get_phrase_simple = C( + simple = C( '"Fred A. Johnson" is his name, oh.', '"Fred A. Johnson" is his name', 'Fred A. 
Johnson is his name', [], - ', oh.') - , + ', oh.', + ), - test_get_phrase_complex = C( + complex = C( ' (A) bird (in (my|your)) "hand " is messy\t<>\t', ' (A) bird (in (my|your)) "hand " is messy\t', ' bird hand is messy ', @@ -2717,7 +2717,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): ['A', 'in (my|your)'], ), - test_get_phrase_obsolete = C( + obsolete = C( 'Fred A.(weird).O Johnson', 'Fred A.(weird).O Johnson', 'Fred A. .O Johnson', @@ -2727,7 +2727,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): ), #self.assertEqual(len(phrase), 7) - test_get_phrase_pharse_must_start_with_word = C( + must_start_with_word = C( '(even weirder).name', '(even weirder).name', ' .name', @@ -2737,7 +2737,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): ), #self.assertEqual(len(phrase), 3) - test_get_phrase_ending_with_obsolete = C( + ending_with_obsolete = C( 'simple phrase.(with trailing comment):boo', 'simple phrase.(with trailing comment)', 'simple phrase. ', @@ -2747,64 +2747,73 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): ), #self.assertEqual(len(phrase), 4) - test_get_phrase_adjacent_ew = C( # "'linear-white-space' that separates a pair of adjacent # 'encoded-word's is ignored" (rfc2047 section 6.2) - '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '') - , - test_get_phrase_adjacent_ew_different_encodings = C( - '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], '' - ), + adjacent_ew = C( + '=?ascii?q?Joi?= \t =?ascii?q?ned?=', + 'Joined', + 'Joined', + [], + '', + ), - test_get_phrase_adjacent_ew_encoded_spaces = C( + adjacent_ew_different_encodings = C( + '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', + 'Bérénice', + 'Bérénice', + [], + '' + ), + + adjacent_ew_encoded_spaces = C( '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', 'Encoded spaces preserved', 'Encoded spaces preserved', [], '' - ), + ), - 
test_get_phrase_adjacent_ew_comment_is_not_linear_white_space = C( + adjacent_ew_comment_is_not_linear_white_space = C( '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=', 'Comment (is not) linear-white-space', 'Comment linear-white-space', [], '', comments=['is not'], - ), + ), - test_get_phrase_adjacent_ew_no_error_on_defects = C( + adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', 'Defect still joins', 'Defect still joins', [errors.InvalidHeaderDefect], # whitespace inside encoded word '' - ), + ), - test_get_phrase_adjacent_ew_ignore_non_ew = C( + adjacent_ew_ignore_non_ew = C( '=?ascii?q?No?= =?join?= for non-ew', 'No =?join?= for non-ew', 'No =?join?= for non-ew', [], '' - ), + ), - test_get_phrase_adjacent_ew_ignore_invalid_ew = C( + adjacent_ew_ignore_invalid_ew = C( '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew', 'No =?ascii?rot13?wbva= for invalid ew', 'No =?ascii?rot13?wbva= for invalid ew', [], '' - ), + ), - test_get_phrase_adjacent_ew_missing_space = C( + adjacent_ew_missing_space = C( '=?ascii?q?Joi?==?ascii?q?ned?=', 'Joined', 'Joined', [errors.InvalidHeaderDefect], # missing trailing whitespace '' - ), + ), ) From d51948bbac38a4a61be21d6dda53ea64aa3317a6 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 12:36:39 -0400 Subject: [PATCH 103/152] Convert get_phrase tests to keyword form. --- .../test_email/test__header_value_parser.py | 83 ++++++------------- 1 file changed, 27 insertions(+), 56 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 2787046fb09a623..76782eaa8891eed 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2702,48 +2702,39 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): simple = C( '"Fred A. Johnson" is his name, oh.', - '"Fred A. Johnson" is his name', - 'Fred A. 
Johnson is his name', - [], - ', oh.', + value='Fred A. Johnson is his name', + remainder=', oh.', ), complex = C( ' (A) bird (in (my|your)) "hand " is messy\t<>\t', - ' (A) bird (in (my|your)) "hand " is messy\t', - ' bird hand is messy ', - [], - '<>\t', - ['A', 'in (my|your)'], + value=' bird hand is messy ', + remainder='<>\t', + comments=['A', 'in (my|your)'], ), obsolete = C( 'Fred A.(weird).O Johnson', - 'Fred A.(weird).O Johnson', - 'Fred A. .O Johnson', - [errors.ObsoleteHeaderDefect]*3, - '', - ['weird'], + value='Fred A. .O Johnson', + defects=[errors.ObsoleteHeaderDefect]*3, + comments=['weird'], ), #self.assertEqual(len(phrase), 7) must_start_with_word = C( '(even weirder).name', - '(even weirder).name', - ' .name', - [errors.InvalidHeaderDefect] + [errors.ObsoleteHeaderDefect]*2, - '', - ['even weirder'], + value=' .name', + defects=[errors.InvalidHeaderDefect] + [errors.ObsoleteHeaderDefect]*2, + comments=['even weirder'], ), #self.assertEqual(len(phrase), 3) ending_with_obsolete = C( 'simple phrase.(with trailing comment):boo', - 'simple phrase.(with trailing comment)', - 'simple phrase. ', - [errors.ObsoleteHeaderDefect]*2, - ':boo', - ['with trailing comment'], + value='simple phrase. 
', + defects=[errors.ObsoleteHeaderDefect]*2, + remainder=':boo', + comments=['with trailing comment'], ), #self.assertEqual(len(phrase), 4) @@ -2752,68 +2743,48 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): adjacent_ew = C( '=?ascii?q?Joi?= \t =?ascii?q?ned?=', - 'Joined', - 'Joined', - [], - '', + stringified='Joined', ), adjacent_ew_different_encodings = C( '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', - 'Bérénice', - 'Bérénice', - [], - '' + stringified='Bérénice', ), adjacent_ew_encoded_spaces = C( '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', - 'Encoded spaces preserved', - 'Encoded spaces preserved', - [], - '' + stringified='Encoded spaces preserved', ), adjacent_ew_comment_is_not_linear_white_space = C( '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=', - 'Comment (is not) linear-white-space', - 'Comment linear-white-space', - [], - '', + stringified='Comment (is not) linear-white-space', + value='Comment linear-white-space', comments=['is not'], ), adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', - 'Defect still joins', - 'Defect still joins', - [errors.InvalidHeaderDefect], # whitespace inside encoded word - '' + stringified='Defect still joins', + defects=[errors.InvalidHeaderDefect], # whitespace inside encoded word ), adjacent_ew_ignore_non_ew = C( '=?ascii?q?No?= =?join?= for non-ew', - 'No =?join?= for non-ew', - 'No =?join?= for non-ew', - [], - '' + stringified='No =?join?= for non-ew', ), adjacent_ew_ignore_invalid_ew = C( '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew', - 'No =?ascii?rot13?wbva= for invalid ew', - 'No =?ascii?rot13?wbva= for invalid ew', - [], - '' + stringified='No =?ascii?rot13?wbva= for invalid ew', ), adjacent_ew_missing_space = C( '=?ascii?q?Joi?==?ascii?q?ned?=', - 'Joined', - 'Joined', - [errors.InvalidHeaderDefect], # missing trailing whitespace - '' + stringified='Joined', + defects=[errors.InvalidHeaderDefect], # missing trailing 
whitespace ), + ) From 9559b0bcb65f324fcf44745499ff271d5dc752d1 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 12:45:46 -0400 Subject: [PATCH 104/152] Add equivalent of get_phrase test length check. It doesn't make sense to constrain the length here, since changes at a lower layer could affect the exact number of tokens. The reason these checks existed was to make sure that the obsolete dots are their own token, so I converted to doing that check instead of a length check. This way we also make sure there are no unexpected dot tokens. --- Lib/test/test_email/test__header_value_parser.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 76782eaa8891eed..858274c4bbce83d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2647,11 +2647,16 @@ def adapt_get_quoted_string_tests_for_get_word(*args, **kw): # get_phrase @params - def test_get_phrase(self, s, *args, **kw): + def test_get_phrase(self, s, *args, obs_dots=0, **kw): phrase = self._test_parse(parser.get_phrase, C(s), *args, **kw) if 'exception' in kw: return self.assertIsInstance(phrase, parser.Phrase) + self.assertEqual( + len([x for x in phrase if x.token_type == 'dot']), + obs_dots, + phrase.ppstr(), + ) self.verify_terminal_types(phrase, 'dot', 'atext', 'ptext', 'fws', 'vtext') @params_map(with_namelist=True) @@ -2718,16 +2723,16 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value='Fred A. 
.O Johnson', defects=[errors.ObsoleteHeaderDefect]*3, comments=['weird'], + obs_dots=2, ), - #self.assertEqual(len(phrase), 7) must_start_with_word = C( '(even weirder).name', value=' .name', defects=[errors.InvalidHeaderDefect] + [errors.ObsoleteHeaderDefect]*2, comments=['even weirder'], + obs_dots=1, ), - #self.assertEqual(len(phrase), 3) ending_with_obsolete = C( 'simple phrase.(with trailing comment):boo', @@ -2735,8 +2740,8 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): defects=[errors.ObsoleteHeaderDefect]*2, remainder=':boo', comments=['with trailing comment'], + obs_dots=1, ), - #self.assertEqual(len(phrase), 4) # "'linear-white-space' that separates a pair of adjacent # 'encoded-word's is ignored" (rfc2047 section 6.2) From 5328ef8f844617b2622e57fa7c37793cb06ad018 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 12:56:33 -0400 Subject: [PATCH 105/152] Improve the get_phrase test defect checking. --- .../test_email/test__header_value_parser.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 858274c4bbce83d..a69b61a18e5645a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2721,7 +2721,10 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): obsolete = C( 'Fred A.(weird).O Johnson', value='Fred A. 
.O Johnson', - defects=[errors.ObsoleteHeaderDefect]*3, + defects=[ + *[period_in_phrase_obs_defect]*2, + comment_without_atom_in_phrase_obs_defect, + ], comments=['weird'], obs_dots=2, ), @@ -2729,7 +2732,11 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): must_start_with_word = C( '(even weirder).name', value=' .name', - defects=[errors.InvalidHeaderDefect] + [errors.ObsoleteHeaderDefect]*2, + defects=[ + non_word_phrase_start_defect, + comment_without_atom_in_phrase_obs_defect, + period_in_phrase_obs_defect, + ], comments=['even weirder'], obs_dots=1, ), @@ -2737,7 +2744,10 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): ending_with_obsolete = C( 'simple phrase.(with trailing comment):boo', value='simple phrase. ', - defects=[errors.ObsoleteHeaderDefect]*2, + defects=[ + period_in_phrase_obs_defect, + comment_without_atom_in_phrase_obs_defect, + ], remainder=':boo', comments=['with trailing comment'], obs_dots=1, @@ -2771,7 +2781,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', stringified='Defect still joins', - defects=[errors.InvalidHeaderDefect], # whitespace inside encoded word + defects=[whitespace_inside_ew_defect], ), adjacent_ew_ignore_non_ew = C( @@ -2787,7 +2797,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): adjacent_ew_missing_space = C( '=?ascii?q?Joi?==?ascii?q?ned?=', stringified='Joined', - defects=[errors.InvalidHeaderDefect], # missing trailing whitespace + defects=[missing_whitespace_after_ew_defect], ), ) From cdfbab741a21f2727cdcb2d0ad3336335e9e3ac5 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 12:59:43 -0400 Subject: [PATCH 106/152] Improve the get_phrase test names. 
--- Lib/test/test_email/test__header_value_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index a69b61a18e5645a..78324cc95ae54fd 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2705,20 +2705,20 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): )(params_test_get_word), ), - simple = C( + simple_phrase = C( '"Fred A. Johnson" is his name, oh.', value='Fred A. Johnson is his name', remainder=', oh.', ), - complex = C( + complex_phrase = C( ' (A) bird (in (my|your)) "hand " is messy\t<>\t', value=' bird hand is messy ', remainder='<>\t', comments=['A', 'in (my|your)'], ), - obsolete = C( + obsolete_phrase = C( 'Fred A.(weird).O Johnson', value='Fred A. .O Johnson', defects=[ @@ -2729,7 +2729,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): obs_dots=2, ), - must_start_with_word = C( + should_start_with_word = C( '(even weirder).name', value=' .name', defects=[ @@ -2741,7 +2741,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): obs_dots=1, ), - ending_with_obsolete = C( + obsolete_ending = C( 'simple phrase.(with trailing comment):boo', value='simple phrase. ', defects=[ From e834e009f2bc34f95ed4d50c5da00222a9a79616 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 11 Mar 2026 13:33:28 -0400 Subject: [PATCH 107/152] Additional tests for get_phrase. 
--- .../test_email/test__header_value_parser.py | 83 ++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 78324cc95ae54fd..1db1b28c04cf841 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2800,7 +2800,88 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): defects=[missing_whitespace_after_ew_defect], ), - ) + ew_before_quoted_string_missing_space = C( + '=?ascii?q?disjoin?="=?ascii?q?ted?="', + stringified='disjoin"ted"', + value='disjointed', + defects=[ + # XXX XXX After refactoring there should be one 'after' defect + missing_whitespace_after_ew_defect, + missing_whitespace_after_ew_defect, + ew_inside_quoted_string_defect, + ], + ), + + ew_after_quoted_string_missing_space = C( + '"=?ascii?q?disjoin?="=?ascii?q?ted?=', + stringified='"disjoin"ted', + value='disjointed', + defects=[ + # XXX XXX After refactoring 'after' should become 'before' + missing_whitespace_after_ew_defect, + ew_inside_quoted_string_defect, + ], + ), + + **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + '."')( + ends_at_special = C( + 'complex (obsolete). "phrase" {char}foo', + value='complex . phrase ', + defects=[period_in_phrase_obs_defect], + remainder='{char}foo', + comments=['obsolete'], + obs_dots=1, + ), + ), + + # While these violate the RFC in several ways, allowing the '.' + # as the value of the phrase is the only sensible recovery. + + obsolete_dot_only = C( + '.', + defects=[ + non_word_phrase_start_defect, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + obsolete_dot_with_wsp = C( + '\t . ', + value=' . ', + defects=[ + non_word_phrase_start_defect, + *[comment_without_atom_in_phrase_obs_defect]*2, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + obsolete_dot_and_comments_only = C( + '(foo).(bar)', + value=' . 
', + comments=['foo', 'bar'], + defects=[ + non_word_phrase_start_defect, + *[comment_without_atom_in_phrase_obs_defect]*2, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + obsolete_dot_and_comments_with_fws = C( + ' (foo). (bar) ', + value=' . ', + comments=['foo', 'bar'], + defects=[ + non_word_phrase_start_defect, + *[comment_without_atom_in_phrase_obs_defect]*2, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + ) # get_local_part From c35a5531611c74f08cdcefa12f91a79d468b946f Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 28 Mar 2026 15:47:32 -0400 Subject: [PATCH 108/152] Begin conversion of get_local_part tests. --- .../test_email/test__header_value_parser.py | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 1db1b28c04cf841..54687023079562f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2886,6 +2886,108 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): # get_local_part + @params + def test_get_local_part(self, s, *args, local_part=None, **kw): + lp = self._test_parse(parser.get_local_part, C(s), *args, **kw) + if 'exception' in kw: + return + self.verify_terminal_types( + lp, + 'dot', + 'atext', + 'ptext', + 'fws', + 'vtext', + ) + if local_part != ...: + self.assertEqual(lp.local_part, local_part) + + @params_map(with_namelist=True) + def adapt_get_dot_atom_tests_for_get_local_part(nl, s, *args, **kw): + r = kw.get('remainder') + if 'value' in kw: + local_part = kw['value'] + else: + local_part = kw.get('stringified', s[:-len(r)] if r else s) + if not nl.has_any('ew_only', 'rfc2047_atom'): + # Except for the above two tests, the leading and trailing + # whitespace in the 'value' is the 'semantic blank' it produces + # for leading and trailing cfws, which local_part doesn't include. 
+ # For those two ew tests the blank comes from inside the ew. + local_part = local_part.removeprefix(' ').removesuffix(' ') + kw['local_part'] = local_part + yield '', C(s, *args, **kw) + + @params_map + def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): + if 'quoted_value' in kw: + kw['value'] = kw.pop('quoted_value') + if 'exception' not in kw: + kw['local_part'] = kw.pop('content') + yield '', C(*args, **kw) + + params_test_get_local_part = old_api_only( + + # An RFC compliant local part can be a dot atom or a quoted string, so + # it should pass some of the tests for those. + + adapt_get_dot_atom_tests_for_get_local_part( + include_unless( + lambda n, *a, **k: + n.has_any( + # Get local part handles multiple atoms. + 'two_ew_two_atoms', + 'atom_ends_at_noncfws', + # There are some things get_dot_atom raises for that + # get_local_part treats as obs-local-part. + 'two_dots_raises', + 'trailing_dot_raises', + 'space_ends_dot_atom', + # XXX XXX These tests should pass after the refactoring + # of get_dot_atom. + 'two_ew_with_dot', + 'multiple_ew_no_ws', + ) + or + # get_local_part handles quoted strings (tested above), + # and leading dots or \ are handled as obs-local-part. + n.has_any( + 'up_to_special', + 'leading_special_raises', + 'no_atom_before_special', + 'no_atext_before_special_or_wsp', + 'atom_ends_at_special', + 'ends_at_special_after_comment', + 'ends_at_special', + ) + and n.has_any( + 'reverse_solidus', + 'quotation_mark', + 'full_stop', + ), + label='from_test_get_dot_atom', + )(params_test_get_dot_atom), + ), + + adapt_get_quoted_string_tests_for_get_local_part( + include_unless( + lambda n, *a, **k: n.has_any( + # These tests have an atom first; get_quoted_string raises, + # but get_local_part parses the atom. Atoms are tested above. + 'no_quoted_string', + 'no_leading_dquote_before_non_ws', + # A local part only ends at specials other than " and . 
+ 'qs_ends_at_noncfws', + 'ew_after_dquote', + 'encoded_word_after_dquote_with_no_ws', + 'end_dquote_mid_word', + ), + label='from_test_get_quoted_string', + )(params_test_get_quoted_string), + ), + + ) + def test_get_local_part_simple(self): local_part = self._test_get_x(parser.get_local_part, 'dinsdale@python.org', 'dinsdale', 'dinsdale', [], '@python.org') From 148238774780c7a6da676faca56d4f03f0e5818a Mon Sep 17 00:00:00 2001 From: R David Murray Date: Mon, 30 Mar 2026 14:40:05 -0400 Subject: [PATCH 109/152] Rough refactoring of get_local_part tests. --- .../test_email/test__header_value_parser.py | 158 +++++++++--------- 1 file changed, 81 insertions(+), 77 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 54687023079562f..757a19cf6ba3a48 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2898,8 +2898,9 @@ def test_get_local_part(self, s, *args, local_part=None, **kw): 'ptext', 'fws', 'vtext', + 'misplaced-special', ) - if local_part != ...: + if local_part and local_part != ...: self.assertEqual(lp.local_part, local_part) @params_map(with_namelist=True) @@ -2986,180 +2987,183 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): )(params_test_get_quoted_string), ), - ) - - def test_get_local_part_simple(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_simple = C( 'dinsdale@python.org', 'dinsdale', 'dinsdale', [], '@python.org') - self.assertEqual(local_part.token_type, 'local-part') - self.assertEqual(local_part.local_part, 'dinsdale') + , + #self.assertEqual(local_part.local_part, 'dinsdale') - def test_get_local_part_with_dot(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_with_dot = C( 'Fred.A.Johnson@python.org', 'Fred.A.Johnson', 'Fred.A.Johnson', [], '@python.org') - self.assertEqual(local_part.local_part, 
'Fred.A.Johnson') + , + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - def test_get_local_part_with_whitespace(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_with_whitespace = C( ' Fred.A.Johnson @python.org', ' Fred.A.Johnson ', ' Fred.A.Johnson ', [], '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') + , + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - def test_get_local_part_with_cfws(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_with_cfws = C( ' (foo) Fred.A.Johnson (bar (bird)) @python.org', ' (foo) Fred.A.Johnson (bar (bird)) ', ' Fred.A.Johnson ', [], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - self.assertEqual(local_part[0][0].comments, ['foo']) - self.assertEqual(local_part[0][2].comments, ['bar (bird)']) + '@python.org', + ['foo', 'bar (bird)'], + ), + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - def test_get_local_part_simple_quoted(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_simple_quoted = C( '"dinsdale"@python.org', '"dinsdale"', '"dinsdale"', [], '@python.org') - self.assertEqual(local_part.token_type, 'local-part') - self.assertEqual(local_part.local_part, 'dinsdale') + , + #self.assertEqual(local_part.local_part, 'dinsdale') - def test_get_local_part_with_quoted_dot(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_with_quoted_dot = C( '"Fred.A.Johnson"@python.org', '"Fred.A.Johnson"', '"Fred.A.Johnson"', [], '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') + , + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - def test_get_local_part_quoted_with_whitespace(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_quoted_with_whitespace = C( ' "Fred A. Johnson" @python.org', ' "Fred A. Johnson" ', ' "Fred A. 
Johnson" ', [], '@python.org') - self.assertEqual(local_part.local_part, 'Fred A. Johnson') + , + #self.assertEqual(local_part.local_part, 'Fred A. Johnson') - def test_get_local_part_quoted_with_cfws(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_quoted_with_cfws = C( ' (foo) " Fred A. Johnson " (bar (bird)) @python.org', ' (foo) " Fred A. Johnson " (bar (bird)) ', ' " Fred A. Johnson " ', [], - '@python.org') - self.assertEqual(local_part.local_part, ' Fred A. Johnson ') - self.assertEqual(local_part[0][0].comments, ['foo']) - self.assertEqual(local_part[0][2].comments, ['bar (bird)']) + '@python.org', + ['foo', 'bar (bird)'], + ), + #self.assertEqual(local_part.local_part, ' Fred A. Johnson ') - def test_get_local_part_simple_obsolete(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_simple_obsolete = C( 'Fred. A.Johnson@python.org', 'Fred. A.Johnson', 'Fred. A.Johnson', [errors.ObsoleteHeaderDefect], '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') + , + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - def test_get_local_part_complex_obsolete_1(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_complex_obsolete_1 = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "', ' Fred . A. Johnson.and dogs ', [errors.ObsoleteHeaderDefect], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson.and dogs ') + '@python.org', + ['foo ', 'bar', 'bird', 'sheep'], + ), + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson.and dogs ') - def test_get_local_part_complex_obsolete_invalid(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_complex_obsolete_invalid = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"', ' Fred . A. 
Johnson and dogs', [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson and dogs') + '@python.org', + ['foo ', 'bar', 'bird', 'sheep'], + ), + #self.assertEqual(local_part.local_part, 'Fred.A.Johnson and dogs') - def test_get_local_part_empty_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_local_part('') + test_get_local_part_empty_raises = C( + '', + exception=( errors.HeaderParseError, '.*'), + ), - def test_get_local_part_no_part_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_local_part(' (foo) ') + test_get_local_part_no_part_raises = C( + ' (foo) ', + exception=( errors.HeaderParseError, '.*'), + ), - def test_get_local_part_special_instead_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_local_part(' (foo) @python.org') + test_get_local_part_special_instead_raises = C( + ' (foo) @python.org', + exception=( errors.HeaderParseError, '.*'), + ), - def test_get_local_part_trailing_dot(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_trailing_dot = C( ' borris.@python.org', ' borris.', ' borris.', [errors.InvalidHeaderDefect]*2, '@python.org') - self.assertEqual(local_part.local_part, 'borris.') + , + #self.assertEqual(local_part.local_part, 'borris.') - def test_get_local_part_trailing_dot_with_ws(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_trailing_dot_with_ws = C( ' borris. @python.org', ' borris. ', ' borris. 
', [errors.InvalidHeaderDefect]*2, '@python.org') - self.assertEqual(local_part.local_part, 'borris.') + , + #self.assertEqual(local_part.local_part, 'borris.') - def test_get_local_part_leading_dot(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_leading_dot = C( '.borris@python.org', '.borris', '.borris', [errors.InvalidHeaderDefect]*2, '@python.org') - self.assertEqual(local_part.local_part, '.borris') + , + #self.assertEqual(local_part.local_part, '.borris') - def test_get_local_part_leading_dot_after_ws(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_leading_dot_after_ws = C( ' .borris@python.org', ' .borris', ' .borris', [errors.InvalidHeaderDefect]*2, '@python.org') - self.assertEqual(local_part.local_part, '.borris') + , + #self.assertEqual(local_part.local_part, '.borris') - def test_get_local_part_double_dot_raises(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_double_dot_raises = C( ' borris.(foo).natasha@python.org', ' borris.(foo).natasha', ' borris. 
.natasha', [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, 'borris..natasha') + '@python.org', + ['foo'], + ), + #self.assertEqual(local_part.local_part, 'borris..natasha') - def test_get_local_part_quoted_strings_in_atom_list(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_quoted_strings_in_atom_list = C( '""example" example"@example.com', '""example" example"', 'example example', [errors.InvalidHeaderDefect]*3, '@example.com') - self.assertEqual(local_part.local_part, 'example example') + , + #self.assertEqual(local_part.local_part, 'example example') - def test_get_local_part_valid_and_invalid_qp_in_atom_list(self): - local_part = self._test_get_x(parser.get_local_part, + test_get_local_part_valid_and_invalid_qp_in_atom_list = C( r'"\\"example\\" example"@example.com', r'"\\"example\\" example"', r'\example\\ example', [errors.InvalidHeaderDefect]*5, '@example.com') - self.assertEqual(local_part.local_part, r'\example\\ example') + , + #self.assertEqual(local_part.local_part, r'\example\\ example') + + ) + # get_dtext From fc6d542683df61b1d4ceddb8997516a7781a4c49 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Mon, 30 Mar 2026 16:17:51 -0400 Subject: [PATCH 110/152] Fix whitespace and test names in get_local_part tests. I renamed one test whose name was obviously wrong ('raises' when it doesn't raise, probably a copy and paste error). 
--- .../test_email/test__header_value_parser.py | 114 ++++++++++-------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 757a19cf6ba3a48..40d9fe01b69b25e 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2987,30 +2987,34 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): )(params_test_get_quoted_string), ), - test_get_local_part_simple = C( - 'dinsdale@python.org', 'dinsdale', 'dinsdale', [], '@python.org') - , + simple = C( + 'dinsdale@python.org', + 'dinsdale', + 'dinsdale', + [], + '@python.org', + ), #self.assertEqual(local_part.local_part, 'dinsdale') - test_get_local_part_with_dot = C( + with_dot = C( 'Fred.A.Johnson@python.org', 'Fred.A.Johnson', 'Fred.A.Johnson', [], - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - test_get_local_part_with_whitespace = C( + with_whitespace = C( ' Fred.A.Johnson @python.org', ' Fred.A.Johnson ', ' Fred.A.Johnson ', [], - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - test_get_local_part_with_cfws = C( + with_cfws = C( ' (foo) Fred.A.Johnson (bar (bird)) @python.org', ' (foo) Fred.A.Johnson (bar (bird)) ', ' Fred.A.Johnson ', @@ -3020,30 +3024,34 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - test_get_local_part_simple_quoted = C( - '"dinsdale"@python.org', '"dinsdale"', '"dinsdale"', [], '@python.org') - , + simple_quoted = C( + '"dinsdale"@python.org', + '"dinsdale"', + '"dinsdale"', + [], + '@python.org', + ), #self.assertEqual(local_part.local_part, 'dinsdale') - test_get_local_part_with_quoted_dot = C( + with_quoted_dot = C( '"Fred.A.Johnson"@python.org', '"Fred.A.Johnson"', '"Fred.A.Johnson"', [], - '@python.org') - , + 
'@python.org', + ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - test_get_local_part_quoted_with_whitespace = C( + quoted_with_whitespace = C( ' "Fred A. Johnson" @python.org', ' "Fred A. Johnson" ', ' "Fred A. Johnson" ', [], - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, 'Fred A. Johnson') - test_get_local_part_quoted_with_cfws = C( + quoted_with_cfws = C( ' (foo) " Fred A. Johnson " (bar (bird)) @python.org', ' (foo) " Fred A. Johnson " (bar (bird)) ', ' " Fred A. Johnson " ', @@ -3054,16 +3062,16 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): #self.assertEqual(local_part.local_part, ' Fred A. Johnson ') - test_get_local_part_simple_obsolete = C( + simple_obsolete = C( 'Fred. A.Johnson@python.org', 'Fred. A.Johnson', 'Fred. A.Johnson', [errors.ObsoleteHeaderDefect], - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - test_get_local_part_complex_obsolete_1 = C( + complex_obsolete_1 = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "', ' Fred . A. Johnson.and dogs ', @@ -3073,7 +3081,7 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson.and dogs ') - test_get_local_part_complex_obsolete_invalid = C( + complex_obsolete_invalid = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"', ' Fred . A. 
Johnson and dogs', @@ -3083,58 +3091,58 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): ), #self.assertEqual(local_part.local_part, 'Fred.A.Johnson and dogs') - test_get_local_part_empty_raises = C( - '', - exception=( errors.HeaderParseError, '.*'), + empty_raises = C( + '', + exception=(errors.HeaderParseError, '.*'), ), - test_get_local_part_no_part_raises = C( - ' (foo) ', - exception=( errors.HeaderParseError, '.*'), + no_part_raises = C( + ' (foo) ', + exception=(errors.HeaderParseError, '.*'), ), - test_get_local_part_special_instead_raises = C( - ' (foo) @python.org', - exception=( errors.HeaderParseError, '.*'), + special_instead_raises = C( + ' (foo) @python.org', + exception=(errors.HeaderParseError, '.*'), ), - test_get_local_part_trailing_dot = C( + trailing_dot = C( ' borris.@python.org', ' borris.', ' borris.', [errors.InvalidHeaderDefect]*2, - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, 'borris.') - test_get_local_part_trailing_dot_with_ws = C( + trailing_dot_with_ws = C( ' borris. @python.org', ' borris. ', ' borris. ', [errors.InvalidHeaderDefect]*2, - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, 'borris.') - test_get_local_part_leading_dot = C( + leading_dot = C( '.borris@python.org', '.borris', '.borris', [errors.InvalidHeaderDefect]*2, - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, '.borris') - test_get_local_part_leading_dot_after_ws = C( + leading_dot_after_ws = C( ' .borris@python.org', ' .borris', ' .borris', [errors.InvalidHeaderDefect]*2, - '@python.org') - , + '@python.org', + ), #self.assertEqual(local_part.local_part, '.borris') - test_get_local_part_double_dot_raises = C( + dots_around_comment = C( ' borris.(foo).natasha@python.org', ' borris.(foo).natasha', ' borris. 
.natasha', @@ -3144,22 +3152,22 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): ), #self.assertEqual(local_part.local_part, 'borris..natasha') - test_get_local_part_quoted_strings_in_atom_list = C( + quoted_strings_in_atom_list = C( '""example" example"@example.com', '""example" example"', 'example example', [errors.InvalidHeaderDefect]*3, - '@example.com') - , + '@example.com', + ), #self.assertEqual(local_part.local_part, 'example example') - test_get_local_part_valid_and_invalid_qp_in_atom_list = C( + valid_and_invalid_qp_in_atom_list = C( r'"\\"example\\" example"@example.com', r'"\\"example\\" example"', r'\example\\ example', [errors.InvalidHeaderDefect]*5, - '@example.com') - , + '@example.com', + ), #self.assertEqual(local_part.local_part, r'\example\\ example') ) From 8b174c221aa75e31c22eb98dac74a37fe82c1753 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Mon, 30 Mar 2026 17:21:54 -0400 Subject: [PATCH 111/152] Convert get_local_part tests to keyword form. 
--- .../test_email/test__header_value_parser.py | 158 +++++++----------- 1 file changed, 61 insertions(+), 97 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 40d9fe01b69b25e..c4dc8654c3c57f3 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2900,8 +2900,7 @@ def test_get_local_part(self, s, *args, local_part=None, **kw): 'vtext', 'misplaced-special', ) - if local_part and local_part != ...: - self.assertEqual(lp.local_part, local_part) + self.assertEqual(lp.local_part, local_part) @params_map(with_namelist=True) def adapt_get_dot_atom_tests_for_get_local_part(nl, s, *args, **kw): @@ -2989,107 +2988,83 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): simple = C( 'dinsdale@python.org', - 'dinsdale', - 'dinsdale', - [], - '@python.org', + remainder='@python.org', + local_part='dinsdale', ), - #self.assertEqual(local_part.local_part, 'dinsdale') with_dot = C( 'Fred.A.Johnson@python.org', - 'Fred.A.Johnson', - 'Fred.A.Johnson', - [], - '@python.org', + remainder='@python.org', + local_part='Fred.A.Johnson', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') with_whitespace = C( ' Fred.A.Johnson @python.org', - ' Fred.A.Johnson ', - ' Fred.A.Johnson ', - [], - '@python.org', + value=' Fred.A.Johnson ', + remainder='@python.org', + local_part='Fred.A.Johnson', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') with_cfws = C( ' (foo) Fred.A.Johnson (bar (bird)) @python.org', - ' (foo) Fred.A.Johnson (bar (bird)) ', - ' Fred.A.Johnson ', - [], - '@python.org', - ['foo', 'bar (bird)'], + value=' Fred.A.Johnson ', + remainder='@python.org', + comments=['foo', 'bar (bird)'], + local_part='Fred.A.Johnson', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') simple_quoted = C( '"dinsdale"@python.org', - '"dinsdale"', - '"dinsdale"', - [], - '@python.org', + 
remainder='@python.org', + local_part='dinsdale', ), - #self.assertEqual(local_part.local_part, 'dinsdale') with_quoted_dot = C( '"Fred.A.Johnson"@python.org', - '"Fred.A.Johnson"', - '"Fred.A.Johnson"', - [], - '@python.org', + remainder='@python.org', + local_part='Fred.A.Johnson', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') quoted_with_whitespace = C( ' "Fred A. Johnson" @python.org', - ' "Fred A. Johnson" ', - ' "Fred A. Johnson" ', - [], - '@python.org', + value=' "Fred A. Johnson" ', + remainder='@python.org', + local_part='Fred A. Johnson', ), - #self.assertEqual(local_part.local_part, 'Fred A. Johnson') quoted_with_cfws = C( ' (foo) " Fred A. Johnson " (bar (bird)) @python.org', - ' (foo) " Fred A. Johnson " (bar (bird)) ', - ' " Fred A. Johnson " ', - [], - '@python.org', - ['foo', 'bar (bird)'], + value=' " Fred A. Johnson " ', + remainder='@python.org', + comments=['foo', 'bar (bird)'], + local_part=' Fred A. Johnson ', ), - #self.assertEqual(local_part.local_part, ' Fred A. Johnson ') simple_obsolete = C( 'Fred. A.Johnson@python.org', - 'Fred. A.Johnson', - 'Fred. A.Johnson', - [errors.ObsoleteHeaderDefect], - '@python.org', + defects=[errors.ObsoleteHeaderDefect], + remainder='@python.org', + local_part='Fred.A.Johnson', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson') complex_obsolete_1 = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', - ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "', - ' Fred . A. Johnson.and dogs ', - [errors.ObsoleteHeaderDefect], - '@python.org', - ['foo ', 'bar', 'bird', 'sheep'], + value=' Fred . A. 
Johnson.and dogs ', + defects=[errors.ObsoleteHeaderDefect], + remainder='@python.org', + comments=['foo ', 'bar', 'bird', 'sheep'], + local_part='Fred.A.Johnson.and dogs ', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson.and dogs ') complex_obsolete_invalid = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', - ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"', - ' Fred . A. Johnson and dogs', - [errors.InvalidHeaderDefect]*2, - '@python.org', - ['foo ', 'bar', 'bird', 'sheep'], + value=' Fred . A. Johnson and dogs', + defects=[errors.InvalidHeaderDefect]*2, + remainder='@python.org', + comments=['foo ', 'bar', 'bird', 'sheep'], + local_part='Fred.A.Johnson and dogs', ), - #self.assertEqual(local_part.local_part, 'Fred.A.Johnson and dogs') empty_raises = C( '', @@ -3108,67 +3083,56 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): trailing_dot = C( ' borris.@python.org', - ' borris.', - ' borris.', - [errors.InvalidHeaderDefect]*2, - '@python.org', + defects=[errors.InvalidHeaderDefect]*2, + remainder='@python.org', + local_part='borris.', ), - #self.assertEqual(local_part.local_part, 'borris.') trailing_dot_with_ws = C( ' borris. @python.org', - ' borris. ', - ' borris. 
', - [errors.InvalidHeaderDefect]*2, - '@python.org', + defects=[errors.InvalidHeaderDefect]*2, + remainder='@python.org', + local_part='borris.', ), - #self.assertEqual(local_part.local_part, 'borris.') leading_dot = C( '.borris@python.org', - '.borris', - '.borris', - [errors.InvalidHeaderDefect]*2, - '@python.org', + defects=[errors.InvalidHeaderDefect]*2, + remainder='@python.org', + local_part='.borris', ), - #self.assertEqual(local_part.local_part, '.borris') leading_dot_after_ws = C( ' .borris@python.org', - ' .borris', - ' .borris', - [errors.InvalidHeaderDefect]*2, - '@python.org', + defects=[errors.InvalidHeaderDefect]*2, + remainder='@python.org', + local_part='.borris', ), - #self.assertEqual(local_part.local_part, '.borris') dots_around_comment = C( ' borris.(foo).natasha@python.org', - ' borris.(foo).natasha', - ' borris. .natasha', - [errors.InvalidHeaderDefect]*2, - '@python.org', - ['foo'], + value=' borris. .natasha', + defects=[errors.InvalidHeaderDefect]*2, + remainder='@python.org', + comments=['foo'], + local_part='borris..natasha', ), - #self.assertEqual(local_part.local_part, 'borris..natasha') quoted_strings_in_atom_list = C( '""example" example"@example.com', - '""example" example"', - 'example example', - [errors.InvalidHeaderDefect]*3, - '@example.com', + value='example example', + defects=[errors.InvalidHeaderDefect]*3, + remainder='@example.com', + local_part="example example", ), - #self.assertEqual(local_part.local_part, 'example example') valid_and_invalid_qp_in_atom_list = C( r'"\\"example\\" example"@example.com', - r'"\\"example\\" example"', - r'\example\\ example', - [errors.InvalidHeaderDefect]*5, - '@example.com', + value=r'\example\\ example', + defects=[errors.InvalidHeaderDefect]*5, + remainder='@example.com', + local_part=r'\example\\ example', ), - #self.assertEqual(local_part.local_part, r'\example\\ example') ) From 084ae3c6fa37c1c2feebb0a7a2da74abfe31b7e6 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Tue, 31 Mar 
2026 16:45:50 -0400 Subject: [PATCH 112/152] Add defect message checks to get_local_part tests. --- .../test_email/test__header_value_parser.py | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index c4dc8654c3c57f3..c1ec1adf31beb02 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3043,7 +3043,7 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): simple_obsolete = C( 'Fred. A.Johnson@python.org', - defects=[errors.ObsoleteHeaderDefect], + defects=[non_dot_atom_local_part_obs_defect], remainder='@python.org', local_part='Fred.A.Johnson', ), @@ -3051,7 +3051,7 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): complex_obsolete_1 = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', value=' Fred . A. Johnson.and dogs ', - defects=[errors.ObsoleteHeaderDefect], + defects=[non_dot_atom_local_part_obs_defect], remainder='@python.org', comments=['foo ', 'bar', 'bird', 'sheep'], local_part='Fred.A.Johnson.and dogs ', @@ -3060,7 +3060,10 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): complex_obsolete_invalid = C( ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', value=' Fred . A. Johnson and dogs', - defects=[errors.InvalidHeaderDefect]*2, + defects=[ + not_even_obs_local_part_defect, + missing_dot_in_local_part_defect, + ], remainder='@python.org', comments=['foo ', 'bar', 'bird', 'sheep'], local_part='Fred.A.Johnson and dogs', @@ -3083,28 +3086,40 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): trailing_dot = C( ' borris.@python.org', - defects=[errors.InvalidHeaderDefect]*2, + defects=[ + not_even_obs_local_part_defect, + trailing_dot_in_local_part_defect, + ], remainder='@python.org', local_part='borris.', ), trailing_dot_with_ws = C( ' borris. 
@python.org', - defects=[errors.InvalidHeaderDefect]*2, + defects=[ + not_even_obs_local_part_defect, + trailing_dot_in_local_part_defect, + ], remainder='@python.org', local_part='borris.', ), leading_dot = C( '.borris@python.org', - defects=[errors.InvalidHeaderDefect]*2, + defects=[ + not_even_obs_local_part_defect, + leading_dot_in_local_part_defect, + ], remainder='@python.org', local_part='.borris', ), leading_dot_after_ws = C( ' .borris@python.org', - defects=[errors.InvalidHeaderDefect]*2, + defects=[ + not_even_obs_local_part_defect, + leading_dot_in_local_part_defect, + ], remainder='@python.org', local_part='.borris', ), @@ -3112,7 +3127,10 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): dots_around_comment = C( ' borris.(foo).natasha@python.org', value=' borris. .natasha', - defects=[errors.InvalidHeaderDefect]*2, + defects=[ + not_even_obs_local_part_defect, + repeated_dot_in_local_part_defect, + ], remainder='@python.org', comments=['foo'], local_part='borris..natasha', @@ -3121,7 +3139,10 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): quoted_strings_in_atom_list = C( '""example" example"@example.com', value='example example', - defects=[errors.InvalidHeaderDefect]*3, + defects=[ + not_even_obs_local_part_defect, + *[missing_dot_in_local_part_defect]*2, + ], remainder='@example.com', local_part="example example", ), @@ -3129,7 +3150,11 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): valid_and_invalid_qp_in_atom_list = C( r'"\\"example\\" example"@example.com', value=r'\example\\ example', - defects=[errors.InvalidHeaderDefect]*5, + defects=[ + not_even_obs_local_part_defect, + *[missing_dot_in_local_part_defect]*2, + *[misplaced_backslash_defect]*2, + ], remainder='@example.com', local_part=r'\example\\ example', ), From 7de9fdbfd33b23760cd264f30a05839c4b957db1 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 3 Apr 2026 13:44:05 -0400 Subject: [PATCH 113/152] Move obs tests to 
new get_obs_local_part tests section. And make get_local_part pass those tests. You can't tell from the git diff, but if you extract the changes and compare before and after, there is exactly one kind of change made to the tests that are moved: the defects that are added by get_local_part depending on the results of the call to get_obs_local_part are removed from the expected defects list. They are then added back by the adapter that runs those tests through get_local_part. --- Lib/email/_header_value_parser.py | 1 + .../test_email/test__header_value_parser.py | 238 ++++++++++-------- 2 files changed, 135 insertions(+), 104 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index afccea8fcb87735..affe44d27f07f22 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1540,6 +1540,7 @@ def get_obs_local_part(value): value = value[1:] continue elif value[0]=='\\': + # RFC 5322 doesn't allow \, but the old email code parsed it. 
obs_local_part.append(ValueTerminal(value[0], 'misplaced-special')) value = value[1:] diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index c1ec1adf31beb02..89d0f97046ba082 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -14,6 +14,7 @@ parameterize, ) from test.test_email.params import ( + add_label, C, include_unless, params, @@ -2884,6 +2885,112 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): ) + # get_obs_local_part + + @params + def test_get_obs_local_part(self, s, *args, local_part=None, **kw): + lp = self._test_parse(parser.get_obs_local_part, C(s), *args, **kw) + if 'exception' in kw: + return + self.verify_terminal_types( + lp, + 'dot', + 'atext', + 'ptext', + 'fws', + 'vtext', + 'misplaced-special', + ) + + # This function should only get called when the non-obs expressions have + # already been checked for, so we are only testing the obs syntax handling, + # not what it does with non-obs syntax. Anything else is "don't care". + # The 'local_part' specs are checked by the get_local_part tests, since the + # token list returned by get_obs_local_part doesn't have that attribute. + params_test_get_obs_local_part = old_api_only( + + simple_obsolete = C( + 'Fred. A.Johnson@python.org', + remainder='@python.org', + local_part='Fred.A.Johnson', + ), + + complex_obsolete_1 = C( + ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', + value=' Fred . A. Johnson.and dogs ', + remainder='@python.org', + comments=['foo ', 'bar', 'bird', 'sheep'], + local_part='Fred.A.Johnson.and dogs ', + ), + + complex_obsolete_invalid = C( + ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', + value=' Fred . A. 
Johnson and dogs', + defects=[missing_dot_in_local_part_defect], + remainder='@python.org', + comments=['foo ', 'bar', 'bird', 'sheep'], + local_part='Fred.A.Johnson and dogs', + ), + + trailing_dot = C( + ' borris.@python.org', + defects=[trailing_dot_in_local_part_defect], + remainder='@python.org', + local_part='borris.', + ), + + trailing_dot_with_ws = C( + ' borris. @python.org', + defects=[trailing_dot_in_local_part_defect], + remainder='@python.org', + local_part='borris.', + ), + + leading_dot = C( + '.borris@python.org', + defects=[leading_dot_in_local_part_defect], + remainder='@python.org', + local_part='.borris', + ), + + leading_dot_after_ws = C( + ' .borris@python.org', + defects=[leading_dot_in_local_part_defect], + remainder='@python.org', + local_part='.borris', + ), + + dots_around_comment = C( + ' borris.(foo).natasha@python.org', + value=' borris. .natasha', + defects=[repeated_dot_in_local_part_defect], + remainder='@python.org', + comments=['foo'], + local_part='borris..natasha', + ), + + quoted_strings_in_atom_list = C( + '""example" example"@example.com', + value='example example', + defects=[*[missing_dot_in_local_part_defect]*2], + remainder='@example.com', + local_part="example example", + ), + + valid_and_invalid_qp_in_atom_list = C( + r'"\\"example\\" example"@example.com', + value=r'\example\\ example', + defects=[ + *[missing_dot_in_local_part_defect]*2, + *[misplaced_backslash_defect]*2, + ], + remainder='@example.com', + local_part=r'\example\\ example', + ), + + ) + + # get_local_part @params @@ -2926,6 +3033,27 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): kw['local_part'] = kw.pop('content') yield '', C(*args, **kw) + @params_map + def adapt_get_obs_local_part_tests_for_get_local_part( + *args, + defects=[], + **kw, + ): + defects = list(defects) + if any( + x in ( + repeated_dot_in_local_part_defect, + misplaced_backslash_defect, + missing_dot_in_local_part_defect, + leading_dot_in_local_part_defect, + 
trailing_dot_in_local_part_defect, + ) for x in defects + ): + defects.append(not_even_obs_local_part_defect) + else: + defects.append(non_dot_atom_local_part_obs_defect) + yield '', C(*args, defects=defects, **kw) + params_test_get_local_part = old_api_only( # An RFC compliant local part can be a dot atom or a quoted string, so @@ -2986,6 +3114,12 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): )(params_test_get_quoted_string), ), + add_label('from_test_get_obs_local_part')( + adapt_get_obs_local_part_tests_for_get_local_part( + params_test_get_obs_local_part, + ), + ), + simple = C( 'dinsdale@python.org', remainder='@python.org', @@ -3040,35 +3174,6 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): local_part=' Fred A. Johnson ', ), - - simple_obsolete = C( - 'Fred. A.Johnson@python.org', - defects=[non_dot_atom_local_part_obs_defect], - remainder='@python.org', - local_part='Fred.A.Johnson', - ), - - complex_obsolete_1 = C( - ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', - value=' Fred . A. Johnson.and dogs ', - defects=[non_dot_atom_local_part_obs_defect], - remainder='@python.org', - comments=['foo ', 'bar', 'bird', 'sheep'], - local_part='Fred.A.Johnson.and dogs ', - ), - - complex_obsolete_invalid = C( - ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', - value=' Fred . A. 
Johnson and dogs', - defects=[ - not_even_obs_local_part_defect, - missing_dot_in_local_part_defect, - ], - remainder='@python.org', - comments=['foo ', 'bar', 'bird', 'sheep'], - local_part='Fred.A.Johnson and dogs', - ), - empty_raises = C( '', exception=(errors.HeaderParseError, '.*'), @@ -3084,81 +3189,6 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): exception=(errors.HeaderParseError, '.*'), ), - trailing_dot = C( - ' borris.@python.org', - defects=[ - not_even_obs_local_part_defect, - trailing_dot_in_local_part_defect, - ], - remainder='@python.org', - local_part='borris.', - ), - - trailing_dot_with_ws = C( - ' borris. @python.org', - defects=[ - not_even_obs_local_part_defect, - trailing_dot_in_local_part_defect, - ], - remainder='@python.org', - local_part='borris.', - ), - - leading_dot = C( - '.borris@python.org', - defects=[ - not_even_obs_local_part_defect, - leading_dot_in_local_part_defect, - ], - remainder='@python.org', - local_part='.borris', - ), - - leading_dot_after_ws = C( - ' .borris@python.org', - defects=[ - not_even_obs_local_part_defect, - leading_dot_in_local_part_defect, - ], - remainder='@python.org', - local_part='.borris', - ), - - dots_around_comment = C( - ' borris.(foo).natasha@python.org', - value=' borris. 
.natasha', - defects=[ - not_even_obs_local_part_defect, - repeated_dot_in_local_part_defect, - ], - remainder='@python.org', - comments=['foo'], - local_part='borris..natasha', - ), - - quoted_strings_in_atom_list = C( - '""example" example"@example.com', - value='example example', - defects=[ - not_even_obs_local_part_defect, - *[missing_dot_in_local_part_defect]*2, - ], - remainder='@example.com', - local_part="example example", - ), - - valid_and_invalid_qp_in_atom_list = C( - r'"\\"example\\" example"@example.com', - value=r'\example\\ example', - defects=[ - not_even_obs_local_part_defect, - *[missing_dot_in_local_part_defect]*2, - *[misplaced_backslash_defect]*2, - ], - remainder='@example.com', - local_part=r'\example\\ example', - ), - ) From bc6eb90c0f10a8788dac409aa00f7930d86b0c19 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 3 Apr 2026 14:55:38 -0400 Subject: [PATCH 114/152] Move get_obs_local_part before get_local_part. All of the other functions come before the functions that use them, so this one should too to be consistent. 
--- Lib/email/_header_value_parser.py | 68 +++++++++++++++---------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index affe44d27f07f22..5c20af060fde3d6 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1491,40 +1491,6 @@ def get_phrase(value): phrase.append(token) return phrase, value -def get_local_part(value): - """ local-part = dot-atom / quoted-string / obs-local-part - - """ - local_part = LocalPart() - leader = None - if value and value[0] in CFWS_LEADER: - leader, value = get_cfws(value) - if not value: - raise errors.HeaderParseError( - "expected local-part but found '{}'".format(value)) - try: - token, value = get_dot_atom(value) - except errors.HeaderParseError: - try: - token, value = get_word(value) - except errors.HeaderParseError: - if value[0] != '\\' and value[0] in PHRASE_ENDS: - raise - token = TokenList() - if leader is not None: - token[:0] = [leader] - local_part.append(token) - if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): - obs_local_part, value = get_obs_local_part(str(local_part) + value) - if obs_local_part.token_type == 'invalid-obs-local-part': - local_part.defects.append(errors.InvalidHeaderDefect( - "local-part is not dot-atom, quoted-string, or obs-local-part")) - else: - local_part.defects.append(errors.ObsoleteHeaderDefect( - "local-part is not a dot-atom (contains CFWS)")) - local_part[0] = obs_local_part - return local_part, value - def get_obs_local_part(value): """ obs-local-part = word *("." 
word) """ @@ -1578,6 +1544,40 @@ def get_obs_local_part(value): obs_local_part.token_type = 'invalid-obs-local-part' return obs_local_part, value +def get_local_part(value): + """ local-part = dot-atom / quoted-string / obs-local-part + + """ + local_part = LocalPart() + leader = None + if value and value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + raise errors.HeaderParseError( + "expected local-part but found '{}'".format(value)) + try: + token, value = get_dot_atom(value) + except errors.HeaderParseError: + try: + token, value = get_word(value) + except errors.HeaderParseError: + if value[0] != '\\' and value[0] in PHRASE_ENDS: + raise + token = TokenList() + if leader is not None: + token[:0] = [leader] + local_part.append(token) + if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): + obs_local_part, value = get_obs_local_part(str(local_part) + value) + if obs_local_part.token_type == 'invalid-obs-local-part': + local_part.defects.append(errors.InvalidHeaderDefect( + "local-part is not dot-atom, quoted-string, or obs-local-part")) + else: + local_part.defects.append(errors.ObsoleteHeaderDefect( + "local-part is not a dot-atom (contains CFWS)")) + local_part[0] = obs_local_part + return local_part, value + def get_dtext(value): r""" dtext = / obs-dtext obs-dtext = obs-NO-WS-CTL / quoted-pair From 45aa7bfb4c1a9479e90e7c19001bed9378caf60e Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 3 Apr 2026 14:04:25 -0400 Subject: [PATCH 115/152] Enhance the get_obs_local_part tests. Including some explanatory comments. I'm not sure exactly how often I've stared at the valid_and_invalid_qp_in_atom_list test and wondered why local_part had the value it did, but if *I* did it several times, almost anybody else might think it is a bug without an explanation. 
--- .../test_email/test__header_value_parser.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 89d0f97046ba082..3c676c22197e5f9 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2977,6 +2977,11 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): local_part="example example", ), + # This is intentionally a weird one: first there is a quoted string + # consisting of a single quoted pair resolving to a single backslash. + # Then there is unquoted atext and an invalid quoted pair that + # therefore gets interpreted as two backslashes. Then there is a + # quoted string containing 'example' with a leading space. valid_and_invalid_qp_in_atom_list = C( r'"\\"example\\" example"@example.com', value=r'\example\\ example', @@ -2988,6 +2993,62 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): local_part=r'\example\\ example', ), + # We do want to check that it raises on an empty input, even + # though it should never be called with one. + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + quoted_words_but_no_ws = C( + '"words"."separated".by.dots', + value='words.separated.by.dots', + local_part='words.separated.by.dots', + ), + + backlashes_in_various_places = C( + r"\invali\d\.\really" + '\\', + local_part=r'\invali\d\.\really' + '\\', + defects=[ + *[misplaced_backslash_defect]*5, + *[missing_dot_in_local_part_defect]*3, + ], + ), + + double_dot_no_ws = C( + ' borris..natasha@python.org', + value=' borris..natasha', + defects=[repeated_dot_in_local_part_defect], + remainder='@python.org', + local_part='borris..natasha', + ), + + # The end of this is treated as a quoted string, so the stringified + # version has a trailing quote added, but the local_part attribute + # does not include the quotes. 
+ looks_like_qp_quote_but_quote_is_respected = C( + r'invalid.\"for.sure', + stringified=r'invalid.\"for.sure"', + value=r'invalid.\for.sure', + local_part=r'invalid.\for.sure', + defects=[ + end_inside_quoted_string_defect, + misplaced_backslash_defect, + missing_dot_in_local_part_defect, + ], + ), + + # obs_local_part parses anything that can be in a phrase (cfws + # atoms and quoted strings), plus \ and dots. + **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + r'\."')( + ends_at_phrase_ends = C( + 'doted.words. and . space{char}', + local_part='doted.words.and.space', + remainder='{char}', + ), + ), + + ) From f08920bc5c32ebdc4fd7de0d69404c86a3463cf4 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 3 Apr 2026 17:42:16 -0400 Subject: [PATCH 116/152] New get_obs_local_part ew test, fix bug. BUGFIX: Previously get_local_part would sometimes lose track of the fact that a particular portion of a local part was an encoded word when it turned out that the local part could only be parsed as an obs-local-part. It now correctly re-parses the entire local part from scratch when parsing it as an obs-local-part, preserving the knowledge that some parts were (invalidly!) encoded words. 
--- Lib/email/_header_value_parser.py | 3 ++- .../test_email/test__header_value_parser.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 5c20af060fde3d6..68e6907ea2b442c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1549,6 +1549,7 @@ def get_local_part(value): """ local_part = LocalPart() + orig_value = value leader = None if value and value[0] in CFWS_LEADER: leader, value = get_cfws(value) @@ -1568,7 +1569,7 @@ def get_local_part(value): token[:0] = [leader] local_part.append(token) if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): - obs_local_part, value = get_obs_local_part(str(local_part) + value) + obs_local_part, value = get_obs_local_part(orig_value) if obs_local_part.token_type == 'invalid-obs-local-part': local_part.defects.append(errors.InvalidHeaderDefect( "local-part is not dot-atom, quoted-string, or obs-local-part")) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 3c676c22197e5f9..7ec1f21abf56035 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3048,6 +3048,25 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): ), ), + # Encoded words are not legitimate in local-part, but we decode + # them anyway. + + invalid_ew_atoms = C( + '=?utf-8?q?foo_?="=?utf-8?q?_bar?=".bird', + # It's not clear this str is the best choice. It's + # a consequence of the underlying parsed structures. + stringified='foo " bar".bird', + value="foo bar.bird", + local_part="foo bar.bird", + defects=[ + # XXX XXX There should be exactly one ew whitespace defect + # here, but the number generated will change during refactor, + # until it is fixed when get_obs_local_part is refactored. 
+ *[missing_whitespace_after_ew_defect]*2, + missing_dot_in_local_part_defect, + ew_inside_quoted_string_defect, + ], + ), ) From cafe72c28a3e77303981756d6109123c05828df2 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 4 Apr 2026 09:53:25 -0400 Subject: [PATCH 117/152] More get_obs_local_part ew tests. --- .../test_email/test__header_value_parser.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 7ec1f21abf56035..da1d9296f112328 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3068,6 +3068,34 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): ], ), + less_invalid_ew_atoms = C( + '=?utf-8?q?foo_?= . (=?utf-8?q?test?=) =?utf-8?q?_bar?= .bird', + # XXX XXX after refactoring the comment ew will also be decoded. + #stringified='foo . (test) bar .bird', + stringified='foo . (=?utf-8?q?test?=) bar .bird', + value="foo . bar .bird", + local_part="foo . bar.bird", + # XXX XXX after refactoring the comment ew will also be decoded. + # comments=['test'] + comments=['=?utf-8?q?test?='], + ), + + # XXX XXX Since we've decided to decode encoded words, this becomes a + # "valid" dot-atom, which it will be treated as after the refactoring. + # But if you clear up the whitespace defects by adding whitespace, it + # turns into an obs_local_part because of the whitespace. + sort_of_valid_ew_dot_atom = C( + '=?utf-8?q?foo_?=.=?utf-8?q?_bar?=.bird', + stringified='foo . bar.bird', + value="foo . bar.bird", + local_part="foo . 
bar.bird", + defects=[ + # XXX XXX the whitespace defects will change during refactoring + missing_whitespace_after_ew_defect, + missing_whitespace_after_ew_defect, + ], + ), + ) From 5087fd5bc616939f32176a3e8e60e3f4ce327da6 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Tue, 7 Apr 2026 16:25:00 -0400 Subject: [PATCH 118/152] Add unicode and ew get_local_part tests. Unicode local parts are legal if utf8=True during serialization. EWs are *not* legal, but we don't currently register a defect for that. There is some question as to whether we should decode them at all, but given that they could be serialized correctly with utf8=True it seems reasonable. It may never happen in the wild. --- .../test_email/test__header_value_parser.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index da1d9296f112328..6d8e142c614d8bf 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3297,6 +3297,25 @@ def adapt_get_obs_local_part_tests_for_get_local_part( exception=(errors.HeaderParseError, '.*'), ), + unicode = C( + 'exámple@example.com', + remainder='@example.com', + local_part='exámple', + ), + + ew_non_ascii = C( + '=?utf-8?q?ex=c3=a1mple?=@example.com', + stringified='exámple', + remainder='@example.com', + defects=[ + # XXX XXX there should be exactly one missing whitespace here, + # but it will change until we refactor get_local_part. + missing_whitespace_after_ew_defect, + # XXX XXX there should be a defect for there being an EW at all. + ], + local_part='exámple', + ), + ) From 50834cc72f506c207572f23f50b36c4b23830987 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 25 Dec 2025 16:51:53 -0500 Subject: [PATCH 119/152] Document new API pattern, add deprecation helpers. 
The goal of this refactoring is to solve a non-linear complexity issue that arises from the fact that the current API does at *least* one substring slice per function call, and the depth of the call stack can get fairly deep when dealing with a complex header. (The complexity isn't quadratic, since not every character in a header triggers a slice, and Python tries to optimize string manipulations, but it is non-linear.) The current parser is also inefficient both in memory use and processing speed. This rewrite, which I had been planning since writing the first version of the parser way back when but hadn't gotten to until now, is quite worthwhile even without the goal of reducing the non-linearity. The old API for almost all the 'get' functions takes a 'value', parses starting at its beginning, and returns an object representing the parsed value plus the substring of value that wasn't parsed (that string copy is the non-linearity). The new API takes the full string being parsed as 'value' plus a 'start' point for parsing, and instead of the trailing substring returns a pointer to the character after the last one that was parsed, or to after the end of the string (the len of value). This commit updates the prefix comment for the parser to discuss the target API rather than the old API. It also defines deprecation helpers and tests for them. We'll deprecate things by adding '_deprecated_' as a prefix to the name, and suggest replacements when possible by adding to the _REPLACED_NAMES dictionary (the _replaced_with decorator does this for deprecated functions). This handles functions and constants that will be obsolete by the end of the refactoring. A function decorator, _deprecate_old_api, provides a way to support both the old and the new API, issuing a deprecation message if the call is in the form of the old API. 
This only works on functions that conform to the description above, though it can accommodate additional arguments beyond the value or additional return values beyond the (value, remainder) pair, as long as the first additional argument (that is, the second positional argument) isn't an integer. Fortunately none of the existing functions has an integer as a second argument. There are a few functions that currently return an empty value if given empty or invalid input (value[start] isn't the beginning of what they are supposed to parse). They *should* be raising an error to stay consistent with the API of the rest of the functions. A specialized decorator, _deprecate_old_api_and_lack_of_raise_on_invalid_input, handles this specific case, allowing us to deprecate the buggy behavior while having new style API calls get the exception. *During* refactoring, the decorator '_deprecate' will be used to indicate the intent to deprecate a function, while allowing existing calls to it to continue to use the old name. This will be replaced by the '_deprecated_' prefix and/or the '_replaced_with' decorator by the end of refactoring. The refactoring procedure I settled on after several iterations is the preceding refactoring of all of the tests to use the new test framework, followed by refactoring the parser functions one by one, in order, since their order in the file is bottom up (there was one exception, which we've fixed). For the refactoring process, if the caller of a function has also been refactored so that it is passing an index, the decorator will just call the decorated function. But if the caller hasn't been refactored yet, the decorator converts the arguments into something the refactored function will do the right thing with, and converts the results into something the caller expects. This allows for both a stepwise refactoring process and for supporting a deprecation period in case anyone has been brave enough to use this internal API in their own programs. 
The constraints on the refactoring are that the function needs to continue to return an object representing an extracted substring for whatever result it is supposed to produce, and that we maintain the pattern that these are independent functions (not object methods) that take the string we're operating on as input and return an updated position as output. --- Lib/email/_header_value_parser.py | 125 ++++++++++++- .../test_email/test__header_value_parser.py | 169 ++++++++++++++++++ 2 files changed, 285 insertions(+), 9 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 68e6907ea2b442c..673ba33e2a28047 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -75,6 +75,7 @@ from email import _encoded_words as _ew from email import errors from email import utils +from functools import wraps # # Useful constants and functions @@ -985,26 +986,132 @@ class _InvalidEwError(errors.HeaderParseError): ListSeparator.syntactic_break = False RouteComponentMarker = ValueTerminal('@', 'route-component-marker') + +# XXX POSTDEP: Remove from here... +# +# Temporary backward compatibility and deprecation support. Although this is +# an internal module and not a public API, and therefore we *will* eventually +# remove the backward compatibility support, we're still doing backward +# compatibility to minimize disruption for anyone who made use of these +# internal APIs. 
+# + +OLDAPIREMVER = (3, 18) + +_REPLACED_NAMES = dict( + ) + +def __getattr__(name): + from warnings import _deprecated, _DEPRECATED_MSG + if f'_deprecated_{name}' not in globals(): + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + if name in _REPLACED_NAMES: + _deprecated( + name, + _DEPRECATED_MSG + f", try {_REPLACED_NAMES[name]!r} instead", + remove=OLDAPIREMVER, + ) + else: + _deprecated(name, remove=OLDAPIREMVER) + return globals()[f'_deprecated_{name}'] + +def _replaced_with(name): + def _(func): + _REPLACED_NAMES[func.__name__.removeprefix('_deprecated_')] = name + return func + return _ + +_API_CHANGE_MSG = ( + "The API of the internal function {name!r} has changed; the backward" + " compatibility wrapper will be removed in {remove}" + ) + +def _deprecate_old_api(func): + @wraps(func) + def dispatch(value, *args, **kw): + if args and isinstance(args[0], int): + return func(value, *args, **kw) + # The runtime error is going to say the function should be removed, but + # it's only the decorator that needs to be removed. + from warnings import _deprecated + _deprecated(func.__name__, _API_CHANGE_MSG, remove=OLDAPIREMVER) + result, start, *other = func(value, 0, *args, **kw) + return result, value[start:], *other + return dispatch + +# A specialized deprecation for some functions that should be raising +# errors when handed input that is empty or doesn't contain the expected +# value, but current return an empty object instead. The return signature +# of the wrapped function must be either (result, start) or (result, start, +# exception, warning). If present, 'exception' will be raised from the new +# api, and 'warning' will be passed to 'warn' as a DeprecationWarning for +# the old api. 
+def _deprecate_old_api_and_lack_of_raise_on_invalid_input(func): + @wraps(func) + def dispatch(value, *args, **kw): + if args and isinstance(args[0], int): + result, start, *error = func(value, *args, **kw) + if error: + raise error[0] + return result, start + from warnings import _deprecated, warn + _deprecated(func.__name__, _API_CHANGE_MSG, remove=OLDAPIREMVER) + result, start, *error = func(value, 0, *args, **kw) + if error: + warn(error[1], DeprecationWarning, stacklevel=2) + return result, value[start:] + return dispatch + +# XXX XXX By the end of the refactoring, calls to _deprecate will be replaced by +# renaming the functions with _deprecated_ in front and adding any new names to +# _REPLACED_NAMES. The deprecation testing will need to be adjusted. This +# decorator should not exist in the final version of the branch. + +from functools import singledispatch +from collections.abc import Callable + +def __deprecate(msg, new_name=None): + def _(func): + @wraps(func) + def deprecate(*args, **kw): + from warnings import _deprecated + _deprecated(func.__name__, msg, remove=OLDAPIREMVER) + return func(*args, **kw) + return deprecate + return _ + +@singledispatch +def _deprecate(new_name): + from warnings import _DEPRECATED_MSG + return __deprecate(_DEPRECATED_MSG + f", try {new_name} instead") + +@_deprecate.register(Callable) +def _(func): + from warnings import _DEPRECATED_MSG + return __deprecate(_DEPRECATED_MSG)(func) + +# XXX POSTDEP: ...to here. + + # # Parser # # Parse strings according to RFC822/2047/2822/5322 rules. # -# This is a stateless parser. Each get_XXX function accepts a string and -# returns either a Terminal or a TokenList representing the RFC object named -# by the method and a string containing the remaining unparsed characters -# from the input. Thus a parser method consumes the next syntactic construct -# of a given type and returns a token representing the construct plus the -# unparsed remainder of the input string. 
+# This is a stateless parser. Each get_XXX function accepts a string and a +# starting position and returns either a Terminal or a TokenList representing +# the RFC (or local concept) object named by the method and a pointer to +# remaining unparsed characters in the string. Thus a parser method consumes +# the next syntactic construct of a given type and returns a token representing +# the construct plus a pointer to the unparsed remainder of the input string. # # For example, if the first element of a structured header is a 'phrase', # then: # -# phrase, value = get_phrase(value) +# phrase, rest = get_phrase(value, start) # -# returns the complete phrase from the start of the string value, plus any -# characters left in the string after the phrase is removed. +# returns a complete 'phrase' from 'start' to 'rest' in the value. _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 6d8e142c614d8bf..b6e259e0b2b7246 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -5,7 +5,9 @@ from email import _header_value_parser as parser from email import errors from email import policy +from importlib import import_module from random import choices, randint, sample +from test.support.import_helper import CleanImport from test.test_email import ( charname, check_all_warnings, @@ -15,11 +17,13 @@ ) from test.test_email.params import ( add_label, + as_value, C, include_unless, params, Params, params_map, + with_names, ) # https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 @@ -183,6 +187,171 @@ def charset_defect(chars): # ---> End Defect Expectations +# XXX POSTDEP: Delete this test case, from here... 
+ +class TestDeprecations(TestEmailBase): + + def test___getattr___attribute_error(self): + nonsense = 'this_does_not_exist' + with self.assertRaisesRegex(AttributeError, nonsense): + getattr(parser, nonsense) + + def test___getattr___deprecation(self): + with CleanImport(parser.__name__): + foo = import_module(parser.__name__) + foo._deprecated_foo = lambda: 42 + foo._deprecated_bar = 1 + with check_all_warnings(( + r"(?=.*'bar')(?=.*is deprecated)", + DeprecationWarning, + )): + self.assertEqual(foo.bar, 1) + self.assertEqual(foo.foo(), 42) + + def test___getattr___replacement(self): + with CleanImport(parser.__name__): + foo = import_module(parser.__name__) + a_func = lambda: 42 + foo._deprecated_foo = a_func + foo._REPLACED_NAMES['foo'] = 'bird' + foo._deprecated_bar = 2 + foo._REPLACED_NAMES['bar'] = 'brain' + with check_all_warnings(( + r"(?=.*'foo')(?=.*is deprecated)(?=.*'bird')", + DeprecationWarning, + )): + self.assertEqual(foo.foo, a_func) + self.assertEqual(foo.foo(), 42) + with check_all_warnings(( + r"(?=.*'bar')(?=.*is deprecated)(?=.*'brain')", + DeprecationWarning, + )): + self.assertEqual(foo.bar, 2) + + def test__replaced_with(self): + with CleanImport(parser.__name__): + p = import_module(parser.__name__) + @p._replaced_with('foo') + def _deprecated_bar(a): + return a + p._deprecated_bar = _deprecated_bar + with check_all_warnings(( + r"(?=.*'bar')(?=.*is deprecated)(?=.*'foo')", + DeprecationWarning, + )): + self.assertEqual(p.bar(2), 2) + + @params(as_value( + # XXX XXX make sure this is completely filled in with all the + # names we expect to be deprecated. + )) + def test_deprecated_names(self, name): + with check_all_warnings(( + rf'(?=.*{name})(?=.*is.*deprecated)', + DeprecationWarning, + )): + getattr(parser, name) + + @params(with_names( + # XXX XXX make sure this is completely filled in with all the names + # we've replaced. 
+ )) + def test_replaced_names(self, oldname, newname): + with check_all_warnings(( + rf'(?=.*{oldname!r}.*is deprecated)(?=.*{newname})', + DeprecationWarning, + )): + getattr(parser, oldname) + + @params( + old_simple = C('foo x', '', res=('f', 'oo x', 9), warn=True), + old_with_arg = C('foo x', ' ', res=('fo', 'o x', 9), warn=True), + old_with_kw = C('foo x', '', b=2, res=('foo', ' x', 9), warn=True), + new_with_zero = C('foo x', 0, '', res=('f', 1, 9)), + new_with_nonzero = C('foo x', 3, '', res=(' ', 4, 9)), + new_with_arg = C('foo x', 1, ' ', res=('oo', 3, 9)), + new_with_kw = C('foo x', 2, '', b=2, res=('o x', 5, 9)), + ) + def test__deprecate_old_api(self, value, *args, b=0, warn=False, res): + @parser._deprecate_old_api + def t(value, start, a, b=0): + end = start + 1 + len(a) + b + return value[start:end], end, 9 + warnings = [] + if warn: + warnings += [( + r"(?=.*'t')(?=.*API)(?=.*has changed)", + DeprecationWarning, + )] + with check_all_warnings(*warnings): + self.assertEqual(t(value, *args, b=b), res) + + @params( + old_api_no_error = C(C('abc')), + new_api_no_error = C(C('abc', 0)), + old_api_error = C(C(''), warning=True), + new_api_error = C(C('', 0), exception=True), + new_api_no_error_with_non_zero_start = C(C('abc', 2)), + new_api_error_with_non_zero_start = C(C('abc', 3), exception=True), + ) + def test__deprecate_old_api_and_lack_of_raise_on_invalid_input( + self, + callspec, + exception=False, + warning=False, + ): + @parser._deprecate_old_api_and_lack_of_raise_on_invalid_input + def foo(value, start): + if not value[start:]: + return parser.TokenList(['']), start, Exception('bar'), 'bird' + return parser.TokenList([value]), start + len(value) + value, *start = callspec.args + warnings = [] + if start == []: + warnings += [ + (r"(?=.*'foo')(?=.*API)(?=.*has changed)", DeprecationWarning) + ] + if warning: + warnings += [('bird', DeprecationWarning)] + if exception: + exceptioncheck = self.assertRaisesRegex(Exception, 'bar') + else: + 
exceptioncheck = ExitStack() + with exceptioncheck: + with check_all_warnings(*warnings): + tl, rest = callspec(foo) + if exception: + return + start = start[0] if start else 0 + self.assertEqual(tl, parser.TokenList([value])) + rest = (len(value) - len(rest)) if hasattr(rest, 'encode') else rest + self.assertEqual(rest, start + len(tl[0])) + + # XXX XXX _deprecate will go away by the end of refactoring. + + def test__deprecate_no_arg(self): + @parser._deprecate + def t(a, b): + return a, b + with self.assertWarnsRegex( + DeprecationWarning, + r"(?=.*'t'.*is deprecated)", + ): + self.assertEqual(t(1, 2), (1, 2)) + + def test__deprecate_with_arg(self): + @parser._deprecate('t2') + def t(a, b): + return a, b + with self.assertWarnsRegex( + DeprecationWarning, + r"(?=.*'t'.*is deprecated)(?=.*t2)", + ): + self.assertEqual(t(1, 2), (1, 2)) + +# XXX POSTDEP: ...to here + + class TestTokenList(TestEmailBase): @params( From d2d019af36915957079a663ee59592f3f818bbfd Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 2 May 2026 10:22:33 -0400 Subject: [PATCH 120/152] Deprecate NonASCIILocalPartDefect. This could have been done as part of the issue that removed its use, but it should be on the same deprecation cycle as the parser rewrite, so maybe it is better for it to be in this PR. 
--- Lib/email/errors.py | 11 ++++++++++- Lib/test/test_email/test_defect_handling.py | 10 ++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 859307dd85be111..a836bf6efeb891b 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -2,6 +2,15 @@ # Author: Barry Warsaw # Contact: email-sig@python.org +def __getattr__(name): + if name == "NonASCIILocalPartDefect": + import warnings + warnings._deprecated( + "email.errors.NonASCIILocalPartDefect", + remove=(3, 17), + ) + return _NonASCIILocalPartDefect + """email package exception classes.""" @@ -108,7 +117,7 @@ def __str__(self): class ObsoleteHeaderDefect(HeaderDefect): """Header uses syntax declared obsolete by RFC 5322""" -class NonASCIILocalPartDefect(HeaderDefect): +class _NonASCIILocalPartDefect(HeaderDefect): """Unused. Note: this error is deprecated and may be removed in the future.""" # RFC 6532 permits a non-ASCII local-part. _header_value_parser previously # treated this as a parse-time defect (when parsing Unicode, but not bytes). diff --git a/Lib/test/test_email/test_defect_handling.py b/Lib/test/test_email/test_defect_handling.py index acc4accccac7566..64cd4d3d750af67 100644 --- a/Lib/test/test_email/test_defect_handling.py +++ b/Lib/test/test_email/test_defect_handling.py @@ -375,5 +375,15 @@ def get_defects(self, obj): return obj.defects +class TestDefectDeprecation(TestEmailBase): + + def test_non_ascii_defect_deprecated(self): + with self.assertWarnsRegex( + DeprecationWarning, + rf'(?i)(?=.*NonASCIILocalPartDefect)(?=.*is deprecated)', + ): + errors.NonASCIILocalPartDefect + + if __name__ == '__main__': unittest.main() From 1a3d976810151566afbac58368abbfb2b2587efc Mon Sep 17 00:00:00 2001 From: R David Murray Date: Thu, 19 Mar 2026 20:57:45 -0400 Subject: [PATCH 121/152] Refactor get_fws. 
BUGFIX: When passed an empty string or one with no leading whitespace, get_fws would return a WhiteSpaceTerminal consisting of the empty string. This is a nonsensical result, and it now raises a HeaderParseError in these cases. When called using the old API it issues a deprecation warning and continues to return the buggy value. This makes get_fws consistent with most of the other get routines that blow up if the expected input isn't there. The existing code never calls it that way, and hopefully no one else does either. --- Lib/email/_header_value_parser.py | 36 +++++++++++++++---- .../test_email/test__header_value_parser.py | 34 ++++++++++++++++-- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 673ba33e2a28047..ee97c4a23d82ab3 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -81,6 +81,7 @@ # Useful constants and functions # +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 _WSP = ' \t' WSP = set(_WSP) CFWS_LEADER = WSP | set('(') @@ -1167,17 +1168,38 @@ def _get_ptext_to_endchars(value, endchars): pos = pos + 1 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp -def get_fws(value): +_wsp_matcher = re.compile(fr'[{_WSP}]+').match +@_deprecate_old_api_and_lack_of_raise_on_invalid_input +def get_fws(value, start): """FWS = 1*WSP - This isn't the RFC definition. We're using fws to represent tokens where - folding can be done, but when we are parsing the *un*folding has already - been done so we don't need to watch out for CRLF. + If start does not point to a WSP character in value, raise a HeaderParse + error. Otherwise, return a WhiteSpaceTerminal of token_type 'fws' + containing all of the WSP characters from start to the next non-WSP + character (or the end of value), and the index of the non-WSP character (or + the len of value). 
+ + This is a subset of the RFC 5322 definition of FWS: the strings passed to + the parser should already have been unfolded, so there should be no + legitimate CRLF characters in value. """ - newvalue = value.lstrip() - fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') - return fws, newvalue + m = _wsp_matcher(value, start) + if m is None: + # XXX POSTDEP: change this to raise the exception. + return ( + WhiteSpaceTerminal('', 'fws'), + start, + errors.HeaderParseError( + f'expected whitespace but found {value[start:]!r}' + ), + ( + "Calling get_fws when there is no whitespace at the start" + " is deprecated and will raise an error in the future." + ), + ) + fws = WhiteSpaceTerminal(m.group(), 'fws') + return fws, m.end() def get_encoded_word(value, terminal_type='vtext'): """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b6e259e0b2b7246..e54b8d06f30e787 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -20,6 +20,7 @@ as_value, C, include_unless, + only, params, Params, params_map, @@ -818,7 +819,17 @@ def test_get_fws(self, s, *args, **kw): self.assertIsInstance(fws, parser.WhiteSpaceTerminal) self.assertEqual(fws.token_type, 'fws') - params_test_get_fws = old_api_only( + # XXX POSTDEP: delete from here... + @params_map + def deprecate_oldapi_no_raise_behavior(*args, **kw): + kw['warnings'] = kw.get('warnings', []) + [ + (DeprecationWarning, r'.*API.*has changed'), + (DeprecationWarning, r'(?i).*raise'), + ] + yield 'oldapi', C(*args, **kw, test_start=False) + # XXX POSTDEP: ...to here. + + params_test_get_fws = for_each_api( wsp_run = C(' \t '), @@ -830,18 +841,35 @@ def test_get_fws(self, s, *args, **kw): ends_at_non_wsp_after_wsp_run = C(' \t{char} ', remainder='{char} '), ), + # XXX POSTDEP: delete from here... 
) - # XXX XXX: these ought to error, but get_fws should never be called this way + # These ought to error, but get_fws should never be called this way # We'll deprecate the lack of raise during the refactor. params_test_get_fws.update( - old_api_only( + deprecate_oldapi_no_raise_behavior( empty = C(''), no_wsp = C('foo', remainder='foo'), no_leading_wsp = C('foo bar', remainder='foo bar'), ), ) + params_test_get_fws.update( + add_label('newapi')( + # XXX POStDEP: ... to here. And fix the indentation below. + **params_map( + lambda s, **k: only( + C(s, exception=(errors.HeaderParseError, '(?i)expected')) + ) + )( + empty='', + no_wsp='foo', + no_leading_wsp='foo bar', + ) + ), # XXX POSTDEP: delete this line + + ) + # get_encoded_word From 6d4e05777af065979a663477e8b3bb8263874c43 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 10 May 2026 15:40:30 -0400 Subject: [PATCH 122/152] Add _make_xtext. There is a common pattern in how _validate_xtext is used: the value terminal always gets created right before it is called. So it makes sense to combine the two operations into one function. And as the removed comment required, it passes the _validate_xtext tests. 
--- Lib/email/_header_value_parser.py | 19 +++++++- .../test_email/test__header_value_parser.py | 48 +++++++++++++++++-- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ee97c4a23d82ab3..77f931d6b525341 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1117,7 +1117,6 @@ def _(func): _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(ATOM_ENDS)))).match -_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall _non_token_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(TOKEN_ENDS)))).match _non_attribute_end_matcher = re.compile(r"[^{}]+".format( @@ -1125,6 +1124,24 @@ def _(func): _non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 for non_printable. +_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall +def _make_xtext(text, terminal_class, token_type): + """Return text wrapped in terminal_class of token_type, with defects if any. + + If text contains non-printable ASCII or undecodable bytes, add those + defects to the returned terminal_class object. 
+ + """ + vt = terminal_class(text, token_type=token_type) + non_printables = _non_printable_finder(text) + if non_printables: + vt.defects.append(errors.NonPrintableDefect(non_printables)) + if utils._has_surrogates(text): + vt.defects.append(errors.UndecodableBytesDefect( + "Non-ASCII characters found in header token")) + return vt + def _validate_xtext(xtext): """If input token contains ASCII non-printables, register a defect.""" diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e54b8d06f30e787..5187f753324c173 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -633,11 +633,45 @@ def test__wsp_splitter(self, s, res): ) - # _validate_xtext + # _make_xtext - # As an internal method these tests are not API requirements; however, the - # behavior they check must be verified one way or another, so if the - # implementation changes there need to be equivalent tests. 
+ @params + def test__make_xtext( + self, + s, + terminal_class=parser.ValueTerminal, + token_type='test', + **kw, + ): + vt = self._test_parse( + parser._make_xtext, + C(s, terminal_class, token_type), + stringified=('' if terminal_class.__name__.startswith('EW') + else None), + value=' ' if terminal_class.__name__.startswith('White') else None, + test_start=False, + **kw, + ) + self.assertEqual(vt.token_type, token_type) + + @params_map + def for_each_terminal_type(*args, **kw): + vt_types = ( + parser.ValueTerminal, + parser.WhiteSpaceTerminal, + parser.EWWhiteSpaceTerminal, + ) + for vt_type in vt_types: + yield vt_type.__name__, C(*args, **kw, terminal_class=vt_type) + + params_test__make_xtext = for_each_terminal_type( + + token_type = C('foo', token_type='bar'), + + ) + + + # _validate_xtext @params def test__validate_xtext(self, s, defects=[]): @@ -681,6 +715,12 @@ def test__validate_xtext(self, s, defects=[]): ) + params_test__make_xtext.update( + add_label('from_test_validate_xtext')( + for_each_terminal_type(params_test__validate_xtext), + ), + ) + # _get_ptext_to_endchars From 47988baa3633e0d457e58dd075d3301e9991a40b Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 10 May 2026 15:43:08 -0400 Subject: [PATCH 123/152] Preliminary deprecation of _validate_xtext. Although we haven't removed the calls to _validate_xtext, that is the goal, so this changeset deprecates it. That way the tests of refactored functions will raise errors if they use it. 
--- Lib/email/_header_value_parser.py | 1 + Lib/test/test_email/test__header_value_parser.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 77f931d6b525341..89026e863f97e6f 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1142,6 +1142,7 @@ def _make_xtext(text, terminal_class, token_type): "Non-ASCII characters found in header token")) return vt +@_deprecate('_get_xtext') def _validate_xtext(xtext): """If input token contains ASCII non-printables, register a defect.""" diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 5187f753324c173..92355c204f3fc45 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -668,6 +668,8 @@ def for_each_terminal_type(*args, **kw): token_type = C('foo', token_type='bar'), + # XXX POSTDEP: delete from here... + ) @@ -681,6 +683,8 @@ def test__validate_xtext(self, s, defects=[]): params_test__validate_xtext = Params( + # XXX POSTDEP: ...to here + valid = C('foo'), # Although it looks a bit odd for unicode to be acceptable when we have @@ -715,11 +719,13 @@ def test__validate_xtext(self, s, defects=[]): ) + # XXX POSTDEP: delete from here... params_test__make_xtext.update( add_label('from_test_validate_xtext')( for_each_terminal_type(params_test__validate_xtext), ), ) + # XXX POSTDEP: ...to here. # _get_ptext_to_endchars From bb459a20625597c8108175b27cfca6a12c512a16 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Tue, 13 Jan 2026 12:32:29 -0500 Subject: [PATCH 124/152] Add _get_xtext function. In addition to the "wrap text in Terminal and validate" pattern, there is an additional common pattern of finding that text via a regex and doing the wrapping and validating. _get_xtext layers on top of _make_xtext to encapsulate that pattern. 
--- Lib/email/_header_value_parser.py | 16 +++++ .../test_email/test__header_value_parser.py | 61 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 89026e863f97e6f..35a368056a623ab 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1153,6 +1153,22 @@ def _validate_xtext(xtext): xtext.defects.append(errors.UndecodableBytesDefect( "Non-ASCII characters found in header token")) +# _make_non_match_re is for use by the callers of _get_xtext. +_make_non_match_re = lambda s: re.compile(rf'[^{re.escape(s)}]+') +def _get_xtext(value, start, regex, terminal_class, token_type, err=None): + """Return text matching regex via _make_xtext, raise err if no match. + + Use the regex 'match' to identify a substring. If there is no match, raise + err. If there is a match, pass it to _make_xtext to create a + terminal_class of terminal_type. Return the terminal and the index of the + end of the match. + + """ + m = regex.match(value, start) + if m is None: + raise err + return _make_xtext(m.group(), terminal_class, token_type), m.end() + def _get_ptext_to_endchars(value, endchars): """Scan printables/quoted-pairs until endchars and return unquoted ptext. diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 92355c204f3fc45..3387744791b78f7 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -728,6 +728,67 @@ def test__validate_xtext(self, s, defects=[]): # XXX POSTDEP: ...to here. + # _get_xtext + + @params + def test__get_xtext( + self, + s, + # DOTALL allows the LF in our first test set to pass...in the + # normal use of _get_xtext LF will terminate the matches we use, + # leaving the LF (which shouldn't normally happen) for later code. 
+ regex=re.compile('.*', re.DOTALL), + terminal_class=parser.ValueTerminal, + token_type='test', + err=None, + **kw, + ): + vt = self._test_parse( + parser._get_xtext, + C(s, regex, terminal_class, token_type, err=err), + stringified=('' if terminal_class.__name__.startswith('EW') + else None), + value=' ' if terminal_class.__name__.startswith('White') else None, + **kw, + ) + if 'exception' in kw: + return + self.assertEqual(vt.token_type, token_type) + + params_test__get_xtext__regex = Params( + + params_test__make_xtext, + + raises_on_no_match = C( + 'foo bar', + regex=re.compile(r'x'), + err=Exception('foo'), + exception=(Exception, 'foo'), + ), + + returns_match = C( + 'foo bar', + regex=re.compile(r'[^ ]+'), + remainder=' bar', + ), + + ignores_non_printable_after_match = C( + 'foobar\x00', + regex=re.compile(r'[^b]+'), + remainder='bar\x00', + ), + + **for_each_character(RFC_WSP + '()')( + regex_from_make_non_match_re = C( + 'foo{char}bar', + regex=parser._make_non_match_re(RFC_WSP + '()'), + remainder='{char}bar', + ), + ), + + ) + + # _get_ptext_to_endchars # As an internal method these tests are not API requirements; however, the From fe046755d931cc1091ac1c76656a39c567f8698c Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 2 Jan 2026 16:59:10 -0500 Subject: [PATCH 125/152] Factor a _decode out of decode in _encoded_words. We want this sub-function because our refactored encoded word parsing is going to already have parsed the encoded word components. The changes to the encode docstring are to (hopefully) make the description of its existing functionality more correct. 
--- Lib/email/_encoded_words.py | 22 ++++- Lib/test/test_email/test__encoded_words.py | 108 ++++++++++++++------- 2 files changed, 91 insertions(+), 39 deletions(-) diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py index 05a34a4c1052336..65a6ddd2386f0ff 100644 --- a/Lib/email/_encoded_words.py +++ b/Lib/email/_encoded_words.py @@ -163,8 +163,8 @@ def decode(ew): the encoded_string decoded first from its Content Transfer Encoding and then from the resulting bytes into unicode using the specified charset. If the cte-decoded string does not successfully decode using the specified - character set, a defect is added to the defects list and the unknown octets - are replaced by the unicode 'unknown' character \\uFDFF. + character set, a defect is added to the defects list. If the charset + is invalid or not found, a defect is added to the defects list. The specified charset and language are returned. The default for language, which is rarely if ever encountered, is the empty string. @@ -172,6 +172,20 @@ def decode(ew): """ _, charset, cte, cte_string, _ = ew.split('?') charset, _, lang = charset.partition('*') + string, defects = _decode(charset, cte, cte_string) + return string, charset, lang, defects + + +def _decode(charset, cte, cte_string): + """Return cte_string decoded using cte and charset and a list of defects. + + Use cte to turn cte_string into bytes, then decode those bytes using + charset and the surrogateescape error handler. Return a possibly empty + list of defects: return a CharsetError if the charset name is invalid or + unknown, and an UndecodableBytesDefect if there are any bytes the charset + cannot decode. + + """ cte = cte.lower() # Recover the original bytes and do CTE decoding. 
bstring = cte_string.encode('ascii', 'surrogateescape') @@ -184,11 +198,13 @@ def decode(ew): f"contains bytes not decodable using {charset!r} charset")) string = bstring.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): + # In this context a UnicodeEncodeError results when the charset name is + # not a valid ASCII string. string = bstring.decode('ascii', 'surrogateescape') if charset.lower() != 'unknown-8bit': defects.append(errors.CharsetError(f"Unknown charset {charset!r} " f"in encoded word; decoded as unknown bytes")) - return string, charset, lang, defects + return string, defects _cte_encoders = { diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index d8884bc8a69189f..bcbdce9ac4ecffd 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -86,57 +86,81 @@ def test_decode_raises_if_value(self, value, exception=ValueError): @params - def test_decode( + def test__decode( self, - source, + cte_encoded, + *, + cte, + charset='us-ascii', result, + defects=[], + ): + actual, actual_defects = _ew._decode(charset, cte, cte_encoded) + self.assertEqual(actual, result) + self.assertDefectsEqual(actual_defects, defects) + + @params + def test_decode( + self, + cte_encoded, + *, + cte, charset='us-ascii', lang='', + result, defects=[], ): - actual, actual_charset, actual_lang, actual_defects = _ew.decode(source) + ew = f'=?{charset}{lang and '*'}{lang}?{cte}?{cte_encoded}?=' + actual, actual_charset, actual_lang, actual_defects = _ew.decode(ew) self.assertEqual(actual, result) self.assertEqual(actual_charset, charset) self.assertEqual(actual_lang, lang) self.assertDefectsEqual(actual_defects, defects) - params_test_decode = Params( + params_test__decode = params_test_decode = Params( simple_q = C( - '=?us-ascii?q?foo?=', 'foo', + cte='q', + result='foo', ), simple_b = C( - '=?us-ascii?b?dmk=?=', - 'vi', + 'dmk=', + cte='b', + result='vi', ), 
q_case_ignored = C( - '=?us-ascii?Q?foo?=', 'foo', + cte='Q', + result='foo', ), b_case_ignored = C( - '=?us-ascii?B?dmk=?=', - 'vi', + 'dmk=', + cte='B', + result='vi', ), non_trivial_q = C( - '=?latin-1?q?=20F=fcr=20Elise=20?=', - ' Für Elise ', + '=20F=fcr=20Elise=20', + cte='q', + result=' Für Elise ', charset='latin-1', ), q_escaped_bytes_preserved = C( - b'=?us-ascii?q?=20\xACfoo?='.decode('us-ascii', 'surrogateescape'), - ' \uDCACfoo', + b'=20\xACfoo'.decode('us-ascii', 'surrogateescape'), + cte='q', + result=' \uDCACfoo', defects=[errors.UndecodableBytesDefect], ), b_undecodable_bytes_ignored_with_defect = C( - b'=?us-ascii?b?dm\xACk?='.decode('us-ascii', 'surrogateescape'), - 'vi', + b'dm\xACk'.decode('us-ascii', 'surrogateescape'), + cte='b', + result='vi', defects=[ errors.InvalidBase64CharactersDefect, errors.InvalidBase64PaddingDefect, @@ -144,14 +168,16 @@ def test_decode( ), b_invalid_bytes_ignored_with_defect = C( - '=?us-ascii?b?dm\x01k===?=', - 'vi', + 'dm\x01k===', + cte='b', + result='vi', defects=[errors.InvalidBase64CharactersDefect], ), b_invalid_bytes_incorrect_padding = C( - '=?us-ascii?b?dm\x01k?=', - 'vi', + 'dm\x01k', + cte='b', + result='vi', defects=[ errors.InvalidBase64CharactersDefect, errors.InvalidBase64PaddingDefect, @@ -159,48 +185,58 @@ def test_decode( ), b_padding_defect = C( - '=?us-ascii?b?dmk?=', - 'vi', + 'dmk', + cte='b', + result='vi', defects=[errors.InvalidBase64PaddingDefect], ), - nonnull_lang = C( - '=?us-ascii*jive?q?test?=', - 'test', - lang='jive', - ), - unknown_8bit_charset = C( - '=?unknown-8bit?q?foo=ACbar?=', - b'foo\xacbar'.decode('ascii', 'surrogateescape'), + 'foo=ACbar', + cte='q', + result=b'foo\xacbar'.decode('ascii', 'surrogateescape'), charset='unknown-8bit', defects=[], ), unknown_charset = C( - '=?foobar?q?foo=ACbar?=', - b'foo\xacbar'.decode('ascii', 'surrogateescape'), + 'foo=ACbar', + cte='q', + result=b'foo\xacbar'.decode('ascii', 'surrogateescape'), charset='foobar', # XXX Should this be a 
new Defect instead? defects=[errors.CharsetError], ), invalid_character_in_charset = C( - '=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=', - b'foo\xacbar'.decode('ascii', 'surrogateescape'), + 'foo=ACbar', + cte='q', + result=b'foo\xacbar'.decode('ascii', 'surrogateescape'), charset='utf-8\udce2\udc80\udc9d', # XXX Should this be a new Defect instead? defects=[errors.CharsetError], ), q_nonascii = C( - '=?utf-8?q?=C3=89ric?=', - 'Éric', + '=C3=89ric', + cte='q', + result='Éric', charset='utf-8', ), ) + params_test_decode__lang = Params( + + nonnull_lang = C( + 'test', + cte='q', + result='test', + lang='jive', + ), + + ) + class TestEncoders(TestEmailBase): From 52cf8ebfa95f0c5164869ae6cf1f1450906e5fea Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Tue, 30 Dec 2025 14:43:31 -0500 Subject: [PATCH 126/152] Refactor get_encoded_word. In addition to the 'start' API change, this function's API also turns from exception based to LBYL: it now returns None if it finds no encoded word, instead of raising an exception. The reason is that the way this will now be used it is equally or more likely that the "no encoded word" case will be the path taken, making LBYL more efficient than an exception. It now requires that the terminal_type be specified, since I've decided that 'vtext' should go away and be replaced by the specification of which context the encoded word was decoded in. The folder uses that "source" knowledge, so rather than it looking for the default 'vtext' it should (by the time I'm done) look for the actual context name (atext, ptext, utext). I'm also dropping the check for trailing whitespace on the encoded word from this function. That was a "practicality" hack; to be logically correct the code that is parsing the structure where the encoded word is found should be deciding whether or not a trailing whitespace is required. 
In addition I tweaked the defect message for whitespace inside an encoded word to use the rfc notation "encoded-word" instead of "encoded word". Efficiency is improved at several layers, since we are now using 're' once to gather all the information about whether or not there even is an encoded word, and if there is, everything we need to have to decode it using the new _decode function of _encoded_words. This eliminates a number of string slices. It also means that we are now passing the stripped charset name to '_decode' instead of 'decode' extracting the charset itself, which is why the ws_only_charset_leads_to_undecodable_bytes_with_non_ascii test changed its expectations slightly. The defect message content now matches what the 'charset' attribute would report, so I consider that an improvement. BUGFIX: if any '=' that is not part of a 'q' CTE encoding sequence, or any '?'s, appear in the content of a 'q' encoded word, or the charset name is empty, the word will now be decoded from the CTE if possible. Previously these cases were treated as always undecodable. This is a return to the more generous decoding policy of the older email API. Because of this bugfix, the 'too_many_qm' test is removed from the invalid encoded word list, and instead the '?' being accepted is now checked by the q_content_may_contain test. BUGFIX: previously the parser would register both a NonPrintableDefect and an InvalidHeaderDefect for "whitespace inside encoded word" when the body of the encoded word contained a non-rfc-whitespace non-printable that the python 'split' function considers to be whitespace (horizontal tab, new line, vertical tab, form feed, carriage return, file separator, group separator, record separator, and unit separator). The redundant InvalidHeaderDefect is no longer registered. Test changes for this bugfix were marked by comments that are now resolved.
--- Lib/email/_header_value_parser.py | 134 ++++++++++++------ .../test_email/test__header_value_parser.py | 110 ++++++++------ 2 files changed, 153 insertions(+), 91 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 35a368056a623ab..32c7be3229da7a8 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -70,7 +70,6 @@ import re import sys import urllib # For urllib.parse.unquote -from string import hexdigits from operator import itemgetter from email import _encoded_words as _ew from email import errors @@ -1235,58 +1234,104 @@ def get_fws(value, start): fws = WhiteSpaceTerminal(m.group(), 'fws') return fws, m.end() -def get_encoded_word(value, terminal_type='vtext'): +# We need a custom deprecation for this one because we want terminal_type to be +# required, a return of None instead of exceptions, and for the trailing +# whitespace defect addition to move elsewhere. +def _deprecate_old_encoded_word_api(func): + @wraps(func) + def dispatch(value, *args, **kw): + if args and isinstance(args[0], int): + return func(value, *args, **kw) + from warnings import _deprecated + _deprecated(func.__name__, _API_CHANGE_MSG, remove=OLDAPIREMVER) + kw.setdefault('terminal_type', args[0] if args else 'vtext') + result = func(value, 0, **kw) + if result is None: + raise _InvalidEwError(f"expected encoded word but found {value}") + result, start = result + ew, value = result, value[start:] + if value and value[0] not in WSP: + ew.defects.append(errors.InvalidHeaderDefect( + "missing trailing whitespace after encoded-word")) + return ew, value + return dispatch + +# This match is generous; defects are detected during ew parsing. +_ew_finder = re.compile(r''' + =\? # literal =? + ( # We might have 'charset' or 'charset*lang' next. + ( # First case: no * + (?P<csnolang>[^?*]*?) # non-greedy to next ? if no * is the charset + \? # literal ? + ) + | + ( # Second case: charset*lang + (?P<cslang>[^?*]*?)
# non-greedy to * is the charset + \* # literal * + (?P<lang>[^?]*?) # non-greedy to next ? is the lang + \? # literal ? + ) + ) + (?P<cte>[^?]*?) # non-greedy up to the next ? is the CTE + \? # literal ? + (?P<encoded>.*?) # non-greedy to next ?= is the encoded string + \?= # literal ?= + ''', re.VERBOSE | re.DOTALL).match +_wsp_finder = re.compile(rf'[{_WSP}]+').search +_non_wsp_re = _make_non_match_re(_WSP) +@_deprecate_old_encoded_word_api +def get_encoded_word(value, start, terminal_type): """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" + If something interpretable as an encoded word occurs starting at start, + return an EncodedWord token list with the decoded text decomposed into + whitespace and non-whitespace value terminals, and the index of the last + character of the encoded word (the '=') plus one. Register a defect if + there is un-encoded whitespace inside the encoded word, and register + defects for any any non-printable or invalid characters in the + non-whitespace ValueTerminals. + + If the characters starting at start are not interpretable as an encoded + word such that it can be decoded from the content transfer encoding, return + None. + """ + ew_match = _ew_finder(value, start) + if ew_match is None: + return ew = EncodedWord() - if not value.startswith('=?'): - raise errors.HeaderParseError( - "expected encoded word but found {}".format(value)) - tok, *remainder = value[2:].split('?=', 1) - if tok == value[2:]: - raise errors.HeaderParseError( - "expected encoded word but found {}".format(value)) - remstr = ''.join(remainder) - if (len(remstr) > 1 and - remstr[0] in hexdigits and - remstr[1] in hexdigits and - tok.count('?') < 2): - # The ? after the CTE was followed by an encoded word escape (=XX).
- rest, *remainder = remstr.split('?=', 1) - tok = tok + '?=' + rest - if len(tok.split()) > 1: - ew.defects.append(errors.InvalidHeaderDefect( - "whitespace inside encoded word")) - ew.cte = value - value = ''.join(remainder) - try: - text, charset, lang, defects = _ew.decode('=?' + tok + '?=') - except (ValueError, KeyError): - raise _InvalidEwError( - "encoded word format invalid: '{}'".format(ew.cte)) - if any(isinstance(x, errors.InvalidBase64LengthDefect) for x in defects): - raise _InvalidEwError( - "encoded word could not be decoded: '{}'".format(ew.cte), - ) + csnolang, cslang, lang, cte, encoded = ew_match.group( + 'csnolang', 'cslang', 'lang', 'cte', 'encoded') + charset, lang = cslang or csnolang or '', lang or '' ew.charset = charset.strip() ew.lang = lang.strip() + try: + text, defects = _ew._decode(ew.charset, cte, encoded) + except KeyError: + # With an unknown CTE we can't decode the content. We could just + # return it, but that would be less clear than leaving the ew alone. 
+ return None + if any(isinstance(x, errors.InvalidBase64LengthDefect) for x in defects): + return None ew.defects.extend(defects) - while text: - if text[0] in WSP: - token, text = get_fws(text) + if _wsp_finder(ew_match.group()): + ew.defects.append(errors.InvalidHeaderDefect( + "whitespace inside encoded-word")) + tptr, tlen = 0, len(text) + while tptr < tlen: + if text[tptr] in WSP: + token, tptr = get_fws(text, tptr) ew.append(token) continue - chars, *remainder = _wsp_splitter(text, 1) - vtext = ValueTerminal(chars, terminal_type) - _validate_xtext(vtext) - ew.append(vtext) - text = ''.join(remainder) - # Encoded words should be followed by a WS - if value and value[0] not in WSP: - ew.defects.append(errors.InvalidHeaderDefect( - "missing trailing whitespace after encoded-word")) - return ew, value + t, tptr = _get_xtext( + text, + tptr, + _non_wsp_re, + ValueTerminal, + terminal_type, + ) + ew.append(t) + return ew, ew_match.end() def get_unstructured(value): """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct @@ -1310,7 +1355,6 @@ def get_unstructured(value): # XXX: but what about bare CR and LF? They might signal the start or # end of an encoded word. YAGNI for now, since our current parsers # will never send us strings with bare CR or LF. 
- unstructured = UnstructuredTokenList() while value: if value[0] in WSP: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 3387744791b78f7..48c6955a9ec3d04 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -86,7 +86,7 @@ def nonprintable_defect(chars): whitespace_inside_ew_defect = ( errors.InvalidHeaderDefect, - 'whitespace inside encoded word', + 'whitespace inside encoded-word', ) missing_whitespace_before_ew_defect = ( @@ -518,6 +518,8 @@ def _test_parse( defects = [] if defects is None else defects with warningscheck: result = callspec(method) + if result is None: + return if isinstance(result, (parser.TokenList, parser.Terminal)): other = [] else: @@ -987,32 +989,58 @@ def test_get_encoded_word( *args, charset='us-ascii', lang='', + # XXX POSTDEP: delete the following line: terminal_type=None, + # XXX POSTDEP: uncomment following line: + # terminal_type='ttext', + prefix=None, + expect_none=False, **kw, ): - callspec = C(s) if terminal_type is None else C(s, terminal_type) - res = self._test_parse(parser.get_encoded_word, callspec, *args, **kw) + # XXX POSTDEP: delete from here... 
+ if 'warnings' in kw: + # old api + callspec = C(s) if terminal_type is None else C(s, terminal_type) + terminal_type = terminal_type or 'vtext' + if (r := kw.get('remainder')) and r[0] not in RFC_WSP: + kw['defects'] = kw.get('defects', []) + [ + missing_whitespace_after_ew_defect, + ] + else: + terminal_type = terminal_type or 'ttext' + callspec = C(s, terminal_type) + # XXX POSTDEP: ...to here + # XXX POSTDEP: uncomment the following line: + #callspec = C(s, terminal_type) + ew = self._test_parse(parser.get_encoded_word, callspec, *args, **kw) if 'exception' in kw: return - self.assertEqual(res.charset, charset) - self.assertEqual(res.lang, lang) - terminal_type = 'vtext' if terminal_type is None else terminal_type - self.verify_terminal_types(res, terminal_type, 'fws') + if expect_none: + self.assertIsNone(ew) + return + self.assertEqual(ew.charset, charset) + self.assertEqual(ew.lang, lang) + self.verify_terminal_types(ew, terminal_type, 'fws') # This params_map will handle either single strings or C objects. @params_map - def expect_get_encoded_word_raise(v, *args, **kw): + def invalid_encoded_words(v, *args, **kw): + # XXX POSTDEP: change 'newapi' to '' in the next line. + yield 'newapi', C(v, expect_none=True) + # XXX POSTDEP: delete from here... 
newspec = C( v, *args, # "expected encoded word but found '...'" exception=(errors.HeaderParseError, re.escape(v)), + warnings=[(DeprecationWarning, r"(?=.*API)(?=.*has changed)")], test_start=False, **kw, ) yield 'oldapi', newspec + # XXX POSTDEP: ...to here - params_test_get_encoded_word__invalid_input = expect_get_encoded_word_raise( + params_test_get_encoded_word__invalid_input = invalid_encoded_words( null_string = '', no_chrome = 'content', eq_only = '=content', @@ -1039,7 +1067,6 @@ def expect_get_encoded_word_raise(v, *args, **kw): unknown_cte = '=?UTF-8?X?content?=', invalid_base64_length = '=?utf-8?b?abcde?=', multicharacter_cte = '=?UTF-8?qq?content?=', - too_many_qm = '=?UTF-8?q?q?content?=', empty_lang = '=?UTF-8*??q?content?=', lang_with_empty_charset = '=?*foo??q?content?=', **for_each_character(ALL_ASCII)( @@ -1047,7 +1074,18 @@ def expect_get_encoded_word_raise(v, *args, **kw): ), ) - params_test_get_encoded_word = old_api_only( + # XXX POSTDEP: delete from here... + def test_get_encoded_word_old_api_supports_keywords(self): + self._test_parse( + parser.get_encoded_word, + C('=?UTF-8?q?foo?=', terminal_type='a'), + stringified='foo', + warnings=[(DeprecationWarning, r"(?=.*API)(?=.*has changed)")], + test_start=False, + ) + # XXX POSTDEP: ...to here. + + params_test_get_encoded_word = for_each_api( valid_ew = C( '=?us-ascii?q?this_is_a_test?= bird', @@ -1055,15 +1093,11 @@ def expect_get_encoded_word_raise(v, *args, **kw): remainder=' bird', ), - # XXX XXX the skip for the RFC_WSP will go away after refactor. It's - # here because it would be a pain to handle the lack of the defect, - # which will go away in the refactor. 
- **for_each_character(ALL_ASCII, skip=RFC_WSP)( + **for_each_character(ALL_ASCII)( ew_followed_by = C( '=?us-ascii?q?foo?={char}', stringified='foo', remainder='{char}', - defects=[missing_whitespace_after_ew_defect], ), ), @@ -1071,8 +1105,7 @@ def expect_get_encoded_word_raise(v, *args, **kw): # the context from which get_encoded_word is called (ex: ()s are # illegal in comment encoded words), but but at least at the moment # that it isn't worth the effort to implement. - # XXX XXX the skip for ? is a bug which will be fixed in the refactor - **for_each_character(RFC_PRINTABLES, skip='_?')( + **for_each_character(RFC_PRINTABLES, skip='_')( q_content_may_contain = C( '=?us-ascii?q?foo_{char}_bar_{char}?=', stringified='foo {char} bar {char}', @@ -1093,12 +1126,9 @@ def expect_get_encoded_word_raise(v, *args, **kw): remainder=' =?utf-8?q?second?=', ), - # XXX XXX This defect will also go away (gets detected higher up) only_gets_first_ew_even_if_no_space = C( '=?us-ascii?q?first?==?utf-8?q?second?=', stringified='first', - # 'missing trailing whitespace after encoded-word' - defects=[missing_whitespace_after_ew_defect], remainder='=?utf-8?q?second?=', ), @@ -1114,12 +1144,7 @@ def expect_get_encoded_word_raise(v, *args, **kw): charset='utf-8', ), - **for_each_character( - RFC_NONPRINTABLES, - # XXX XXX skip things split considers whitespace. This is buggy. - # US RS GS FS - skip=RFC_WSP + '\r\n\v\f\x1f\x1e\x1d\x1c', - )( + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( non_printable_defect = C( '=?us-ascii?q?first{char}second?=', stringified='first{char}second', @@ -1211,7 +1236,7 @@ def expect_get_encoded_word_raise(v, *args, **kw): stringified='\udcc3\udc89ric', charset='', defects=[ - charset_defect(' '), + charset_defect(''), undecodable_bytes_defect, whitespace_inside_ew_defect, ], @@ -1265,7 +1290,7 @@ def test_get_unstructured(self, s, *args, **kw): # so it should correctly handle most get_encoded_word parameters. 
@params_map(with_namelist=True) def adapt_get_encoded_word_tests_for_get_unstructured(nl, *args, **kw): - kw.pop('test_start') + kw.pop('test_start', None) kw.pop('charset', None) kw.pop('terminal_type', None) kw.pop('lang', None) @@ -1283,9 +1308,10 @@ def adapt_get_encoded_word_tests_for_get_unstructured(nl, *args, **kw): rstripped = remainder.lstrip(RFC_WSP) if remainder != rstripped: kw['value'] = kw.get('value', stringified) + ' ' + rstripped - # Drop the 'warning=...' added by only_old_api; we're doing it ourselves - # in the test method. - kw.pop('warnings') + if 'oldapi' in nl: + # get_encoded_word is checking for warnings about its old api being + # deprecated, but parse_unstructured don't have an API change. + kw.pop('warnings') yield 'from_test_get_encoded_word', C(*args, **kw) @params_map(with_namelist=True) @@ -1465,8 +1491,11 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): defects=[missing_whitespace_after_ew_defect], ), + # Although this is technically invalid (unencoded =) we handle it anyway + # XXX there should be a defect, which is currently missing. invalid_ew2 = C( '=?utf-8?q?=somevalue?=', + '=somevalue', ), **for_each_character(RFC_PRINTABLES)( @@ -1476,20 +1505,14 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): ), ), - # XXX XXX the '?=' skip is a sort-of bug the refactoring will fix. - **for_each_character(RFC_PRINTABLES, skip='_?=')( + **for_each_character(RFC_PRINTABLES, skip='_')( printable_inside_ews = C( '=?utf-8?q?rock{char}?= =?utf-8?q?{char}hard_place?=', stringified='rock{char}{char}hard place', ), ), - **for_each_character( - RFC_NONPRINTABLES, - # XXX XXX skip things split considers whitespace. This is buggy. 
- # US RS GS FS - skip=RFC_WSP + '\r\n\v\f\x1f\x1e\x1d\x1c', - )( + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( non_wsp_non_printable = C( 'some {char} text', stringified='some {char} text', @@ -2197,12 +2220,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): commenttree=[['foo '], [' bar']], ), - **for_each_character( - ALL_ASCII, - # XXX XXX skip things split considers whitespace. This is buggy. - # US RS GS FS - skip=CFWS_LEADER + '\r\n\v\f\x1f\x1e\x1d\x1c', - )( + **for_each_character(ALL_ASCII, skip=CFWS_LEADER)( ends_at_non_comment_non_ws = C( '(foo) {char}', remainder='{char}', From 5ede05d8dd63728ed4d2d89d37a1f514a7162e2a Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 5 Jan 2026 16:34:25 -0500 Subject: [PATCH 127/152] Refactor get_unstructured into parse_unstructured. As one of the XXX comments said, this routine really should have been named parse_unstructured from the beginning. This is an opportunity to make that change. There are going to be more places we'll be adding whitespace defects, since the other places they need to be detected are at higher syntactic levels, so I've introduced a couple constants for the two versions of the defect to make sure things stay consistent and simplify the code. This also changes the text of an existing defect messages, but only slightly (removing a redundant word, changing "encoded word" to the rfc terminology "encoded-word"). _InvalidEwError is no longer used except by the backward compatibility code, so that gets deprecated (deprecating it is overkill, but why not). rfc2047_matcher is now entirely unused. It is sort of replaced by _ew_finder, but the two are different enough, and the latter marked as internal while the former wasn't, that I'm not suggesting it as a replacement. 
--- Lib/email/_header_value_parser.py | 132 +++++++++--------- .../test_email/test__header_value_parser.py | 45 ++++-- 2 files changed, 100 insertions(+), 77 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 32c7be3229da7a8..75402ca05e56459 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -115,7 +115,7 @@ def quote_string(value): # Match a RFC 2047 word, looks like =?utf-8?q?someword?= -rfc2047_matcher = re.compile(r''' +_deprecated_rfc2047_matcher = re.compile(r''' =\? # literal =? [^?]* # charset \? # literal ? @@ -973,7 +973,7 @@ def __str__(self): return '' -class _InvalidEwError(errors.HeaderParseError): +class _deprecated__InvalidEwError(errors.HeaderParseError): """Invalid encoded word found while parsing headers.""" @@ -1113,6 +1113,16 @@ def _(func): # # returns a complete 'phrase' from 'start' to 'rest' in the value. +# Often used Defects. XXX These could become subclasses. + +_MissingWhitespaceBeforeEWDefect = errors.InvalidHeaderDefect( + "missing whitespace before encoded-word", + ) + +_MissingWhitespaceAfterEWDefect = errors.InvalidHeaderDefect( + "missing whitespace after encoded-word", + ) + _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(ATOM_ENDS)))).match @@ -1247,12 +1257,13 @@ def dispatch(value, *args, **kw): kw.setdefault('terminal_type', args[0] if args else 'vtext') result = func(value, 0, **kw) if result is None: - raise _InvalidEwError(f"expected encoded word but found {value}") + raise _deprecated__InvalidEwError( + f"expected encoded word but found {value}", + ) result, start = result ew, value = result, value[start:] if value and value[0] not in WSP: - ew.defects.append(errors.InvalidHeaderDefect( - "missing trailing whitespace after encoded-word")) + ew.defects.append(_MissingWhitespaceAfterEWDefect) return ew, value return dispatch @@ -1333,71 +1344,63 @@ 
def get_encoded_word(value, start, terminal_type): ew.append(t) return ew, ew_match.end() -def get_unstructured(value): +pre_ew_re = re.compile(rf'[^{_WSP + '='}]*') +def parse_unstructured(value): """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS) obs-utext = %d0 / obs-NO-WS-CTL / LF / CR + obs-NO-WS-CTL = - obs-NO-WS-CTL is control characters except WSP/CR/LF. - - So, basically, we have printable runs, plus control characters or nulls in - the obsolete syntax, separated by whitespace. Since RFC 2047 uses the - obsolete syntax in its specification, but requires whitespace on either - side of the encoded words, I can see no reason to need to separate the - non-printable-non-whitespace from the printable runs if they occur, so we - parse this into xtext tokens separated by WSP tokens. - - Because an 'unstructured' value must by definition constitute the entire - value, this 'get' routine does not return a remaining value, only the - parsed TokenList. + Return an UnstructuredTokenList containing whitespace and non-whitespace + tokens obtained from value, decoding any encoded words found, regardless of + whitespace, into EncodedWord tokens lists. Register defects if the encoded + words are not correctly surrounded by whitespace or the ends of the value + or have internal whitespace. Register defects if the non-whitespace tokens + contain any non-printable or invalid characters. All ValueTerminals + should have the token_type 'utext'. """ - # XXX: but what about bare CR and LF? They might signal the start or - # end of an encoded word. YAGNI for now, since our current parsers - # will never send us strings with bare CR or LF. - unstructured = UnstructuredTokenList() - while value: - if value[0] in WSP: - token, value = get_fws(value) - unstructured.append(token) + # We don't actually accept LF or CR, we treat them as a non-printable + # defect. 
This is because the parser is designed to process strings where + # unfolding has already been done, pre-handling legal CR/LF characters. + tl = UnstructuredTokenList() + start, vlen = 0, len(value) + while start < vlen: + if value[start] in WSP: + token, start = get_fws(value, start) + tl.append(token) continue - valid_ew = True - if value.startswith('=?'): - try: - token, value = get_encoded_word(value, 'utext') - except _InvalidEwError: - valid_ew = False - except errors.HeaderParseError: - # XXX: Need to figure out how to register defects when - # appropriate here. - pass - else: - have_ws = True - if len(unstructured) > 0: - if unstructured[-1].token_type != 'fws': - unstructured.defects.append(errors.InvalidHeaderDefect( - "missing whitespace before encoded word")) - have_ws = False - if have_ws and len(unstructured) > 1: - if unstructured[-2].token_type == 'encoded-word': - unstructured[-1] = EWWhiteSpaceTerminal( - unstructured[-1], 'fws') - unstructured.append(token) - continue - tok, *remainder = _wsp_splitter(value, 1) - # Split in the middle of an atom if there is a rfc2047 encoded word - # which does not have WSP on both sides. The defect will be registered - # the next time through the loop. - # This needs to only be performed when the encoded word is valid; - # otherwise, performing it on an invalid encoded word can cause - # the parser to go in an infinite loop. - if valid_ew and rfc2047_matcher.search(tok): - tok, *remainder = value.partition('=?') - vtext = ValueTerminal(tok, 'utext') - _validate_xtext(vtext) - unstructured.append(vtext) - value = ''.join(remainder) - return unstructured + ew = None + m = pre_ew_re.match(value, start) + end = m.end() + if end < vlen: + if value[end] == '=': + res = get_encoded_word(value, end, 'utext') + if res: + ew, end = res + else: + m = _non_wsp_re.match(value, start) + ew, end = None, m.end() + text = m.group() + # At this point we have text, an ew, or both; we can't have neither. 
+ if tl and tl[-1].token_type == 'encoded-word': + tl.defects.append(_MissingWhitespaceAfterEWDefect) + if text: + tl.append(_make_xtext(text, ValueTerminal, 'utext')) + if ew: + if tl: + if tl[-1].token_type == 'fws': + if len(tl) > 1 and tl[-2].token_type == 'encoded-word': + tl[-1] = EWWhiteSpaceTerminal(tl[-1], 'fws') + else: + tl.defects.append(_MissingWhitespaceBeforeEWDefect) + tl.append(ew) + start = end + return tl + +@_deprecate('parse_unstructured') +def get_unstructured(value): + return parse_unstructured(value) def get_qp_ctext(value): r"""ctext = @@ -2433,8 +2436,7 @@ def parse_message_ids(value): # XXX: As I begin to add additional header parsers, I'm realizing we probably # have two level of parser routines: the get_XXX methods that get a token in # the grammar, and parse_XXX methods that parse an entire field value. So -# get_address_list above should really be a parse_ method, as probably should -# be get_unstructured. +# get_address_list above should really be a parse_ method. # def parse_mime_version(value): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 48c6955a9ec3d04..b4830518cbcc409 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -91,12 +91,12 @@ def nonprintable_defect(chars): missing_whitespace_before_ew_defect = ( errors.InvalidHeaderDefect, - 'missing whitespace before encoded word', + 'missing whitespace before encoded-word', ) missing_whitespace_after_ew_defect = ( errors.InvalidHeaderDefect, - 'missing trailing whitespace after encoded-word', + 'missing whitespace after encoded-word', ) def charset_defect(chars): @@ -245,6 +245,8 @@ def _deprecated_bar(a): @params(as_value( # XXX XXX make sure this is completely filled in with all the # names we expect to be deprecated. 
+ '_InvalidEwError', + 'rfc2047_matcher', )) def test_deprecated_names(self, name): with check_all_warnings(( @@ -1271,8 +1273,22 @@ def test_get_encoded_word_old_api_supports_keywords(self): ) - # get_unstructured + # parse_unstructured + @params + def test_parse_unstructured(self, s, *args, **kw): + result = self._test_parse( + parser.parse_unstructured, + C(s), + *args, + test_start=False, + no_end=True, + **kw, + ) + self.assertIsInstance(result, parser.UnstructuredTokenList) + self.verify_terminal_types(result, 'utext', 'fws') + + # XXX POSTDEP: delete from here... @params def test_get_unstructured(self, s, *args, **kw): result = self._test_parse( @@ -1280,25 +1296,29 @@ def test_get_unstructured(self, s, *args, **kw): C(s), *args, test_start=False, - warnings=..., # XXX XXX ignore warnings until after refactor. + no_end=True, + warnings=[ + (DeprecationWarning, r".*is.*deprecated.*parse_unstructured"), + ], **kw, ) self.assertIsInstance(result, parser.UnstructuredTokenList) self.verify_terminal_types(result, 'utext', 'fws') + # XXX POSTDEP: ...to here - # get_unstructured should correctly decode anything get_encoded_word does, + # parse_unstructured should correctly decode anything get_encoded_word does, # so it should correctly handle most get_encoded_word parameters. @params_map(with_namelist=True) - def adapt_get_encoded_word_tests_for_get_unstructured(nl, *args, **kw): + def adapt_get_encoded_word_tests_for_parse_unstructured(nl, *args, **kw): kw.pop('test_start', None) kw.pop('charset', None) kw.pop('terminal_type', None) kw.pop('lang', None) - # get_unstructured parses all of its input, so it will also parse and + # parse_unstructured parses all of its input, so it will also parse and # return anything get_encoded_word treats as a remainder. remainder = kw.pop('remainder', '') if '=?' 
in remainder or 'ew_followed_by' in nl: - # The remainder includes something get_unstructured would decode, + # The remainder includes something parse_unstructured would decode, # or might contain something it would treat as a defect. Either # way, parse_unstructured isn't expected to handle those parameters. return @@ -1315,7 +1335,7 @@ def adapt_get_encoded_word_tests_for_get_unstructured(nl, *args, **kw): yield 'from_test_get_encoded_word', C(*args, **kw) @params_map(with_namelist=True) - def adapt_get_encoded_word_invalid_input_for_get_unstructured(nl, s, **kw): + def adapt_get_encoded_word_invalid_input_for_parse_unstructured(nl, s, **kw): # Get unstructured should return the inputs unaltered, # except for the ones where the ew itself is valid. if 'character_before_valid_ew' in nl: @@ -1336,13 +1356,14 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): } yield '', C(s, *args, **kw) - params_test_get_unstructured = Params( + # XXX POSTDEP: remove 'params_test_get_unstructured' from next line. + params_test_get_unstructured = params_test_parse_unstructured = Params( add_unstructured_prefix_and_suffix( - adapt_get_encoded_word_tests_for_get_unstructured( + adapt_get_encoded_word_tests_for_parse_unstructured( params_test_get_encoded_word, ), - adapt_get_encoded_word_invalid_input_for_get_unstructured( + adapt_get_encoded_word_invalid_input_for_parse_unstructured( params_test_get_encoded_word__invalid_input, ), ), From 7301ea28cb20a7c4e07e3603bbfed7bd0421454b Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 31 Jan 2026 10:51:31 -0500 Subject: [PATCH 128/152] Deprecate _wsp_splitter. This function is no longer used outside of the deprecated _get_ptext_to_endchars. Adding a deprecation warning is probably overkill, but it's easy to do, so why not. 
--- Lib/email/_header_value_parser.py | 4 ++-- Lib/test/test_email/test__header_value_parser.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 75402ca05e56459..41eaacfee72d55c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1123,7 +1123,7 @@ def _(func): "missing whitespace after encoded-word", ) -_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split +_deprecated__wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(ATOM_ENDS)))).match _non_token_end_matcher = re.compile(r"[^{}]+".format( @@ -1189,7 +1189,7 @@ def _get_ptext_to_endchars(value, endchars): """ if not value: return '', '', False - fragment, *remainder = _wsp_splitter(value, 1) + fragment, *remainder = _deprecated__wsp_splitter(value, 1) vchars = [] escape = False had_qp = False diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b4830518cbcc409..e4006007458f603 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -247,6 +247,7 @@ def _deprecated_bar(a): # names we expect to be deprecated. '_InvalidEwError', 'rfc2047_matcher', + '_wsp_splitter', )) def test_deprecated_names(self, name): with check_all_warnings(( @@ -624,11 +625,13 @@ class TestParser(TestParserMixin, TestEmailBase): rfc_printable_ascii = bytes(range(33, 127)).decode('ascii') rfc_dtext_chars = rfc_printable_ascii.translate(str.maketrans('','',r'\[]')) + # XXX POSTDEP: delete from here... 
+ # # _wsp_splitter @params def test__wsp_splitter(self, s, res): - self.assertEqual(parser._wsp_splitter(s, 1), res) + self.assertEqual(parser._deprecated__wsp_splitter(s, 1), res) params_test__wsp_splitter = Params( one_word = C('foo', ['foo']), @@ -636,6 +639,8 @@ def test__wsp_splitter(self, s, res): ws_runs = C('foo \t def jik', ['foo', ' \t ', 'def jik']), ) + # XXX POSTDEP: ...to here + # _make_xtext From 02133dba050adae04245a003b7394e81053821b2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 7 Jan 2026 10:23:23 -0500 Subject: [PATCH 129/152] Start adding a content_getter helper. content_getter will be a more general version of get_unstructured. It does not correspond to any of the RFC BNF entities. While the RFC only allows for unstructured text in limited places, in practice we (and other email parsers) generously decode encoded words in various other contexts. This new function will support that. To start with we want this to support decoding encoded words inside quoted strings, per the bugfix comment on one of the tests. This changeset adds a copy of get_unstructured under the new name, as well as some of the test framework for it with no active tests, to make the diff to following changeset with the actual implementation more useful. It makes it a bit clearer how content_getter is a generalization of the parse_unstructured code. 
--- Lib/email/_header_value_parser.py | 37 +++++++++++++++++++ .../test_email/test__header_value_parser.py | 37 +++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 41eaacfee72d55c..63a6b07813dac0a 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1344,6 +1344,43 @@ def get_encoded_word(value, start, terminal_type): ew.append(t) return ew, ew_match.end() +pre_ew_re = re.compile(rf'[^{_WSP + '='}]*') +def content_getter(value): + tl = UnstructuredTokenList() + start, vlen = 0, len(value) + while start < vlen: + if value[start] in WSP: + token, start = get_fws(value, start) + tl.append(token) + continue + ew = None + m = pre_ew_re.match(value, start) + end = m.end() + if end < vlen: + if value[end] == '=': + res = get_encoded_word(value, end, 'utext') + if res: + ew, end = res + else: + m = _non_wsp_re.match(value, start) + ew, end = None, m.end() + text = m.group() + # At this point we have text, an ew, or both; we can't have neither. 
+ if tl and tl[-1].token_type == 'encoded-word': + tl.defects.append(_MissingWhitespaceAfterEWDefect) + if text: + tl.append(_make_xtext(text, ValueTerminal, 'utext')) + if ew: + if tl: + if tl[-1].token_type == 'fws': + if len(tl) > 1 and tl[-2].token_type == 'encoded-word': + tl[-1] = EWWhiteSpaceTerminal(tl[-1], 'fws') + else: + tl.defects.append(_MissingWhitespaceBeforeEWDefect) + tl.append(ew) + start = end + return tl + pre_ew_re = re.compile(rf'[^{_WSP + '='}]*') def parse_unstructured(value): """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e4006007458f603..0bb00adf49606f0 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1278,6 +1278,43 @@ def test_get_encoded_word_old_api_supports_keywords(self): ) + # content_getter + + @params + def test_content_getter( + self, + s, + *args, + start=0, + tl_class=parser.TokenList, + text_type='ttext', + end_chars='', + qp=False, + ew_err=None, + **kw, + ): + result = self._test_parse( + parser.content_getter( + tl_class, + text_type, + end_chars=end_chars, + qp=qp, + ew_err=ew_err, + ), + C(s, start), + *args, + test_start=False, + **kw, + ) + if 'exception' in kw: + return + self.assertIsInstance(result, tl_class) + self.verify_terminal_types(result, text_type, 'fws') + + params_test_content_getter = Params( + ) + + # parse_unstructured @params From 76fbdf1c65f1dee6f87915c74dd4d3138cadfa9e Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 8 May 2026 09:38:13 -0400 Subject: [PATCH 130/152] Implement content_getter. content_getter of necessity passes all the tests that were written for get_unstructured, and then there are the additional ones to exercise the start, tl_type, text_type, end_chars, and qp keywords, plus test changes to test ew_indexes. content_getter is in many ways the heart of this refactor. 
It abstracts the lowest level of the parsing and makes it more consistent across RFC token types. It centralizes the handling of encoded words, which will allow us to decode them in more places (for better or worse). It also makes that handling much more efficient than the old code. As the generic workhorse method for the parser, content_getter knows almost nothing about the syntactic units or their requirements. Thus we tell it what token type list to use, what text type to wrap the terminals in, what characters are the stop characters, and whether or not quoted printables are to be decoded. And it tracks the encoded words that it decodes, recording their indexes in the input value. This last is because encoded words can be defects if they appear in various syntactic units, when we've decided that we're going to decode them anyway (which is most places). Only the higher level parsing functions can know if they are defects, so content_getter supplies a list of indexes that the higher level routines can use to generate defects. Right now a boolean flag would be enough, but eventually the defects will contain a pointer to the location of the defect in the value; tracking the index allows us to support that for EW defects. In order to make it efficient for higher level functions to check for ew defects, this commit also adds support for copying the index list to each higher level token as the token lists are constructed. Otherwise the higher level functions would have to walk the parse tree for every token they process, whether those tokens had encoded words or not. We use a few cycles doing the copy as we go to avoid using a lot more cycles every time we process a token where encoded words are a defect. Getting all the new tests to pass required an addition to the get_encoded_word API: a 'decode_qp' keyword to allow telling it to decode quoted printables in the payload before decoding. 
content_getter also passes the _get_ptext_to_endchars tests (per the comment on those tests, since it is the replacement for that function), with the exception of the flag that _get_ptext_to_endchars returns about whether there was qp present or not. That flag functionality is to support get_dtext, where obsolete dtext can contain quoted pairs but we need to register a defect. We won't be using content_getter for get_dtext, though, because we do *not* want to decode encoded words in domain literals for security reasons. We'll get the qp flag directly via _qp_unquote in get_dtext. In addition, one test is changed, because by default content_getter does *not* stop at whitespace (that's a big part of the point of it), while _get_ptext_to_endchars does. Only one test tested for that, and by moving it we maintain the test that whitespace *works* as endchars, which is functionality we will use. The new test position is a slightly less thorough test of the _get_ptext_to_endchars API, but since that API is deprecated that doesn't matter.
--- Lib/email/_header_value_parser.py | 111 +++- .../test_email/test__header_value_parser.py | 523 +++++++++++++----- 2 files changed, 490 insertions(+), 144 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 63a6b07813dac0a..0279f493957d27c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -74,7 +74,7 @@ from email import _encoded_words as _ew from email import errors from email import utils -from functools import wraps +from functools import partial, wraps # # Useful constants and functions @@ -139,6 +139,7 @@ class TokenList(list): def __init__(self, *args, **kw): super().__init__(*args, **kw) self.defects = [] + self.ew_indexes = [] def __str__(self): return ''.join(str(x) for x in self) @@ -147,10 +148,22 @@ def __repr__(self): return '{}({})'.format(self.__class__.__name__, super().__repr__()) + def append(self, value): + super().append(value) + if hasattr(value, 'ew_indexes'): + self.ew_indexes += value.ew_indexes + + def push(self, value): + super().insert(0, value) + if hasattr(value, 'ew_indexes'): + self.ew_indexes[:0] = value.ew_indexes + def extend(self, value): super().extend(value) if hasattr(value, 'defects'): self.defects.extend(value.defects) + if hasattr(value, 'ew_indexes'): + self.ew_indexes += value.ew_indexes @property def value(self): @@ -1291,7 +1304,7 @@ def dispatch(value, *args, **kw): _wsp_finder = re.compile(rf'[{_WSP}]+').search _non_wsp_re = _make_non_match_re(_WSP) @_deprecate_old_encoded_word_api -def get_encoded_word(value, start, terminal_type): +def get_encoded_word(value, start, terminal_type, *, decode_qp=False): """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" If something interpretable as an encoded word occurs starting at start, @@ -1302,6 +1315,9 @@ def get_encoded_word(value, start, terminal_type): defects for any any non-printable or invalid characters in the non-whitespace ValueTerminals. 
+ If decode_qp is True, decode any quoted pairs in the payload of the encoded + word before decoding. + If the characters starting at start are not interpretable as an encoded word such that it can be decoded from the content transfer encoding, return None. @@ -1316,6 +1332,7 @@ def get_encoded_word(value, start, terminal_type): charset, lang = cslang or csnolang or '', lang or '' ew.charset = charset.strip() ew.lang = lang.strip() + encoded, _ = _qp_unquote(encoded) if decode_qp else (encoded, 0) try: text, defects = _ew._decode(ew.charset, cte, encoded) except KeyError: @@ -1344,11 +1361,82 @@ def get_encoded_word(value, start, terminal_type): ew.append(t) return ew, ew_match.end() -pre_ew_re = re.compile(rf'[^{_WSP + '='}]*') -def content_getter(value): - tl = UnstructuredTokenList() - start, vlen = 0, len(value) +# In theory encoded words should only appear in certain places. In +# practice they tend to appear any where "normal text" tokens appear. This +# outside-the-rfc-grammar function-generator provides the tools to handle that. +_make_content_re = lambda s: re.compile(rf'[^{re.escape(s)}]*') +_make_qp_content_re = lambda s: re.compile( rf"([^{re.escape(s)}\\]|\\.)*") +_qp_finder = re.compile(r'\\(.)') +_qp_unquote = lambda s: _qp_finder.subn(r'\1', s) +def content_getter( + tl_class, + text_type, + end_chars='', + qp=False, + ): + """Return a function that can be used to parse up to certain end chars. + + The returned function has the following contract: + + new_function(value, start) + + Return a token list containing decoded text tokens and WSP. + + Process value from start until the first occurrence of any of the + characters in the iterable end_chars, breaking it up into whitespace and + non-whitespace tokens, and decoding encoded words wherever they are found + regardless of whitespace. Return the resulting list of tokens in an + instance of tl_type and then index of whichever end_char was found first + (or the len of value if none were found). 
Decoded encoded words should be + EncodedWord token lists, non-encoded word tokens should be of type + ValueTerminal with a token_type text_type, and whitespace tokens should be + WhiteSpaceTerminals or EWWhiteSpaceTerminals, as appropriate. + + Encoded word detection should take precedence over end_chars detection: an + end_char inside an encoded word should be treated as part of the encoded + word content rather than ending the processing. + + If qp is true, ignore end characters that are part of quoted pairs when + looking for the end of the parsable text, and unquote any quoted pairs in + the parsed text. + + if an encoded word is found, set the `has_ew` attribute of the returned + token list to `True`. + + """ + end_chars = ''.join(list(end_chars)) + if qp: + pre_ew_re = _make_qp_content_re(end_chars + _WSP + '=') + post_ew_re = _make_qp_content_re(end_chars + _WSP) + else: + pre_ew_re = _make_content_re(end_chars + _WSP + '=') + post_ew_re = _make_content_re(end_chars + _WSP) + return partial( + _get_content, + tl_class=tl_class, + text_type=text_type, + qp=qp, + end_chars=end_chars, + pre_ew_re=pre_ew_re, + post_ew_re=post_ew_re, + ) + +def _get_content( + value, + start=0, + *, + tl_class, + text_type, + pre_ew_re, + post_ew_re, + end_chars, + qp, + ): + tl = tl_class() + vlen = len(value) while start < vlen: + if value[start] in end_chars: + break if value[start] in WSP: token, start = get_fws(value, start) tl.append(token) @@ -1358,18 +1446,21 @@ def content_getter(value): end = m.end() if end < vlen: if value[end] == '=': - res = get_encoded_word(value, end, 'utext') + res = get_encoded_word(value, end, text_type, decode_qp=qp) if res: + # XXX save the index; some day the defects will use it + tl.ew_indexes.append(end) ew, end = res else: - m = _non_wsp_re.match(value, start) + m = post_ew_re.match(value, start) ew, end = None, m.end() text = m.group() # At this point we have text, an ew, or both; we can't have neither. 
if tl and tl[-1].token_type == 'encoded-word': tl.defects.append(_MissingWhitespaceAfterEWDefect) if text: - tl.append(_make_xtext(text, ValueTerminal, 'utext')) + text, _ = _qp_unquote(text) if qp else (text, 0) + tl.append(_make_xtext(text, ValueTerminal, text_type)) if ew: if tl: if tl[-1].token_type == 'fws': @@ -1379,7 +1470,7 @@ def content_getter(value): tl.defects.append(_MissingWhitespaceBeforeEWDefect) tl.append(ew) start = end - return tl + return tl, start pre_ew_re = re.compile(rf'[^{_WSP + '='}]*') def parse_unstructured(value): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 0bb00adf49606f0..b77ebc5b65b565a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -19,6 +19,7 @@ add_label, as_value, C, + for_each_name, include_unless, only, params, @@ -386,6 +387,33 @@ def test_extend_with_non_token_list_leaves_defects_unchanged(self): tl.extend(['fake', 'values']) self.assertEqual(tl.defects, defects) + for_each_method = for_each_name('append', 'extend', 'push') + + @params( + for_each_method( + none_none = C([], [] ), + one_none = C([1], [] ), + none_one = C([], [20] ), + one_one = C([1], [20] ), + two_two = C([1, 20], [27, 40] ), + ) + ) + def test_ew_indexes(self, method, existing, new): + expected = new + existing if method == 'push' else existing + new + tl1 = parser.TokenList() + tl1.ew_indexes = list(existing) + tl2 = parser.TokenList(['fake', 'values']) + tl2.ew_indexes = list(new) + getattr(tl1, method)(tl2) + self.assertEqual(tl1.ew_indexes, expected) + + @params(for_each_method(C([1, 20]))) + def test_non_token_list_leaves_ew_indexes_unchanged(self, method, idxs): + tl1 = parser.TokenList() + tl1.ew_indexes = idxs + getattr(tl1, method)(['fake', parser.Terminal('values', 'fake')]) + self.assertEqual(tl1.ew_indexes, idxs) + class TestTokens(TestEmailBase): @@ -438,6 +466,7 @@ def _test_parse( warnings=None, 
test_start=True, no_end=False, + ew_indexes=[], pprint=False, ): """Call method with callspec, make asserts, and return results of call. @@ -492,11 +521,15 @@ def _test_parse( Assert that the defects attribute of the returned object matches defects. + Assert that the ew_indexes attribute of the returned object matches + ew_indexes. + Return whatever the called method returned. """ s, *args = callspec.args base = s[:-len(remainder)] if remainder else s + prefix_len = 0 if test_start: # XXX I'm not at sure the overhead of this randomization is worth # it. We do at least need to test having a prefix though... @@ -552,6 +585,10 @@ def _test_parse( self.assertEqual(result.comments, comments) if commenttree is not None: self.assertEqual(self.ctree(result), commenttree) + self.assertEqual( + [x - prefix_len for x in result.ew_indexes], + ew_indexes, + ) return (result, *other) if other else result def verify_terminal_types(self, tl, *text_types): @@ -798,133 +835,6 @@ def test__get_xtext( ) - # _get_ptext_to_endchars - - # As an internal method these tests are not API requirements; however, the - # behavior they check must be verified one way or another, so if the - # implementation changes there need to be equivalent tests. - - @params - def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): - ptext, had_qp = self._test_parse( - parser._get_ptext_to_endchars, - C(s, endchars), - test_start=False, - **kw, - ) - self.assertEqual(had_qp, has_qp) - - @params_map - def for_each_endchar_set(*args, **kw): - # The function is general, but these are the ones we actually use. 
- endchar_sets = dict( - quoted_string='"', - comment='()', - domain_literal='[]', - ) - for name, endchars in endchar_sets.items(): - yield name, C(*args, endchars=endchars, **kw) - - @params_map - def for_each_endchar(*args, **kw): - return for_each_character(kw['endchars'])(C(*args, **kw)).items() - - # This params_map is used on exactly one expression, which has to contain a - # list of characters with no repeats. - @params_map - def stops_at_first_endchar_found(s): - for i in range(len(s)): - endchars = ''.join(sample((r := s[i:]), len(r))) - ec = charname(s[i]) - yield f'stops_at_first_endchar_found__string__{ec}', C( - s, - endchars=endchars, - remainder=r, - ) - yield f'stops_at_first_endchar_found__set__{ec}', C( - s, - endchars=set(endchars), - remainder=r, - ) - - params_test__get_ptext_to_endchars = Params( - - **for_each_endchar( - wsp_can_be_legal_endchars = C( - 'foo{char}bar"', - endchars='()' + RFC_WSP, - remainder='{char}bar"', - ), - ), - - **stops_at_first_endchar_found('(random?{})'), - - **for_each_endchar_set( - - one_word_no_wsp = C( - 'foo', - ), - - escaped_letter = C( - r'bar\s', - stringified='bars', - has_qp=True, - ), - - escaped_escape_char = C( - r'foo\\bar', - stringified=r'foo\bar', - has_qp=True, - ), - - any_printable_may_be_quoted = C( - ''.join(rf'\{c}' for c in RFC_PRINTABLES), - stringified=RFC_PRINTABLES, - has_qp=True, - ), - - ), - - **for_each_endchar( - for_each_endchar_set( - - stops_at_endchar = C( - 'foo{char}bar"', - remainder='{char}bar"', - ), - - quoted_endchar_no_actual_endchar = C( - r'foo\{char}bar', - stringified=r'foo{char}bar', - has_qp=True, - ), - - quoted_endchar_before_actual_endchar = C( - r'foo\{char}bar{char}', - stringified='foo{char}bar', - remainder='{char}', - has_qp=True, - ), - - multiple_qp = C( - r'\{char}\foo\\\{char}\a{char}', - stringified=r'{char}foo\{char}a', - remainder=r'{char}', - has_qp=True, - ), - - no_qp_before_endchar_but_some_after = C( - r'foo{char}a\b\a\r', - 
remainder=r'{char}a\b\a\r', - has_qp=False, - ), - - ), - ), - - ) - - # get_fws @params @@ -1002,6 +912,7 @@ def test_get_encoded_word( # terminal_type='ttext', prefix=None, expect_none=False, + decode_qp=False, **kw, ): # XXX POSTDEP: delete from here... @@ -1016,9 +927,10 @@ def test_get_encoded_word( else: terminal_type = terminal_type or 'ttext' callspec = C(s, terminal_type) + callspec.kw['decode_qp'] = decode_qp # XXX POSTDEP: ...to here # XXX POSTDEP: uncomment the following line: - #callspec = C(s, terminal_type) + #callspec = C(s, terminal_type, decode_qp=decode_qp) ew = self._test_parse(parser.get_encoded_word, callspec, *args, **kw) if 'exception' in kw: return @@ -1275,6 +1187,24 @@ def test_get_encoded_word_old_api_supports_keywords(self): terminal_type='test', ), + qp_true_no_qp = C( + r'=?us-ascii?q?test?=', + decode_qp=True, + stringified=r'test', + ), + + qp_true_with_qp = C( + r'=?us-ascii?q?tes\t?=', + decode_qp=True, + stringified='test', + ), + + qp_false_with_qp = C( + r'=?us-ascii?q?tes\t?=', + decode_qp=False, + stringified=r'tes\t', + ), + ) @@ -1290,7 +1220,6 @@ def test_content_getter( text_type='ttext', end_chars='', qp=False, - ew_err=None, **kw, ): result = self._test_parse( @@ -1299,7 +1228,6 @@ def test_content_getter( text_type, end_chars=end_chars, qp=qp, - ew_err=ew_err, ), C(s, start), *args, @@ -1311,14 +1239,304 @@ def test_content_getter( self.assertIsInstance(result, tl_class) self.verify_terminal_types(result, text_type, 'fws') + @params_map + def for_each_endchar_set(*args, **kw): + # The function is general, but these are the ones we actually use. 
+ endchar_sets = dict( + quoted_string='"', + comment='()', + domain_literal='[]', + ) + for name, endchars in endchar_sets.items(): + yield name, C(*args, endchars=endchars, **kw) + + @params_map + def for_each_endchar(*args, **kw): + return for_each_character(kw['endchars'])(C(*args, **kw)).items() + + # This params_map is used on exactly one expression, which has to contain a + # list of characters with no repeats. + @params_map + def stops_at_first_endchar_found(s): + for i in range(len(s)): + endchars = ''.join(sample((r := s[i:]), len(r))) + ec = charname(s[i]) + yield f'stops_at_first_endchar_found__string__{ec}', C( + s, + endchars=endchars, + remainder=r, + ) + yield f'stops_at_first_endchar_found__set__{ec}', C( + s, + endchars=set(endchars), + remainder=r, + ) + params_test_content_getter = Params( + + specified_tl_class = C( + 'word', + stringified='"word"', + value='word', + tl_class=parser.BareQuotedString, + ), + + text_type_ew = C( + 'A test =?UTF-8?q?foo?= ', + stringified='A test foo ', + text_type='fake', + ew_indexes = [7], + ), + + text_type_ew_missing_ws = C( + 'Never=?utf8?q?_foo_bar_?=do this', + stringified='Never foo bar do this', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + text_type='fake', + ew_indexes = [5], + ), + + text_type_no_ew_unicode = C( + 'A test Éric', + text_type='fake', + ), + + **for_each_character(ALL_ASCII)( + char_after_end_char = C( + '" a test "{char}', + start=1, + end_chars='"', + stringified=' a test ', + remainder='"{char}', + ), + ), + + start_in_middle_of_ew = C( + '=?UTF-8?q?foo?=', + start=3, + stringified='=?UTF-8?q?foo?='[3:], + ), + + end_in_middle_of_ew = C( + 'foo =?UTF-8?q?foo', + ), + + end_char = C( + '"foo"', + start=1, + end_chars='"', + stringified='foo', + remainder='"', + ), + + end_char_at_start = C( + '"foo"', + start=0, + end_chars='"', + stringified='', + remainder='"foo"', + ), + + no_end_char = C( + 'foo bar', + start=0, + end_chars='"', + 
stringified='foo bar', + ), + + end_char_inside_ew = C( + '"quoted =?UTF-8?q?q"?=" not', + start=1, + end_chars='"', + stringified='quoted q"', + remainder='" not', + ew_indexes = [8], + ), + + first_end_char_ends_parse = C( + "(a comment)bar", + start=1, + end_chars="()", + stringified="a comment", + remainder=')bar', + ), + + second_end_char_ends_parse = C( + "(a comment(nested))", + start=1, + end_chars="()", + stringified="a comment", + remainder='(nested))', + ), + + endchar_inside_ew_preserved = C( + r'"foo =?UTF-8?q?"bar?="', + start=1, + end_chars='"', + stringified='foo "bar', + remainder='"', + ew_indexes = [5], + ), + + qp_decoded_with_qp_true = C( + r"\fo\o", + qp=True, + stringified="foo", + ), + + qp_quoted_endchar_preserved_with_qp_true = C( + r'"foo\"bar"', + start=1, + end_chars='"', + qp=True, + stringified='foo"bar', + remainder='"', + ), + + qp_quoted_endchar_inside_ew_preserved_and_unquoted_with_qp_true = C( + r'"\foo =?UTF-8?q?\"bar?="', + start=1, + end_chars='"', + qp=True, + stringified='foo "bar', + remainder='"', + ew_indexes = [6], + ), + + qp_remains_quoted_if_qp_false = C( + r'"\foo\ =?UTF-8?q?\"bar?="', + start=1, + end_chars='"', + stringified=r'\foo\ \"bar', + qp=False, + remainder='"', + ew_indexes = [7], + ), + + ) + + + # _get_ptext_to_endchars + + # These tests are also passed by the replacement function, content_getter. 
+ + @params + def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): + ptext, had_qp = self._test_parse( + parser._get_ptext_to_endchars, + C(s, endchars), + test_start=False, + **kw, + ) + self.assertEqual(had_qp, has_qp) + + params_test__get_ptext_to_endchars = Params( + + **for_each_endchar( + wsp_can_be_legal_endchars = C( + 'foo{char}bar"', + endchars='()' + RFC_WSP, + remainder='{char}bar"', + ), + ), + + **stops_at_first_endchar_found('(random?{})'), + + **for_each_endchar_set( + + one_word_no_wsp = C( + 'foo', + ), + + escaped_letter = C( + r'bar\s', + stringified='bars', + has_qp=True, + ), + + escaped_escape_char = C( + r'foo\\bar', + stringified=r'foo\bar', + has_qp=True, + ), + + any_printable_may_be_quoted = C( + ''.join(rf'\{c}' for c in RFC_PRINTABLES), + stringified=RFC_PRINTABLES, + has_qp=True, + ), + + ), + + **for_each_endchar( + for_each_endchar_set( + + stops_at_endchar = C( + 'foo{char}bar"', + remainder='{char}bar"', + ), + + quoted_endchar_no_actual_endchar = C( + r'foo\{char}bar', + stringified=r'foo{char}bar', + has_qp=True, + ), + + quoted_endchar_before_actual_endchar = C( + r'foo\{char}bar{char}', + stringified='foo{char}bar', + remainder='{char}', + has_qp=True, + ), + + multiple_qp = C( + r'\{char}\foo\\\{char}\a{char}', + stringified=r'{char}foo\{char}a', + remainder=r'{char}', + has_qp=True, + ), + + no_qp_before_endchar_but_some_after = C( + r'foo{char}a\b\a\r', + remainder=r'{char}a\b\a\r', + has_qp=False, + ), + + ), + ), + + ) + + # As the replacement function for _get_ptext_to_endchars (among other + # things) content_getter needs to pass the _get_ptext_to_endchars tests, + # which test somewhat different scenarios than the other content_getter + # tests. + @params_map + def adapt_ptext_tests_for_content_getter(*args, **kw): + endchars = kw.pop('endchars') + if 'has_qp' in kw: + # has_qp is intended to test the return flag as to whether qp was + # present. 
But content_getter doesn't return such a flag...that + # functionality will be handled via _qp_unquote directly. So just + # set qp=True so the qp will be decoded like _get_ptext_to_endchars + # does so the tests pass. + kw['qp'] = kw.pop('has_qp') + yield '', C(*args, end_chars=endchars, **kw) + + params_test_content_getter.update( + adapt_ptext_tests_for_content_getter(params_test__get_ptext_to_endchars) ) # parse_unstructured @params - def test_parse_unstructured(self, s, *args, **kw): + # XXX XXX Ignore ew_indexes until after get_unstructured is refactored. + def test_parse_unstructured(self, s, *args, ew_indexes=None, **kw): + # We ignore kw_indexes, that's for content_getter. result = self._test_parse( parser.parse_unstructured, C(s), @@ -1332,7 +1550,8 @@ def test_parse_unstructured(self, s, *args, **kw): # XXX POSTDEP: delete from here... @params - def test_get_unstructured(self, s, *args, **kw): + # XXX XXX Ignore ew_indexes until after get_unstructured is refactored. + def test_get_unstructured(self, s, *args, ew_indexes=None, **kw): result = self._test_parse( parser.get_unstructured, C(s), @@ -1364,6 +1583,10 @@ def adapt_get_encoded_word_tests_for_parse_unstructured(nl, *args, **kw): # or might contain something it would treat as a defect. Either # way, parse_unstructured isn't expected to handle those parameters. return + if kw.pop('decode_qp', False): + # parse_unstructured does not unquote quoted printables, so skip + # the tests where they are decoded. + return if 'stringified' in kw: stringified = kw['stringified'] kw['stringified'] = stringified + remainder @@ -1388,7 +1611,8 @@ def adapt_get_encoded_word_invalid_input_for_parse_unstructured(nl, s, **kw): def add_unstructured_prefix_and_suffix(s, *args, **kw): # Make sure the reused parameters are correctly interpreted when # intermixed with other text by adding some text. 
- pad = lambda s: f'pre fix {s} suf fix' + prefix = 'pre fix ' + pad = lambda s: f'{prefix}{s} suf fix' if not s: # null value is a special case, and we already have a test for it. return @@ -1396,7 +1620,10 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): kw = {n: (pad(v) if n in ('stringified', 'value') else v) for n, v in kw.items() } - yield '', C(s, *args, **kw) + ew_indexes, len_prefix = [], len(prefix) + if s != kw.get('stringified', s): + ew_indexes = [len_prefix] + yield '', C(s, *args, ew_indexes=ew_indexes, **kw) # XXX POSTDEP: remove 'params_test_get_unstructured' from next line. params_test_get_unstructured = params_test_parse_unstructured = Params( @@ -1446,61 +1673,72 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): '=?us-ascii?q?bar?=', stringified='bar', value='bar', + ew_indexes = [0], ), one_ew_trailing_ws = C( '=?us-ascii?q?bar?= ', stringified='bar ', value='bar ', + ew_indexes = [0], ), one_valid_ew_trailing_text = C( '=?us-ascii?q?bar?= bird', stringified='bar bird', + ew_indexes = [0], ), phrase_with_ew_in_middle_of_text = C( 'foo =?us-ascii?q?bar?= bird', stringified='foo bar bird', + ew_indexes = [4], ), phrase_with_two_ew = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?=', stringified='foo barbird', + ew_indexes = [4, 23], ), phrase_with_two_ew_trailing_ws = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?= ', stringified='foo barbird ', value='foo barbird ', + ew_indexes = [4, 23], ), phrase_with_ew_with_leading_ws = C( ' =?us-ascii?q?bar?=', stringified=' bar', value=' bar', + ew_indexes = [2], ), phrase_with_two_ew_extra_ws = C( 'foo =?us-ascii?q?bar?= \t =?us-ascii?q?bird?=', stringified='foo barbird', + ew_indexes = [4, 26], ), two_ew_extra_ws_trailing_text = C( '=?us-ascii?q?test?= =?us-ascii?q?foo?= val', stringified='testfoo val', value='testfoo val', + ew_indexes = [0, 22], ), ew_with_internal_ws = C( '=?iso-8859-1?q?hello=20world?=', stringified='hello world', + ew_indexes = [0], ), ew_with_internal_leading_ws = 
C( ' =?us-ascii?q?=20test?= =?us-ascii?q?=20foo?= val', stringified=' test foo val', value=' test foo val', + ew_indexes = [3, 28], ), invalid_ew = C( @@ -1523,6 +1761,7 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): undecodable_bytes_defect, (undecodable_bytes_in_ew_defect, 'us-ascii'), ], + ew_indexes = [0, 25], ), @@ -1533,18 +1772,21 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): missing_whitespace_after_ew_defect, missing_whitespace_before_ew_defect, ], + ew_indexes = [0, 15], ), ew_without_leading_whitespace = C( 'nowhitespace=?utf-8?q?somevalue?=', stringified='nowhitespacesomevalue', defects=[missing_whitespace_before_ew_defect], + ew_indexes = [12], ), ew_without_trailing_whitespace = C( '=?utf-8?q?somevalue?=nowhitespace', stringified='somevaluenowhitespace', defects=[missing_whitespace_after_ew_defect], + ew_indexes = [0], ), # bpo-37764 @@ -1552,6 +1794,7 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): '=?utf-8?q?somevalue?=aa', stringified='somevalueaa', defects=[missing_whitespace_after_ew_defect], + ew_indexes = [0], ), # Although this is technically invalid (unencoded =) we handle it anyway @@ -1559,12 +1802,14 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): invalid_ew2 = C( '=?utf-8?q?=somevalue?=', '=somevalue', + ew_indexes = [0], ), **for_each_character(RFC_PRINTABLES)( printable_around_and_between_ews = C( '{char} =?utf-8?q?foo?= {char} =?utf-8?q?bar?= {char}', stringified='{char} foo {char} bar {char}', + ew_indexes = [2, 20], ), ), @@ -1572,6 +1817,7 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): printable_inside_ews = C( '=?utf-8?q?rock{char}?= =?utf-8?q?{char}hard_place?=', stringified='rock{char}{char}hard place', + ew_indexes = [0, 18], ), ), @@ -1588,6 +1834,7 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): '=?utf-8?q?some{char}?= text', stringified='some{char} text', defects=[(nonprintable_defect, '{char}')], + ew_indexes = [0], ), ), @@ -1604,15 +1851,23 @@ def 
add_unstructured_prefix_and_suffix(s, *args, **kw): 'a =?invalid?q?=C3=89ric?= b', stringified='a \udcc3\udc89ric b', defects=[charset_defect('invalid'), undecodable_bytes_defect], + ew_indexes = [2], ), ew_start_chrome_before_real_ew = C( 'z=?xx =?UTF-8?Q?foo?=', stringified='z=?xx foo', + ew_indexes = [6], ), ) + # content_getter and parse_unstructured must behave identically for all the + # data parse_unstructured handles. + params_test_content_getter__with_parse_unstructured_params = ( + params_test_parse_unstructured + ) + # get_qp_ctext From a6a9d3165b560eb616883857a677f3d263c76325 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Thu, 19 Mar 2026 19:25:04 -0400 Subject: [PATCH 131/152] Deprecate _get_ptext_to_endchars. We delete one of the _get_ptext_to_endchars tests that was checking for the correct non-detecting of quoted printables that came after the endchar because that test isn't relevant to content_getter. As noted that functionality will be handled directly by _qp_unquote. The fact that there is one less test for _get_ptext_to_endchars doesn't really matter since before this PR there were non at all. --- Lib/email/_header_value_parser.py | 1 + .../test_email/test__header_value_parser.py | 63 ++++++++----------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 0279f493957d27c..4645d0d546a547f 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1191,6 +1191,7 @@ def _get_xtext(value, start, regex, terminal_class, token_type, err=None): raise err return _make_xtext(m.group(), terminal_class, token_type), m.end() +@_deprecate('content_getter') def _get_ptext_to_endchars(value, endchars): """Scan printables/quoted-pairs until endchars and return unquoted ptext. 
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b77ebc5b65b565a..df9112580986ecd 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1247,28 +1247,28 @@ def for_each_endchar_set(*args, **kw): comment='()', domain_literal='[]', ) - for name, endchars in endchar_sets.items(): - yield name, C(*args, endchars=endchars, **kw) + for name, end_chars in endchar_sets.items(): + yield name, C(*args, end_chars=end_chars, **kw) @params_map def for_each_endchar(*args, **kw): - return for_each_character(kw['endchars'])(C(*args, **kw)).items() + return for_each_character(kw['end_chars'])(C(*args, **kw)).items() # This params_map is used on exactly one expression, which has to contain a # list of characters with no repeats. @params_map def stops_at_first_endchar_found(s): for i in range(len(s)): - endchars = ''.join(sample((r := s[i:]), len(r))) + end_chars = ''.join(sample((r := s[i:]), len(r))) ec = charname(s[i]) yield f'stops_at_first_endchar_found__string__{ec}', C( s, - endchars=endchars, + end_chars=end_chars, remainder=r, ) yield f'stops_at_first_endchar_found__set__{ec}', C( s, - endchars=set(endchars), + end_chars=set(end_chars), remainder=r, ) @@ -1416,6 +1416,8 @@ def stops_at_first_endchar_found(s): ew_indexes = [7], ), + # XXX POSTDEP: delete from here... + ) @@ -1424,21 +1426,26 @@ def stops_at_first_endchar_found(s): # These tests are also passed by the replacement function, content_getter. 
@params - def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): + def test__get_ptext_to_endchars(self, s, end_chars, qp=False, **kw): ptext, had_qp = self._test_parse( parser._get_ptext_to_endchars, - C(s, endchars), + C(s, end_chars), + warnings=[ + (DeprecationWarning, '.*deprecated.*content_getter'), + ], test_start=False, **kw, ) - self.assertEqual(had_qp, has_qp) + self.assertEqual(had_qp, qp) params_test__get_ptext_to_endchars = Params( + # XXX POSTDEP: ...to here + **for_each_endchar( wsp_can_be_legal_endchars = C( 'foo{char}bar"', - endchars='()' + RFC_WSP, + end_chars='()' + RFC_WSP, remainder='{char}bar"', ), ), @@ -1454,19 +1461,19 @@ def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): escaped_letter = C( r'bar\s', stringified='bars', - has_qp=True, + qp=True, ), escaped_escape_char = C( r'foo\\bar', stringified=r'foo\bar', - has_qp=True, + qp=True, ), any_printable_may_be_quoted = C( ''.join(rf'\{c}' for c in RFC_PRINTABLES), stringified=RFC_PRINTABLES, - has_qp=True, + qp=True, ), ), @@ -1482,27 +1489,21 @@ def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): quoted_endchar_no_actual_endchar = C( r'foo\{char}bar', stringified=r'foo{char}bar', - has_qp=True, + qp=True, ), quoted_endchar_before_actual_endchar = C( r'foo\{char}bar{char}', stringified='foo{char}bar', remainder='{char}', - has_qp=True, + qp=True, ), multiple_qp = C( r'\{char}\foo\\\{char}\a{char}', stringified=r'{char}foo\{char}a', remainder=r'{char}', - has_qp=True, - ), - - no_qp_before_endchar_but_some_after = C( - r'foo{char}a\b\a\r', - remainder=r'{char}a\b\a\r', - has_qp=False, + qp=True, ), ), @@ -1510,25 +1511,13 @@ def test__get_ptext_to_endchars(self, s, endchars, has_qp=False, **kw): ) + # XXX POSTDEP: delete from here... 
# As the replacement function for _get_ptext_to_endchars (among other # things) content_getter needs to pass the _get_ptext_to_endchars tests, # which test somewhat different scenarios than the other content_getter # tests. - @params_map - def adapt_ptext_tests_for_content_getter(*args, **kw): - endchars = kw.pop('endchars') - if 'has_qp' in kw: - # has_qp is intended to test the return flag as to whether qp was - # present. But content_getter doesn't return such a flag...that - # functionality will be handled via _qp_unquote directly. So just - # set qp=True so the qp will be decoded like _get_ptext_to_endchars - # does so the tests pass. - kw['qp'] = kw.pop('has_qp') - yield '', C(*args, end_chars=endchars, **kw) - - params_test_content_getter.update( - adapt_ptext_tests_for_content_getter(params_test__get_ptext_to_endchars) - ) + params_test_content_getter.update(params_test__get_ptext_to_endchars) + # XXX POSTDEP: ...to here # parse_unstructured From ce077d48c0a9b5c0309ad8223997d041f805aa4e Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 8 May 2026 09:45:11 -0400 Subject: [PATCH 132/152] Refactor parse_unstructured to use content_getter. --- Lib/email/_header_value_parser.py | 44 +++---------------- .../test_email/test__header_value_parser.py | 6 +-- 2 files changed, 8 insertions(+), 42 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 4645d0d546a547f..b231be87544cabd 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1473,7 +1473,7 @@ def _get_content( start = end return tl, start -pre_ew_re = re.compile(rf'[^{_WSP + '='}]*') +_get_unstructured_content = content_getter(UnstructuredTokenList, 'utext') def parse_unstructured(value): """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS) @@ -1489,43 +1489,11 @@ def parse_unstructured(value): should have the token_type 'utext'. 
""" - # We don't actually accept LF or CR, we treat them as a non-printable - # defect. This is because the parser is designed to process strings where - # unfolding has already been done, pre-handling legal CR/LF characters. - tl = UnstructuredTokenList() - start, vlen = 0, len(value) - while start < vlen: - if value[start] in WSP: - token, start = get_fws(value, start) - tl.append(token) - continue - ew = None - m = pre_ew_re.match(value, start) - end = m.end() - if end < vlen: - if value[end] == '=': - res = get_encoded_word(value, end, 'utext') - if res: - ew, end = res - else: - m = _non_wsp_re.match(value, start) - ew, end = None, m.end() - text = m.group() - # At this point we have text, an ew, or both; we can't have neither. - if tl and tl[-1].token_type == 'encoded-word': - tl.defects.append(_MissingWhitespaceAfterEWDefect) - if text: - tl.append(_make_xtext(text, ValueTerminal, 'utext')) - if ew: - if tl: - if tl[-1].token_type == 'fws': - if len(tl) > 1 and tl[-2].token_type == 'encoded-word': - tl[-1] = EWWhiteSpaceTerminal(tl[-1], 'fws') - else: - tl.defects.append(_MissingWhitespaceBeforeEWDefect) - tl.append(ew) - start = end - return tl + # We don't actually handle CR or LF in obs, instead we treat them as a + # non-printable defect. Normally they won't even appear in value, since + # the code that calls the parser will have done header unfolding. + unstructured, _ = _get_unstructured_content(value, 0) + return unstructured @_deprecate('parse_unstructured') def get_unstructured(value): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index df9112580986ecd..750007daf19c751 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1523,8 +1523,7 @@ def test__get_ptext_to_endchars(self, s, end_chars, qp=False, **kw): # parse_unstructured @params - # XXX XXX Ignore ew_indexes until after get_unstructured is refactored. 
- def test_parse_unstructured(self, s, *args, ew_indexes=None, **kw): + def test_parse_unstructured(self, s, *args, **kw): # We ignore kw_indexes, that's for content_getter. result = self._test_parse( parser.parse_unstructured, @@ -1539,8 +1538,7 @@ def test_parse_unstructured(self, s, *args, ew_indexes=None, **kw): # XXX POSTDEP: delete from here... @params - # XXX XXX Ignore ew_indexes until after get_unstructured is refactored. - def test_get_unstructured(self, s, *args, ew_indexes=None, **kw): + def test_get_unstructured(self, s, *args, **kw): result = self._test_parse( parser.get_unstructured, C(s), From e541d1cd1d9e746ef90b4070728cb799f4cc7a6e Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 6 Jun 2026 16:09:02 -0400 Subject: [PATCH 133/152] Add get_ccontent_sequence. This function is only RFC BNF adjacent, but is no further off than get_qp_ctext, which it will replace. Its tests add whitespace and ew support testing and reuses most of the tests from get_qp_ctext. --- Lib/email/_header_value_parser.py | 28 ++++ .../test_email/test__header_value_parser.py | 125 +++++++++++++++++- 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index b231be87544cabd..72f70eeaea8b229 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1499,6 +1499,34 @@ def parse_unstructured(value): def get_unstructured(value): return parse_unstructured(value) +_get_ccontent_content = content_getter( + TokenList, + 'ptext', + end_chars='()', + qp=True, + ) +def get_ccontent_sequence(value, start): + """ccontent_sequence = *([FWS] qp_ctext / encoded_word [FWS]) + + This bridges the RFC ctext, ccontent, and comment into something that + makes recovery from errors in the input easier. 
+ + Return a (possibly empty) TokenList containing all characters up to the + next unquoted open or close parenthesis outside of an encoded word (or the + end of value if there isn't one) and the index of that parenthesis (or the + len of value), unquoting any quoted pairs and decoding any encoded words. + All ValueTerminals returned should have the token_type 'ptext'. + + Encoded words should be decoded even if there is non-whitespace around + them, and whether or not they contain any RFC invalid whitespace. Register + defects for any internal or missing whitespace. + + Register defects if there are any non-printable or undecodable characters + in the non-whitespace tokens. + + """ + return _get_ccontent_content(value, start) + def get_qp_ctext(value): r"""ctext = diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 750007daf19c751..b003c7e182d497d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1856,6 +1856,115 @@ def add_unstructured_prefix_and_suffix(s, *args, **kw): ) + # get_ccontent_sequence + + @params + def test_get_ccontent_sequence(self, s, *args, **kw): + tl = self._test_parse( + parser.get_ccontent_sequence, + C(s), + *args, + **kw, + ) + self.assertIsInstance(tl, parser.TokenList) + self.verify_terminal_types(tl, 'ptext', 'fws') + + params_test_get_ccontent_sequence = Params( + + **for_each_character(RFC_WSP)( + two_words = C( + 'foo{char}de', + value='foo de', + ), + ), + + wsp_before_close_paren = C( + 'foo \t)', + value='foo ', + remainder=')', + ), + + up_to_open_paren_only = C( + 'foo(', + remainder='(', + ), + + wsp_before_open_paren = C( + 'foo \t(', + value='foo ', + remainder='(', + ), + + ew = C( + '=?UTF-8?q?test?=', + stringified='test', + ew_indexes=[0], + ), + + ws_around_ew = C( + ' =?UTF-8?q?test?= ', + stringified=' test ', + ew_indexes=[1], + ), + + ws_inside_ew = C( + '=?UTF-8?q? 
Test ?=', + stringified=' Test ', + defects=[whitespace_inside_ew_defect], + ew_indexes=[0], + ), + + non_ws_around_ew = C( + 'foo=?UTF-8?q?bar_?=bird', + stringified='foobar bird', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[3], + ), + + multiple_ew = C( + 'foo =?UTF-8?q?a?= =?UTF-8?q?t?=', + stringified='foo at', + ew_indexes=[4, 18], + ), + + ew_missing_whitespace_between_ews = C( + 'foo =?UTF-8?q?a?==?UTF-8?q?t?=', + stringified='foo at', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[4, 17], + ), + + **for_each_character(RFC_WSP)( + inter_ew_whitespace_handled_correctly = C( + '{char}=?UTF-8?q?_foo_?={char}{char}=?UTF-8?q?bar_?= ', + stringified='{char} foo bar ', + value=' foo bar ', + ew_indexes=[1, 20], + ), + ), + + qp_inside_ew = C( + r'=?UTF-8?q?\test\)_?= =?UTF-8?q?\(test?=', + stringified=r'test) (test', + ew_indexes=[0, 21], + ), + + unquoted_parens_inside_ew = C( + '=?UTF-8?q?test)_?= =?UTF-8?q?(test?=) foo', + stringified=r'test) (test', + remainder=') foo', + ew_indexes=[0, 19], + ), + + ) + + # get_qp_ctext @params @@ -1865,12 +1974,14 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): C(s), *args, value=value, + warnings=..., + test_start=False, **kw, ) self.assertIsInstance(ptext, parser.Terminal) self.assertEqual(ptext.token_type, 'ptext') - params_test_get_qp_ctext = old_api_only( + params_test_get_qp_ctext = Params( value_ends_at_input_end = C( 'foobar', @@ -1974,6 +2085,18 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): ) + params_test_get_ccontent_sequence.update( + + # get_ccontent_sequence is handling a superset of what get_qp_ctext + # used to handle. It should pass the get_qp_ctext tests that don't + # involve whitespace, which get_qp_ctext stops at. 
+ include_unless( + lambda n, *a, **k: 'wsp' in str(n) or 'two_words_gets_first' in n, + label='from_test_get_qp_ctext', + )(params_test_get_qp_ctext) + + ) + # get_qcontent From 2c3e1bb3bf8106acbd0262223baa0cff2d08c11f Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 6 Jun 2026 16:09:27 -0400 Subject: [PATCH 134/152] Have get_comment reuse the get_ccontent_sequence tests. This doesn't add a lot of tests, but it does mean that any tests added to get_ccontent_sequence will have to be consciously skipped if they don't pass get_comment. --- .../test_email/test__header_value_parser.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index b003c7e182d497d..ee2ad2bccb65460 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2415,8 +2415,36 @@ def test_get_comment(self, self.assertEqual(cmt.token_type, 'comment') self.verify_terminal_types(cmt, 'ptext', 'fws') + @params_map(with_name=True) + def adapt_get_ccontent_sequence_tests_for_get_comment( + name, + s, + *args, + stringified=None, + remainder='', + **kw, + ): + # get_comment parses parens, and quotes them differently in str, so + # tests involving parens in the test string won't pass here. + if '(' in s or ')' in s: + return + # XXX XXX (most) ew tests will work after get_comment is refactored. 
+ if 'ew' in name: + return + if stringified: + kw['comments'] = [stringified] + kw['stringified'] = f"({stringified})" + else: + kw['comments'] = [s] + kw.pop('value', None) + yield 'from_test_get_ccontent_sequence', C(f'({s})', *args, **kw) + params_test_get_comment = old_api_only( + adapt_get_ccontent_sequence_tests_for_get_comment( + params_test_get_ccontent_sequence, + ), + simple_comment_only = C( '(comment)', comments=['comment'], From 5e07f120fadb6cb5ac88b8034996c1024a19df0a Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 9 May 2026 10:07:25 -0400 Subject: [PATCH 135/152] Preliminary deprecation of get_qp_ctext. Reorganize the tests to put the ones involving whitespace together, so we can split those out in the next step. --- Lib/email/_header_value_parser.py | 1 + .../test_email/test__header_value_parser.py | 48 +++++++++++-------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 72f70eeaea8b229..6ce972a63a56dc8 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1527,6 +1527,7 @@ def get_ccontent_sequence(value, start): """ return _get_ccontent_content(value, start) +@_deprecate('get_ccontent_sequence') def get_qp_ctext(value): r"""ctext = diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index ee2ad2bccb65460..16c84096c9fbda3 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1965,6 +1965,8 @@ def test_get_ccontent_sequence(self, s, *args, **kw): ) + # XXX XXX add the POSTDEP comment after reorganizing the tests. 
+ # # get_qp_ctext @params @@ -1974,7 +1976,11 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): C(s), *args, value=value, - warnings=..., + warnings=[ + (DeprecationWarning, '.*deprecated.*get_ccontent_sequence'), + (DeprecationWarning, '.*ptext.*deprecated'), + (DeprecationWarning, '.*validate.*deprecated'), + ], test_start=False, **kw, ) @@ -1983,16 +1989,6 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): params_test_get_qp_ctext = Params( - value_ends_at_input_end = C( - 'foobar', - ), - - all_printables = C( - RFC_PRINTABLES. - replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)'), - stringified=RFC_PRINTABLES, - ), - two_words_gets_first = C( 'foo de', remainder=' de', @@ -2003,16 +1999,31 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): remainder=' \t\tde', ), - up_to_close_paren_only = C( - 'foo)', - remainder=')', - ), - wsp_before_close_paren_preserved = C( 'foo )', remainder=' )', ), + wsp_before_open_paren_preserved = C( + 'foo (', + remainder=' (', + ), + + value_ends_at_input_end = C( + 'foobar', + ), + + all_printables = C( + RFC_PRINTABLES. + replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)'), + stringified=RFC_PRINTABLES, + ), + + up_to_close_paren_only = C( + 'foo)', + remainder=')', + ), + close_paren_mid_word = C( 'foo)bar', remainder=')bar', @@ -2023,11 +2034,6 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): remainder='(', ), - wsp_before_open_paren_preserved = C( - 'foo (', - remainder=' (', - ), - open_paren_mid_word = C( 'foo(bar', remainder='(bar', From 016047cf73fc1845057bdf3e60cd18fc6640a661 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 9 May 2026 10:16:30 -0400 Subject: [PATCH 136/152] Set up easy POSTDEP removal for get_qp_ctext. 
--- .../test_email/test__header_value_parser.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 16c84096c9fbda3..7efc444b1e658b4 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1962,11 +1962,11 @@ def test_get_ccontent_sequence(self, s, *args, **kw): ew_indexes=[0, 19], ), + # XXX POSTDEP: delete from here... + ) - # XXX XXX add the POSTDEP comment after reorganizing the tests. - # # get_qp_ctext @params @@ -1987,7 +1987,7 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): self.assertIsInstance(ptext, parser.Terminal) self.assertEqual(ptext.token_type, 'ptext') - params_test_get_qp_ctext = Params( + params_test_get_qp_ctext__wsp_cases = Params( two_words_gets_first = C( 'foo de', @@ -2009,6 +2009,12 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): remainder=' (', ), + ) + + params_test_get_qp_ctext = Params( + + # XXX POSTDEP: ...to here. + value_ends_at_input_end = C( 'foobar', ), @@ -2091,17 +2097,14 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): ) + # XXX POSTDEP: delete from here... + # get_ccontent_sequence is handling a superset of what get_qp_ctext used to + # handle. It should pass this subset of get_qp_ctext tests that don't + # involve whitespace. params_test_get_ccontent_sequence.update( - - # get_ccontent_sequence is handling a superset of what get_qp_ctext - # used to handle. It should pass the get_qp_ctext tests that don't - # involve whitespace, which get_qp_ctext stops at. - include_unless( - lambda n, *a, **k: 'wsp' in str(n) or 'two_words_gets_first' in n, - label='from_test_get_qp_ctext', - )(params_test_get_qp_ctext) - + add_label('from_test_get_qp_ctext')(params_test_get_qp_ctext) ) + # XXX POSDEP: ...to here. 
# get_qcontent From cb1b9f46f1888bff4fe1bbfcc8cab1636d709e16 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 15 Jan 2026 16:55:24 -0500 Subject: [PATCH 137/152] Refactor get_bare_quoted_string. get_bare_quoted_string becomes the quoted-string equivalent of get_ccontent_sequence, implemented now via content_getter. It will replace get_qcontent the same way get_ccontent_sequence replaces get_qp_ctext. It is more closely aligned with the RFC BNF than get_ccontent_sequence, but now fully handles "dirty data" in the form of improperly encoded words. Unlike for get_ccontent_sequence, get_bare_quoted_string, while it is a functional replacement for get_qcontent, is not a *direct* replacement: semantically it is different in that it also handles the required leading and trailing quotes (or the absence of the latter). We therefore can't re-use the get_qcontent tests, but I have verified that there are equivalent tests for each of them in the existing get_bare_quoted_string tests. BUGFIX: The fix for bpo-16983 enabled decoding of encoded words inside quoted strings, where they are technically invalid. That fix did not handle the not uncommon case of there being whitespace missing before such encoded words, which is now fixed. BUGFIX: The fix for bpo-16983 incorrectly registered a missing whitespace defect if an encoded word ended just before the trailing quote in a quoted string. This defect is no longer registered for that case. This changeset also tweaks the defect message for encoded word inside a quoted string to use the RFC notation "encoded-word" instead of "encoded word". 
--- Lib/email/_header_value_parser.py | 88 +++++++++++-------- .../test_email/test__header_value_parser.py | 41 ++++++--- Lib/test/test_email/test_headerregistry.py | 5 +- 3 files changed, 80 insertions(+), 54 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 6ce972a63a56dc8..b6b59eb49fd3c67 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1576,47 +1576,61 @@ def get_atext(value): _validate_xtext(atext) return atext, value -def get_bare_quoted_string(value): +_get_bare_quoted_string_content = content_getter( + BareQuotedString, + 'ptext', + end_chars='"', + qp=True, + ) +@_deprecate_old_api +def get_bare_quoted_string(value, start): """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE - A quoted-string without the leading or trailing white space. Its - value is the text between the quote marks, with whitespace - preserved and quoted pairs decoded. + This is a subset of the RFC 5322 quoted-string: the quoted string without + any of the CFWS that might come before or after the '"'s. + + If start does not point to a double quote in value, raise an error. + Otherwise return a (possibly empty) BareQuotedString incorporating all + characters up to the next unquoted double quote (or the end of value if + there is no double quote) and the index of the character after the double + quote (or the len of value), unquoting any quoted pairs. The returned + BareQuotedString should not contain any ValueTerminals for the double quote + marks, but when stringified the quotes should be added, whether the + trailing quote was present in value or not. If the trailing quote is not + present register a defect. + + If the content after quoted pair decoding contains any RFC 2047 encoded + words, decode them, whether they are correctly bracketed by whitespace + or not, and whether they contain internal whitespace or not. 
Register + a defect for the presence of any such word, as well as defects for + any whitespace issues. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ - if not value or value[0] != '"': + # This implementation bypasses the RFC qcontent BNF element in favor of + # using our generic content_getter to decode (RFC invalid) encoded words. + vlen = len(value) + if start >= vlen or value[start] != '"': raise errors.HeaderParseError( - "expected '\"' but found '{}'".format(value)) - bare_quoted_string = BareQuotedString() - value = value[1:] - if value and value[0] == '"': - return bare_quoted_string, value[1:] - while value and value[0] != '"': - if value[0] in WSP: - token, value = get_fws(value) - elif value[:2] == '=?': - valid_ew = False - try: - token, value = get_encoded_word(value, terminal_type='ptext') - bare_quoted_string.defects.append(errors.InvalidHeaderDefect( - "encoded word inside quoted string")) - valid_ew = True - except errors.HeaderParseError: - token, value = get_qcontent(value) - # Collapse the whitespace between two encoded words that occur in a - # bare-quoted-string. - if valid_ew and len(bare_quoted_string) > 1: - if (bare_quoted_string[-1].token_type == 'fws' and - bare_quoted_string[-2].token_type == 'encoded-word'): - bare_quoted_string[-1] = EWWhiteSpaceTerminal( - bare_quoted_string[-1], 'fws') - else: - token, value = get_qcontent(value) - bare_quoted_string.append(token) - if not value: - bare_quoted_string.defects.append(errors.InvalidHeaderDefect( - "end of header inside quoted string")) - return bare_quoted_string, value - return bare_quoted_string, value[1:] + f"expected '\"' but found {value[start:]!r}" + ) + start += 1 + bare_quoted_string, start = _get_bare_quoted_string_content(value, start) + if bare_quoted_string.ew_indexes: + # XXX some day we'll put each index into its own defect. 
+ bare_quoted_string.defects.extend( + [ + errors.InvalidHeaderDefect('encoded-word inside quoted string'), + ] * len(bare_quoted_string.ew_indexes) + ) + if start < vlen: + return bare_quoted_string, start + 1 + bare_quoted_string.defects.append( + errors.InvalidHeaderDefect("end of header inside quoted string"), + ) + return bare_quoted_string, start def get_comment(value): """comment = "(" *([FWS] ccontent) [FWS] ")" diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 7efc444b1e658b4..e42c85c1db51668 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -128,7 +128,7 @@ def charset_defect(chars): ew_inside_quoted_string_defect = ( errors.InvalidHeaderDefect, - 'encoded word inside quoted string', + 'encoded-word inside quoted string', ) end_inside_comment_defect = ( @@ -2255,7 +2255,7 @@ def test_get_bare_quoted_string(self, s, *args, **kw): self.assertEqual(bqs.token_type, 'bare-quoted-string') self.verify_terminal_types(bqs, 'ptext', 'fws') - params_test_get_bare_quoted_string = old_api_only( + params_test_get_bare_quoted_string = for_each_api( non_ws = C( '"foo"', @@ -2351,22 +2351,21 @@ def test_get_bare_quoted_string(self, s, *args, **kw): '"=?utf-8?Q?not_really_valid?="', stringified='"not really valid"', value='not really valid', - defects=[ - ew_inside_quoted_string_defect, - missing_whitespace_after_ew_defect, - ], + defects=[ew_inside_quoted_string_defect], + ew_indexes=[1], ), - # XXX XXX The decode failure here will be fixed in the refactor. 
mixed_encoded_words_and_regular_text = C( '"This has=?utf-8?Q?multiple?= =?utf-8?q?errors?=in it', - stringified='"This has=?utf-8?Q?multiple?= errorsin it"', - value='This has=?utf-8?Q?multiple?= errorsin it', + stringified='"This hasmultipleerrorsin it"', + value='This hasmultipleerrorsin it', defects=[ - ew_inside_quoted_string_defect, + *[ew_inside_quoted_string_defect]*2, missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, end_inside_quoted_string_defect, ], + ew_indexes=[9, 30], ), encoded_word_after_dquote_with_no_ws = C( @@ -2383,6 +2382,7 @@ def test_get_bare_quoted_string(self, s, *args, **kw): ew_inside_quoted_string_defect, charset_defect('foo'), ], + ew_indexes=[1], ), empty = C( @@ -3595,9 +3595,11 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): defects=[ # XXX XXX After refactoring there should be one 'after' defect missing_whitespace_after_ew_defect, - missing_whitespace_after_ew_defect, ew_inside_quoted_string_defect, ], + # XXX XXX this will change during refactoring. Currently only + # get_bare_quoted_string is adding indexes. + ew_indexes=[1], ), ew_after_quoted_string_missing_space = C( @@ -3606,9 +3608,12 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value='disjointed', defects=[ # XXX XXX After refactoring 'after' should become 'before' - missing_whitespace_after_ew_defect, + #missing_whitespace_after_ew_defect, ew_inside_quoted_string_defect, ], + # XXX XXX this will change during refactoring. Currently only + # get_bare_quoted_string is adding indexes. + ew_indexes=[1], ), **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + '."')( @@ -3849,10 +3854,13 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): # XXX XXX There should be exactly one ew whitespace defect # here, but the number generated will change during refactor, # until it is fixed when get_obs_local_part is refactored. 
- *[missing_whitespace_after_ew_defect]*2, + missing_whitespace_after_ew_defect, missing_dot_in_local_part_defect, ew_inside_quoted_string_defect, ], + # XXX XXX this will change during refactoring. Currently only + # get_bare_quoted_string is adding indexes. + ew_indexes=[1], ), less_invalid_ew_atoms = C( @@ -3928,8 +3936,10 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): kw['local_part'] = kw.pop('content') yield '', C(*args, **kw) - @params_map + # XXX XXX revert to no with_namelist when get_local_part is refactored + @params_map(with_namelist=True) def adapt_get_obs_local_part_tests_for_get_local_part( + nl, *args, defects=[], **kw, @@ -3947,6 +3957,9 @@ def adapt_get_obs_local_part_tests_for_get_local_part( defects.append(not_even_obs_local_part_defect) else: defects.append(non_dot_atom_local_part_obs_defect) + # XXX XXX delete this fixup when get_local_part is refactored. + if 'invalid_ew_atoms' in nl: + kw.pop('ew_indexes') yield '', C(*args, defects=defects, **kw) params_test_get_local_part = old_api_only( diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index aa918255d15c37e..ef4d8f1d319fc6f 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -920,7 +920,7 @@ def content_disp_as_value(self, ' =?UTF-8?Q?pdf?="', 'attachment', {'filename': 'Schulbesuchsbestättigung.pdf'}, - [errors.InvalidHeaderDefect]*3, + [errors.InvalidHeaderDefect]*2, ('attachment; filename="Schulbesuchsbestättigung.pdf"'), ('Content-Disposition: attachment;\n' ' filename*=utf-8\'\'Schulbesuchsbest%C3%A4ttigung.pdf\n'), @@ -1242,8 +1242,7 @@ class TestAddressHeader(TestHeaderBase): 'rfc2047_atom_in_quoted_string_is_decoded': ('"=?utf-8?q?=C3=89ric?=" ', - [errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect], + [errors.InvalidHeaderDefect], 'Éric ', 'Éric', 'foo@example.com', From d5dff26bdc917a1d065f9329fec7537d819c2c73 Mon Sep 17 00:00:00 2001 From: R 
David Murray Date: Fri, 20 Mar 2026 15:19:04 -0400 Subject: [PATCH 138/152] Preliminary deprecation of get_qcontent. get_parameter uses it, but we'll fix that when we refactor that function. --- Lib/email/_header_value_parser.py | 1 + .../test_email/test__header_value_parser.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index b6b59eb49fd3c67..2ae6ff16816b655 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1545,6 +1545,7 @@ def get_qp_ctext(value): _validate_xtext(ptext) return ptext, value +@_deprecate('get_bare_quoted_string') def get_qcontent(value): """qcontent = qtext / quoted-pair diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e42c85c1db51668..9797b3606be01a4 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2107,15 +2107,28 @@ def test_get_qp_ctext(self, s, *args, value=' ', **kw): # XXX POSDEP: ...to here. + # XXX POSTDEP: delete from here... + # # get_qcontent @params def test_get_qcontent(self, s, *args, **kw): - ptext = self._test_parse(parser.get_qcontent, C(s), *args, **kw) + ptext = self._test_parse( + parser.get_qcontent, + C(s), + *args, + test_start=False, + warnings=[ + (DeprecationWarning, r".*deprecated.*get_bare_quoted_string"), + (DeprecationWarning, r".*ptext.*deprecated"), + (DeprecationWarning, r".*validate.*deprecated"), + ], + **kw, + ) self.assertIsInstance(ptext, parser.Terminal) self.assertEqual(ptext.token_type, 'ptext') - params_test_get_qcontent = old_api_only( + params_test_get_qcontent = Params( no_qp_no_end_char = C( 'foobar', @@ -2172,6 +2185,8 @@ def test_get_qcontent(self, s, *args, **kw): ) + # XXX POSTDEP: ...to here. 
+ # get_atext From 1f64c32fd14ec5f1462b410b1b97ab9356ca2acf Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 20 Mar 2026 16:14:19 -0400 Subject: [PATCH 139/152] Add get_atext_sequence. This will replace get_atext. It handles encoded words at a level that is more useful to our error-recovery parsing. We want to recognize encoded words almost everywhere they occur, which means looking for them at the atext level. This will become clearer when we refactor get_dot_atom_text, but the disabled ew tests should give a clue. --- Lib/email/_header_value_parser.py | 35 ++++++ .../test_email/test__header_value_parser.py | 118 ++++++++++++++++-- 2 files changed, 143 insertions(+), 10 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 2ae6ff16816b655..5a7cc0b666b98eb 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -83,8 +83,12 @@ # https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 _WSP = ' \t' WSP = set(_WSP) +# This isn't an RFC concept but is useful for parsing. CFWS_LEADER = WSP | set('(') +# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 SPECIALS = set(r'()<>@,:;.\"[]') +# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 +# These are the characters that *can't* appear in an atom/dot-atom (non-atext). ATOM_ENDS = SPECIALS | WSP DOT_ATOM_ENDS = ATOM_ENDS - set('.') # '.', '"', and '(' do not end phrases in order to support obs-phrase @@ -1561,6 +1565,37 @@ def get_qcontent(value): _validate_xtext(ptext) return ptext, value +_get_atext_content = content_getter(TokenList, 'atext', end_chars=ATOM_ENDS) +def get_atext_sequence(value, start): + """atext = Printable US-ASCII characters not including specials + + This augments the RFC atext by handling encoded words at a level that makes + it easier to recover from errors in the input. 
+ + Return a TokenList containing all characters up to the next special or WSP + outside of an encoded word (or the end of value), and the index of the + special or WSP (or the len of value), decoding any encoded words. + + Raise a HeaderParseError if no characters are found before the special, + WSP, or end of value. + + Encoded words should be decoded even if there is non-whitespace around + them, and whether or not they contain any RFC invalid whitespace. Register + internal or missing whitespace defects. + + Register defects if there are any non-printable or undecodable characters + in the non-whitespace tokens. + + All ValueTerminals returned should have the type 'atext'. + + """ + atext, end = _get_atext_content(value, start) + if not atext: + raise errors.HeaderParseError( + f"expected atext but found {value[start:]!r}", + ) + return atext, end + def get_atext(value): """atext = diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 9797b3606be01a4..f179e6266a9d3e9 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2188,17 +2188,92 @@ def test_get_qcontent(self, s, *args, **kw): # XXX POSTDEP: ...to here. + # get_atext_sequence + + @params + def test_get_atext_sequence(self, s, *args, **kw): + tl = self._test_parse(parser.get_atext_sequence, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(tl, parser.TokenList) + # There can be fws inside the encoded words. + self.verify_terminal_types(tl, 'atext', 'fws') + + params_test_get_atext_sequence = Params( + + ew_only = C( + '=?utf-8?q?=20bob?=', + stringified=' bob', + ew_indexes=[0], + ), + + # get_atext_sequence doesn't add a missing whitespace error here even + # though the RFC requires one before the special, because adding that + # defect is handled at the next level up in the parser. + # XXX Ideally this should have a defect for the specials. 
+ **for_each_character(RFC_SPECIALS)( + ew_with_unencoded_special = C( + '=?UTF-8?q?bob{char}?=@foo', + stringified='bob{char}', + remainder='@foo', + ew_indexes=[0], + ), + ), + + ew_after_atom_no_ws = C( + 'foo@=?UTF-8?q?bob?=', + value='foo', + remainder='@=?UTF-8?q?bob?=', + ), + + multiple_ew_no_ws = C( + '=?UTF-8?q?foo?==?UTF-8?q?bar?=', + stringified='foobar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], + ), + + ew_in_middle_of_atext = C( + 'foo{=?UTF-8?q?foo?=}{=?UTF-8?q?bar?=}bar', + stringified='foo{foo}{bar}bar', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[4, 21], + ), + + all_non_special_printables_are_allowed = C( + f'{"".join(set(RFC_PRINTABLES) - set(RFC_SPECIALS))}@', + remainder='@', + ), + + ) + + # get_atext @params def test_get_atext(self, s, *args, **kw): - atext = self._test_parse(parser.get_atext, C(s), *args, **kw) + atext = self._test_parse( + parser.get_atext, + C(s), + *args, + warnings=..., + test_start=False, + **kw, + ) if 'exception' in kw: return self.assertIsInstance(atext, parser.Terminal) self.assertEqual(atext.token_type, 'atext') - params_test_get_atext = old_api_only( + params_test_get_atext = Params( only = C( 'foobar', @@ -2253,6 +2328,19 @@ def test_get_atext(self, s, *args, **kw): ) + # This params_map deals with the fact that get_atext doesn't call repr + # on value in the exception message, but get_atext_sequence does. + @params_map(with_namelist=True) + def atext_repr_fixup(nl, *args, **kw): + if nl.has_all('no_atext_before_special_or_wsp', 'HT'): + kw['exception'] = (kw['exception'][0], re.escape('\\tfoo')) + yield '', C(*args, **kw) + + # get_atext_sequence needs to pass all the get_atext tests. 
+ params_test_get_atext_sequence.update( + atext_repr_fixup(params_test_get_atext) + ) + # get_bare_quoted_string @@ -2994,14 +3082,16 @@ def adapt_get_cfws_tests_for_get_atom( adapt_get_cfws_tests_for_get_atom(params_test_get_cfws), - # get_atom should pass all the get_atext tests except for those + # get_atom should pass all the get_atext_sequence tests except for those # involving leading or trailing whitespace. include_unless( lambda n, s, *a, remainder='', **k: s.startswith(tuple(CFWS_LEADER)) + # XXX XXX disable the ew tests until get_atom is refactored + or 'ew_' in str(n) or remainder.startswith(tuple(CFWS_LEADER)), - label='from_test_get_atext', - )(params_test_get_atext), + label='from_test_get_atext_sequence', + )(params_test_get_atext_sequence), with_wsp = C( '\t bob ', @@ -3180,11 +3270,15 @@ def test_get_dot_atom_text(self, s, *args, **kw): params_test_get_dot_atom_text = old_api_only( # a bare atext is valid in a dot-atom, so we should pass all the - # get_atext tests except the ones involving the dot. + # get_atext_sequence tests except the ones involving the dot. include_unless( - lambda n, *a, **k: 'full_stop' in n, + lambda n, *a, **k: 'full_stop' in n + # XXX XXX disable ew tests until get_dot_atom_text refactored + or 'ew_' in str(n) + # XXX XXX disable the test involving an escaped repr likewise. + or n.has_all('no_atext_before_special_or_wsp', 'HT'), label='from_test_get_atext', - )(params_test_get_atext), + )(params_test_get_atext_sequence), only = C( 'foo.bar.bang', @@ -3258,7 +3352,9 @@ def test_get_dot_atom(self, s, *args, **kw): # Atom is a subset of dot atom, so get_dot_atom should pass any # get_atom test except those involving the dot (full_stop). 
include_unless( - lambda n, *a, **k: 'full_stop' in n, + lambda n, *a, **k: 'full_stop' in n + # XXX XXX disable the ew tests until get_dot_atom is refactored + or 'ew_' in str(n), label='from_test_get_atom', )(params_test_get_atom), @@ -3425,7 +3521,9 @@ def adapt_get_quoted_string_tests_for_get_word(*args, **kw): 'no_atom_before_special', 'no_atext_before_special_or_wsp', ) - and 'quotation_mark' in n, + and 'quotation_mark' in n + # XXX XXX disable the ew tests until get_word is refactored + or 'ew_' in str(n), label='from_test_get_atom', )(params_test_get_atom), ), From 9103f273a646e8a0530c6ea43d900d38aee4c424 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 20 Mar 2026 16:14:55 -0400 Subject: [PATCH 140/152] Preliminary deprecation of get_atext. --- Lib/email/_header_value_parser.py | 1 + Lib/test/test_email/test__header_value_parser.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 5a7cc0b666b98eb..bf71bb846837a68 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1596,6 +1596,7 @@ def get_atext_sequence(value, start): ) return atext, end +@_deprecate('get_atext_sequence') def get_atext(value): """atext = diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f179e6266a9d3e9..5b47dc2a35a0250 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2253,6 +2253,8 @@ def test_get_atext_sequence(self, s, *args, **kw): remainder='@', ), + # XXX POSTDEP: delete from here... 
+ ) @@ -2260,11 +2262,14 @@ def test_get_atext_sequence(self, s, *args, **kw): @params def test_get_atext(self, s, *args, **kw): + warnings = [(DeprecationWarning, '.*deprecated.*get_atext_sequence')] + if 'exception' not in kw: + warnings.append((DeprecationWarning, '.*deprecated')) atext = self._test_parse( parser.get_atext, C(s), *args, - warnings=..., + warnings=warnings, test_start=False, **kw, ) @@ -2275,6 +2280,8 @@ def test_get_atext(self, s, *args, **kw): params_test_get_atext = Params( + # XXX POSTDEP: ....to here + only = C( 'foobar', ), @@ -2312,6 +2319,7 @@ def test_get_atext(self, s, *args, **kw): **for_each_character(RFC_SPECIALS + RFC_WSP)( no_atext_before_special_or_wsp = C( '{char}foo', + # XXX POSTDEP: replace 'echar' with 'erchar': exception=(errors.HeaderParseError, '{echar}foo'), ), ), @@ -2328,6 +2336,8 @@ def test_get_atext(self, s, *args, **kw): ) + # XXX POSTDEP: Delete from here... + # # This params_map deals with the fact that get_atext doesn't call repr # on value in the exception message, but get_atext_sequence does. @params_map(with_namelist=True) @@ -2340,6 +2350,7 @@ def atext_repr_fixup(nl, *args, **kw): params_test_get_atext_sequence.update( atext_repr_fixup(params_test_get_atext) ) + # XXX POSTDEP: ...to here. # get_bare_quoted_string From 24087d8baac4bea5f2d0e94f6fd13296a28fa35f Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 8 May 2026 10:26:52 -0400 Subject: [PATCH 141/152] Refactor get_comment. BUGFIX: Per the RFC encoded words are allowed in comments, but previously we did not decode them. They are now correctly decoded. 
--- Lib/email/_header_value_parser.py | 54 ++++-- .../test_email/test__header_value_parser.py | 170 +++++++++++++----- 2 files changed, 158 insertions(+), 66 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index bf71bb846837a68..ae5da30c756b9dc 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1669,30 +1669,48 @@ def get_bare_quoted_string(value, start): ) return bare_quoted_string, start -def get_comment(value): +@_deprecate_old_api +def get_comment(value, start): """comment = "(" *([FWS] ccontent) [FWS] ")" - ccontent = ctext / quoted-pair / comment + ccontent = ctext / quoted-pair / encoded_word / comment + + If start does not point to an open parenthesis, raise an error. Otherwise + return a (possibly empty) Comment that incorporates all characters up to + the corresponding close parenthesis (or the end of the value if there is no + corresponding close parenthesis) and the index to the character after that + closing parenthesis (or the len of input), unquoting any quoted printables, + and decoding any encoded words. The Comment should be a nested token list + structure containing any nested comments. The Comment should not contain + any ValueTerminals for the parentheses, but when stringified the + parentheses should be added, whether the trailing parenthesis was present + or not. If the trailing parenthesis is not present register a defect. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. - We handle nested comments here, and quoted-pair in our qp-ctext routine. 
""" - if not value or value[0] != '(': + vlen = len(value) + if start >= vlen or value[start] != '(': raise errors.HeaderParseError( - "expected '(' but found '{}'".format(value)) + f"expected '(' but found {value[start:]!r}" + ) comment = Comment() - value = value[1:] - while value and value[0] != ")": - if value[0] in WSP: - token, value = get_fws(value) - elif value[0] == '(': - token, value = get_comment(value) + start += 1 + while start < vlen: + if (c := value[start]) == ")": + break + elif c == '(': + token, start = get_comment(value, start) + comment.append(token) else: - token, value = get_qp_ctext(value) - comment.append(token) - if not value: - comment.defects.append(errors.InvalidHeaderDefect( - "end of header inside comment")) - return comment, value - return comment, value[1:] + tl, start = get_ccontent_sequence(value, start) + comment.extend(tl) + else: + comment.defects.append( + errors.InvalidHeaderDefect("end of header inside comment"), + ) + return comment, start + return comment, start + 1 def get_cfws(value): """CFWS = (1*([FWS] comment) [FWS]) / FWS diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 5b47dc2a35a0250..004b88a0ceaebd6 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -585,10 +585,11 @@ def _test_parse( self.assertEqual(result.comments, comments) if commenttree is not None: self.assertEqual(self.ctree(result), commenttree) - self.assertEqual( - [x - prefix_len for x in result.ew_indexes], - ew_indexes, - ) + if ew_indexes is not ...: + self.assertEqual( + [x - prefix_len for x in result.ew_indexes], + ew_indexes, + ) return (result, *other) if other else result def verify_terminal_types(self, tl, *text_types): @@ -2538,31 +2539,29 @@ def test_get_comment(self, self.assertEqual(cmt.token_type, 'comment') self.verify_terminal_types(cmt, 'ptext', 'fws') - @params_map(with_name=True) + @params_map 
def adapt_get_ccontent_sequence_tests_for_get_comment( - name, s, *args, stringified=None, remainder='', + ew_indexes=[], **kw, ): # get_comment parses parens, and quotes them differently in str, so # tests involving parens in the test string won't pass here. if '(' in s or ')' in s: return - # XXX XXX (most) ew tests will work after get_comment is refactored. - if 'ew' in name: - return if stringified: kw['comments'] = [stringified] kw['stringified'] = f"({stringified})" else: kw['comments'] = [s] kw.pop('value', None) + kw['ew_indexes'] = [x + 1 for x in ew_indexes] yield 'from_test_get_ccontent_sequence', C(f'({s})', *args, **kw) - params_test_get_comment = old_api_only( + params_test_get_comment = for_each_api( adapt_get_ccontent_sequence_tests_for_get_comment( params_test_get_ccontent_sequence, @@ -2711,35 +2710,107 @@ def adapt_get_ccontent_sequence_tests_for_get_comment( remainder='=?UTF-8?q?foo?=', ), - # XXX XXX comments may contain EWs, but the current code is buggy. - # These will get decoded after the refactor is done. We'll add some - # some more test then, this is a target sample. 
- ws_around_ew = C( '( =?utf-8?q?test?= )', - #stringified='( test )', - comments=[' =?utf-8?q?test?= '], - #comments=[' test '], + stringified='( test )', + comments=[' test '], + ew_indexes=[2], ), ew_in_nested_comment = C( '(foo (=?UTF-8?q?bar?=))', - #stringified='(foo (bar))', - comments=['foo (=?UTF-8?q?bar?=)'], - #comments=['foo (bar)'], - commenttree=['foo ', ['=?UTF-8?q?bar?=']], - #commenttree=['foo ', ['bar']], + stringified='(foo (bar))', + comments=['foo (bar)'], + commenttree=['foo ', ['bar']], + ew_indexes=[6], ), ew_missing_whitespace = C( '(=?UTF-8?q?foo?==?UTF-8?q?bar?=)', - #stringified='(foobar)', - comments=['=?UTF-8?q?foo?==?UTF-8?q?bar?='], - #comments=['foobar'], - #defects=[ - # missing_whitespace_after_ew_defect, - # missing_whitespace_before_ew_defect, - # ], + stringified='(foobar)', + comments=['foobar'], + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[1, 16], + ), + + no_ws_around_ew = C( + '(=?UTF-8?q?test?=)', + stringified='(test)', + comments=['test'], + ew_indexes=[1], + ), + + ws_inside_ew = C( + '(=?UTF-8?q? 
Test ?=)', + stringified='( Test )', + comments=[' Test '], + defects=[whitespace_inside_ew_defect], + ew_indexes=[1], + ), + + non_ws_around_ew = C( + '(foo=?UTF-8?q?bar_?=bird)', + stringified='(foobar bird)', + comments=['foobar bird'], + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[4], + ), + + multiple_ew = C( + '(foo =?UTF-8?q?a?= =?UTF-8?q?t?=)', + stringified='(foo at)', + comments=['foo at'], + ew_indexes=[5, 19], + ), + + **for_each_character(RFC_WSP)( + inter_ew_whitespace_handled_correctly = C( + '({char}=?UTF-8?q?_foo_?={char}{char}=?UTF-8?q?bar_?= )', + stringified='({char} foo bar )', + comments=['{char} foo bar '], + ew_indexes=[2, 21], + ), + ), + + ew_nested_first_comment_valid_no_ws = C( + '((=?UTF-8?q?foo?=)=?UTF-8?q?bar?=)', + stringified='((foo)bar)', + comments=['(foo)bar'], + commenttree=[['foo'], 'bar'], + ew_indexes=[2, 18], + ), + + ew_in_nested_second_comment_valid_no_ws = C( + '(=?UTF-8?q?foo?=(=?UTF-8?q?bar?=))', + stringified='(foo(bar))', + comments=['foo(bar)'], + commenttree=['foo', ['bar']], + ew_indexes=[1, 17], + ), + + # parenthesis inside encoded words in comments is RFC illegal, but + # we handle it anyway. XXX we aren't registering defects for this, but + # ideally we should be. + + qp_inside_ew = C( + r'(=?UTF-8?q?\test\)_?= =?UTF-8?q?\(test?=)', + stringified=r'(test\) \(test)', + comments=['test) (test'], + ew_indexes=[1, 22], + ), + + unquoted_parens_inside_ew = C( + '(=?UTF-8?q?test)_?= =?UTF-8?q?(test?=) foo', + stringified=r'(test\) \(test)', + comments=[r'test) (test'], + remainder=' foo', + ew_indexes=[1, 20], ), ) @@ -2828,25 +2899,24 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): remainder='=?UTF-8?q?foo?=', ), - # XXX XXX these will get decoded after refactor is done. 
- ew_in_nested_comment = C( ' (a) (foo (=?UTF-8?q?bar?=))', - #stringified=' (a) (foo (bar))', - comments=['a', 'foo (=?UTF-8?q?bar?=)'], - #comments=['a', 'foo (bar)'], - #commenttree=[('a', []), ('foo (bar)', [('bar', [])])], + stringified=' (a) (foo (bar))', + comments=['a', 'foo (bar)'], + commenttree=[['a'], ['foo ', ['bar']]], + # XXX XXX this index will change during refactor. + ew_indexes=[6], ), ew_missing_whitespace = C( '(=?UTF-8?q?foo?==?UTF-8?q?bar?=) (b)', - #stringified='(foobar) (b)', - comments=['=?UTF-8?q?foo?==?UTF-8?q?bar?=', 'b'], - #comments=['foobar', 'b'], - #defects=[ - # missing_whitespace_after_ew_defect, - # missing_whitespace_before_ew_defect, - # ], + stringified='(foobar) (b)', + comments=['foobar', 'b'], + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[1, 16], ), nested_and_unnested_empty_comments = C( @@ -2919,6 +2989,10 @@ def adapt_get_cfws_tests_for_get_quoted_string( for k in ('comments', 'commenttree', 'defects'): if (v := kw.get(k)): kw[k] = v * 2 + # XXX XXX mid refactoring the idx values are wrong. Replace this + # when get_quoted_string is refactored. + if kw.get('ew_indexes'): + kw['ew_indexes'] = ... yield 'adapted_from_get_cfws', C(new_s, **kw) params_test_get_quoted_string = old_api_only( @@ -3087,6 +3161,10 @@ def adapt_get_cfws_tests_for_get_atom( for k in ('comments', 'commenttree', 'defects'): if (v := kw.get(k)): kw[k] = v * 2 + # XXX XXX mid refactoring the idx values are wrong. Replace this + # when get_atom is refactored. + if kw.get('ew_indexes'): + kw['ew_indexes'] = ... yield 'adapted_from_get_cfws', C(new_s, **kw) params_test_get_atom = old_api_only( @@ -3989,14 +4067,10 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): less_invalid_ew_atoms = C( '=?utf-8?q?foo_?= . (=?utf-8?q?test?=) =?utf-8?q?_bar?= .bird', - # XXX XXX after refactoring the comment ew will also be decoded. - #stringified='foo . 
(test) bar .bird', - stringified='foo . (=?utf-8?q?test?=) bar .bird', + stringified='foo . (test) bar .bird', value="foo . bar .bird", local_part="foo . bar.bird", - # XXX XXX after refactoring the comment ew will also be decoded. - # comments=['test'] - comments=['=?utf-8?q?test?='], + comments=['test'], ), # XXX XXX Since we've decided to decode encoded words, this becomes a From 51e075a8d6202acc9c8c4e969ff4ca2bf39f5f66 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 21 Jan 2026 18:27:40 -0500 Subject: [PATCH 142/152] Finalize deprecation of get_qp_ctext. No code is calling it any longer. --- Lib/email/_header_value_parser.py | 4 ++-- Lib/test/test_email/test__header_value_parser.py | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ae5da30c756b9dc..f4794a99bb0988b 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1531,8 +1531,8 @@ def get_ccontent_sequence(value, start): """ return _get_ccontent_content(value, start) -@_deprecate('get_ccontent_sequence') -def get_qp_ctext(value): +@_replaced_with('get_ccontent_sequence') +def _deprecated_get_qp_ctext(value): r"""ctext = This is not the RFC ctext, since we are handling nested comments in comment diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 004b88a0ceaebd6..9d4c67d05d1ef86 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -260,6 +260,7 @@ def test_deprecated_names(self, name): @params(with_names( # XXX XXX make sure this is completely filled in with all the names # we've replaced. 
+ get_qp_ctext='get_ccontent_sequence', )) def test_replaced_names(self, oldname, newname): with check_all_warnings(( @@ -1973,15 +1974,11 @@ def test_get_ccontent_sequence(self, s, *args, **kw): @params def test_get_qp_ctext(self, s, *args, value=' ', **kw): ptext = self._test_parse( - parser.get_qp_ctext, + parser._deprecated_get_qp_ctext, C(s), *args, value=value, - warnings=[ - (DeprecationWarning, '.*deprecated.*get_ccontent_sequence'), - (DeprecationWarning, '.*ptext.*deprecated'), - (DeprecationWarning, '.*validate.*deprecated'), - ], + warnings=..., test_start=False, **kw, ) From b282459addd490a4772dbaeeda281c031a4acfd2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Fri, 23 Jan 2026 16:14:12 -0500 Subject: [PATCH 143/152] Refactor get_cfws. BUGFIX: get_cfws would return an empty token list if the input was empty or started with non-cfws. This is clearly buggy, since it makes the parse tree inaccurate. It now raises a HeaderParseError in this case. When called via the old API, it generates a warning message and continues to return the buggy value. The existing code never calls it wrong, and hopefully no one else does either. --- Lib/email/_header_value_parser.py | 36 +++++++++++++++---- .../test_email/test__header_value_parser.py | 23 ++++++++---- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index f4794a99bb0988b..2fcad30be437f2c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1712,18 +1712,42 @@ def get_comment(value, start): return comment, start return comment, start + 1 -def get_cfws(value): +@_deprecate_old_api_and_lack_of_raise_on_invalid_input +def get_cfws(value, start): """CFWS = (1*([FWS] comment) [FWS]) / FWS + Raise an error if start does not point to either whitespace or an open + parenthesis in value. 
Otherwise return a CFWSList containing any + whitespace or comments up to the next non-CFWS character outside of a + comment (or the end of value), and the index of that next character (or the + len of value). + """ cfws = CFWSList() - while value and value[0] in CFWS_LEADER: - if value[0] in WSP: - token, value = get_fws(value) + vlen = len(value) + while start < vlen: + if (c := value[start]) in WSP: + token, start = get_fws(value, start) + elif c == '(': + token, start = get_comment(value, start) else: - token, value = get_comment(value) + break cfws.append(token) - return cfws, value + if not cfws: + # XXX POSTDEP: change this to raise the exception. + return ( + cfws, + start, + errors.HeaderParseError( + f'expected cfws but found {value[start:]!r}' + ), + ( + "Calling get_cfws when there is no whitespace or comment at" + " the start is deprecated and will raise an error in the" + " future." + ), + ) + return cfws, start def get_quoted_string(value): """quoted-string = [CFWS] [CFWS] diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 9d4c67d05d1ef86..7374a0767d850ae 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2833,15 +2833,25 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): # of just the single nested comment it does for Comment. if 'commenttree' in kw: kw['commenttree'] = [kw['commenttree']] - # XXX: get_cfws has the same bug that get_fws has: it does *not* raise - # an error if there is no cfws, and it should. - # XXX XXX Like get_fws, we'll deprecate this in the refactor. - if nl.has_any('empty', 'non_wsp_before_left_paren_is_error'): + # XXX POSTDEP: delete from here... + # get_cfws had the same bug that get_fws had: it did *not* + # raise an error if there is no cfws, and it should. For backward + # compatibility we continue to not raise under the old api. 
+ if ('oldapi' in nl + and nl.has_any('empty', 'non_wsp_before_left_paren_is_error') + ): kw.pop('exception') kw['remainder'] = s + kw['warnings'] = kw.get('warnings', []) + [ + ( + DeprecationWarning, + r'(?i)(?=.*no whitespace)(?=.*comment)(?=.*raise)', + ) + ] + # XXX POSTDEP: ...to here yield 'from_test_get_comment', C(s, *args, **kw) - params_test_get_cfws = old_api_only( + params_test_get_cfws = for_each_api( # get_cfws should behave exactly the same as get_fws when parsing # whitespace only strings, except for the case of ending at a '(' @@ -2901,8 +2911,7 @@ def adapt_comment_tests_for_cfws(nl, s, *args, **kw): stringified=' (a) (foo (bar))', comments=['a', 'foo (bar)'], commenttree=[['a'], ['foo ', ['bar']]], - # XXX XXX this index will change during refactor. - ew_indexes=[6], + ew_indexes=[11], ), ew_missing_whitespace = C( From ac79897226a8a6b1525cf99c8b6a386beb0df6ce Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 26 Jan 2026 15:12:54 -0500 Subject: [PATCH 144/152] Refactor get_quoted_string. --- Lib/email/_header_value_parser.py | 28 +++++++++------ .../test_email/test__header_value_parser.py | 34 ++++++++++++------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 2fcad30be437f2c..e857d5f24fb1a12 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1749,23 +1749,29 @@ def get_cfws(value, start): ) return cfws, start -def get_quoted_string(value): - """quoted-string = [CFWS] [CFWS] +@_deprecate_old_api +def get_quoted_string(value, start): + """quoted-string = [CFWS] bare-quoted-string [CFWS] + + Return a QuotedString containing the leading CFWSList (if any), the + BareQuotedString, and the trailing CFWSList (if any), plus the index of the + character after the parsed text (or the len of value if there is no text + left unparsed). + + If no bare-quoted-string is found raise a HeaderParseError. 
- 'bare-quoted-string' is an intermediate class defined by this - parser and not by the RFC grammar. It is the quoted string - without any attached CFWS. """ quoted_string = QuotedString() - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) quoted_string.append(token) - token, value = get_bare_quoted_string(value) + token, start = get_bare_quoted_string(value, start) quoted_string.append(token) - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) quoted_string.append(token) - return quoted_string, value + return quoted_string, start def get_atom(value): """atom = [CFWS] 1*atext [CFWS] diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 7374a0767d850ae..84d21833557acdd 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2953,7 +2953,6 @@ def test_get_quoted_string( self.assertEqual(qs.token_type, 'quoted-string') self.verify_terminal_types(qs, 'ptext', 'fws') - # get_quoted_string should pass any get_bare_quoted_string test that # doesn't involve leading or trailing whitespace. @params_map @@ -2967,9 +2966,9 @@ def adapt_bare_quoted_string_tests_for_get_quoted_string(s, *args, **kw): kw['quoted_value'] = kw.get('stringified', s[:-len(r)] if r else s) yield 'from_test_bare_quoted_string', C(s, *args, **kw) - # If there is no remainder a cfws test string should be valid as a quoted - # string prefix or suffix, with a few exceptions that test for what happens - # if closing parens are missing. 
+ # If there is no remainder or exception expectation, a cfws test string + # should be valid as a quoted string prefix or suffix, with a few + # exceptions that test for what happens if closing parens are missing. @params_map(with_namelist=True) def adapt_get_cfws_tests_for_get_quoted_string( nl, @@ -2977,13 +2976,15 @@ def adapt_get_cfws_tests_for_get_quoted_string( *args, stringified=None, remainder=None, + exception=None, **kw, ): - if remainder or nl.has_any( + if remainder or exception or nl.has_any( 'multiple_mesting_missing_two_right_parens', 'no_right_paren_after_non_ws', 'no_right_paren_after_ws', 'header_ends_in_comment', + 'empty', # XXX POSTDEP remove this line, it's from a deprecation ): return new_s = f'{s} "foo" {s}' @@ -2995,13 +2996,11 @@ def adapt_get_cfws_tests_for_get_quoted_string( for k in ('comments', 'commenttree', 'defects'): if (v := kw.get(k)): kw[k] = v * 2 - # XXX XXX mid refactoring the idx values are wrong. Replace this - # when get_quoted_string is refactored. - if kw.get('ew_indexes'): - kw['ew_indexes'] = ... + if (idxs := kw.get('ew_indexes')): + kw['ew_indexes'] = idxs + [x + len(s) + 7 for x in idxs] yield 'adapted_from_get_cfws', C(new_s, **kw) - params_test_get_quoted_string = old_api_only( + params_test_get_quoted_string = for_each_api( adapt_bare_quoted_string_tests_for_get_quoted_string( params_test_get_bare_quoted_string, @@ -3598,9 +3597,16 @@ def adapt_get_atom_tests_for_get_word(*args, **kw): kw['tokenlisttype'] = parser.TokenList yield '', C(*args, **kw) - @params_map - def adapt_get_quoted_string_tests_for_get_word(*args, **kw): + @params_map(with_namelist=True) + def adapt_get_quoted_string_tests_for_get_word(nl, *args, **kw): kw['tokenlisttype'] = parser.QuotedString + # XXX XXX Compensate for the fact that get_word is currently peeling + # off the first cfws without copying the indexes, and is only passing + # get_quoted_string the truncated value. 
+ if ('adapted_from_get_cfws' in nl + and (idxs := kw.get('ew_indexes')) + ): + kw['ew_indexes'] = [x + 6 for x in idxs[:len(idxs)//2]] yield '', C(*args, **kw) params_test_get_word = old_api_only( @@ -4138,6 +4144,10 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): kw['value'] = kw.pop('quoted_value') if 'exception' not in kw: kw['local_part'] = kw.pop('content') + # XXX XXX indexes won't be right mid-refactor, remove when + # get_local_part refactored. + if 'ew_indexes' in kw: + kw['ew_indexes'] = ... yield '', C(*args, **kw) # XXX XXX revert to no with_namelist when get_local_part is refactored From 0362f67fc464b73a393e1b1d3eba44837285c6ad Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Tue, 27 Jan 2026 10:24:27 -0500 Subject: [PATCH 145/152] Refactor get_atom. BUGFIX: Previously, while encoded words in phrases (e.g. display names) would be decoded if they were surrounded by whitespace, encoded words embedded in non-whitespace would not be. While the latter is not RFC compliant, most email clients do decode them. The email package now does so as well. BUGFIX: Missing leading whitespace on encoded words preceded by a comment was not previously reported as a defect, but is now. BUGFIX: The TokenList method 'startswith_fws' would raise an IndexError if the TokenList was empty. This could only be an issue to user code if that code directly manipulated the private _parse_tree attribute of headers, or used the private _header_value_parser module directly. This changeset starts the elimination of 'vtext' as a thing. Instead of an atom list being comprised of a mixture of atext and vtext, it is now all labeled as atext. This gives a more concrete meaning to the Terminal token_type: it is what context the text came from. This is how it is being used in the folder: things sourced from atext or ptext need to be quoted if they contain any specials. 
This should not be visible to user code unless it is really digging in to the internals of this private module or the _parse_tree attached to headers. As part of this refactoring I've added endswith_fws in parallel to the existing startswith_fws, to support the checks for missing whitespace. --- Lib/email/_header_value_parser.py | 65 ++++++---- .../test_email/test__header_value_parser.py | 114 ++++++++++-------- 2 files changed, 110 insertions(+), 69 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index e857d5f24fb1a12..bd5ae1407db8b63 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -178,7 +178,10 @@ def all_defects(self): return sum((x.all_defects for x in self), self.defects) def startswith_fws(self): - return self[0].startswith_fws() + return self and self[0].startswith_fws() + + def endswith_fws(self): + return self and self[-1].endswith_fws() as_ew_allowed = True @@ -969,6 +972,9 @@ def value(self): def startswith_fws(self): return self and self[0] in WSP + def endswith_fws(self): + return self and self[-1] in WSP + class ValueTerminal(Terminal): @@ -979,6 +985,9 @@ def value(self): def startswith_fws(self): return False + def endswith_fws(self): + return False + class EWWhiteSpaceTerminal(WhiteSpaceTerminal): @@ -1773,32 +1782,44 @@ def get_quoted_string(value, start): quoted_string.append(token) return quoted_string, start -def get_atom(value): +@_deprecate_old_api +def get_atom(value, start): """atom = [CFWS] 1*atext [CFWS] - An atom could be an rfc2047 encoded word. + Return an Atom containing the leading and trailing CFWSList tokens + if appropriate, as well as ValueTerminals of token_type atext, containing + all characters up to the next SPECIAL character or the end of value, and a + pointer to the special or the len of value. + + Decode any encoded words, regardless of whitespace, registering defects + if the RFC required whitespace is missing. 
+ + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ + # We decode encoded words mixed in to atext without whitespace to in-total + # comprise the body of the atom. This might qualify as a separate defect. atom = Atom() - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) atom.append(token) - if value and value[0] in ATOM_ENDS: + if start >= vlen or value[start] in ATOM_ENDS: raise errors.HeaderParseError( - "expected atom but found '{}'".format(value)) - if value.startswith('=?'): - try: - token, value = get_encoded_word(value) - except errors.HeaderParseError: - # XXX: need to figure out how to register defects when - # appropriate here. - token, value = get_atext(value) - else: - token, value = get_atext(value) - atom.append(token) - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + "expected atom but found '{}'".format(value[start:])) + tl, start = get_atext_sequence(value, start) + if (tl[0].token_type == 'encoded-word' + and atom and not atom[-1].endswith_fws() + ): + atom.defects.append(_MissingWhitespaceBeforeEWDefect) + atom.extend(tl) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) + if tl[-1].token_type == 'encoded-word' and not token.startswith_fws(): + atom.defects.append(_MissingWhitespaceAfterEWDefect) atom.append(token) - return atom, value + return atom, start def get_dot_atom_text(value): """ dot-text = 1*atext *("." 1*atext) @@ -3362,7 +3383,9 @@ def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy): continue tstr = str(part) if not want_encoding: - if part.token_type in ('ptext', 'vtext'): + # XXX At the end of the old API deprecation period 'vtext' can + # be removed from this list as it will no longer exist at all. 
+ if part.token_type in ('ptext', 'atext', 'vtext'): # Encode if tstr contains special characters. want_encoding = not SPECIALSNL.isdisjoint(tstr) else: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 84d21833557acdd..3182d073d5cf60b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3138,11 +3138,11 @@ def test_get_atom(self, s, *args, **kw): return self.assertIsInstance(atom, parser.Atom) self.assertEqual(atom.token_type, 'atom') - self.verify_terminal_types(atom, 'atext', 'vtext', 'ptext', 'fws') + self.verify_terminal_types(atom, 'atext', 'ptext', 'fws') - # If there is no remainder a cfws test string should be valid as a atom - # prefix or suffix, with a few exceptions that test for what happens - # if closing parens are missing. + # If there is no remainder or exception expectation, a cfws test string + # should be valid as a atom prefix or suffix, with a few exceptions that + # test for what happens if closing parens are missing. @params_map(with_namelist=True) def adapt_get_cfws_tests_for_get_atom( nl, @@ -3150,13 +3150,15 @@ def adapt_get_cfws_tests_for_get_atom( *args, stringified=None, remainder=None, + exception=None, **kw, ): - if remainder or nl.has_any( + if remainder or exception or nl.has_any( 'multiple_mesting_missing_two_right_parens', 'no_right_paren_after_non_ws', 'no_right_paren_after_ws', 'header_ends_in_comment', + 'empty', # XXX POSTDEP remove this line, it's from a deprecation ): return new_s = f'{s} foo {s}' @@ -3166,13 +3168,11 @@ def adapt_get_cfws_tests_for_get_atom( for k in ('comments', 'commenttree', 'defects'): if (v := kw.get(k)): kw[k] = v * 2 - # XXX XXX mid refactoring the idx values are wrong. Replace this - # when get_atom is refactored. - if kw.get('ew_indexes'): - kw['ew_indexes'] = ... 
+ if (idxs := kw.get('ew_indexes')): + kw['ew_indexes'] = idxs + [x + len(s) + 5 for x in idxs] yield 'adapted_from_get_cfws', C(new_s, **kw) - params_test_get_atom = old_api_only( + params_test_get_atom = for_each_api( adapt_get_cfws_tests_for_get_atom(params_test_get_cfws), @@ -3181,8 +3181,6 @@ def adapt_get_cfws_tests_for_get_atom( include_unless( lambda n, s, *a, remainder='', **k: s.startswith(tuple(CFWS_LEADER)) - # XXX XXX disable the ew tests until get_atom is refactored - or 'ew_' in str(n) or remainder.startswith(tuple(CFWS_LEADER)), label='from_test_get_atext_sequence', )(params_test_get_atext_sequence), @@ -3264,6 +3262,7 @@ def adapt_get_cfws_tests_for_get_atom( ew_only = C( '=?utf-8?q?=20bob?=', stringified=' bob', + ew_indexes=[0], ), ew_and_comments = C( @@ -3271,30 +3270,31 @@ def adapt_get_cfws_tests_for_get_atom( stringified='(a) bob (b)', value=' bob ', comments=['a', 'b'], + ew_indexes=[4], ), - # XXX XXX this should actually be two missing whitespace defects. ew_and_comments_no_ws = C( '(a)=?UTF-8?q?bob?=(b)', stringified='(a)bob(b)', value=' bob ', comments=['a', 'b'], defects=[ - #missing_whitespace_before_ew_defect, + missing_whitespace_before_ew_defect, missing_whitespace_after_ew_defect, ], + ew_indexes=[3], ), - # XXX XXX ditto ew_and_empty_comments_no_ws = C( '()=?UTF-8?q?bob?=()', stringified='()bob()', value=' bob ', comments=['', ''], defects=[ - #missing_whitespace_before_ew_defect, + missing_whitespace_before_ew_defect, missing_whitespace_after_ew_defect, ], + ew_indexes=[2], ), # XXX Ideally this should have a defect for the specials. @@ -3303,6 +3303,7 @@ def adapt_get_cfws_tests_for_get_atom( '=?UTF-8?q?bob{char}?= @foo', stringified='bob{char} ', remainder='@foo', + ew_indexes=[0], ), ), @@ -3312,29 +3313,26 @@ def adapt_get_cfws_tests_for_get_atom( remainder='@=?UTF-8?q?bob?=', ), - # XXX XXX Technically these are correct as is but we're going to fix it - # to always decode the ews anyway, because most email software does. 
- multiple_ew_no_ws = C( '=?UTF-8?q?foo?==?UTF-8?q?bar?=', - stringified='foo', - #stringified='foobar', - remainder='=?UTF-8?q?bar?=', + stringified='foobar', defects=[ missing_whitespace_after_ew_defect, - #missing_whitespace_before_ew_defect, + missing_whitespace_before_ew_defect, ], + ew_indexes=[0, 15], ), ew_in_middle_of_atom_text = C( 'foo{=?UTF-8?q?foo?=}{=?UTF-8?q?bar?=}bar', - #stringified='foo{foo}{bar}bar', - #defects=[ - # missing_whitespace_before_ew_defect, - # missing_whitespace_after_ew_defect, - # missing_whitespace_before_ew_defect, - # missing_whitespace_after_ew_defect, - # ], + stringified='foo{foo}{bar}bar', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[4, 21], ), empty_comments_no_ws = C( @@ -3590,7 +3588,7 @@ def test_get_word( self.assertEqual(word.quoted_value, quoted_value) if content is not None: self.assertEqual(word.content, content) - self.verify_terminal_types(word, 'dot', 'atext', 'ptext', 'fws', 'vtext') + self.verify_terminal_types(word, 'dot', 'atext', 'ptext', 'fws') @params_map def adapt_get_atom_tests_for_get_word(*args, **kw): @@ -3659,7 +3657,7 @@ def test_get_phrase(self, s, *args, obs_dots=0, **kw): obs_dots, phrase.ppstr(), ) - self.verify_terminal_types(phrase, 'dot', 'atext', 'ptext', 'fws', 'vtext') + self.verify_terminal_types(phrase, 'dot', 'atext', 'ptext', 'fws') @params_map(with_namelist=True) def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): @@ -3761,16 +3759,22 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): adjacent_ew = C( '=?ascii?q?Joi?= \t =?ascii?q?ned?=', stringified='Joined', + # XXX XXX second index will change during refactor + ew_indexes=[0, 0], ), adjacent_ew_different_encodings = C( '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', stringified='Bérénice', + # XXX XXX second index will change during refactor + ew_indexes=[0, 0], ), 
adjacent_ew_encoded_spaces = C( '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', stringified='Encoded spaces preserved', + # XXX XXX second and third indexes will change during refactor + ew_indexes=[0, 0, 0], ), adjacent_ew_comment_is_not_linear_white_space = C( @@ -3778,28 +3782,38 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): stringified='Comment (is not) linear-white-space', value='Comment linear-white-space', comments=['is not'], + # XXX XXX second index will change during refactor + ew_indexes=[0, 0], ), adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', stringified='Defect still joins', defects=[whitespace_inside_ew_defect], + # XXX XXX second index will change during refactor + ew_indexes=[0, 0], ), adjacent_ew_ignore_non_ew = C( '=?ascii?q?No?= =?join?= for non-ew', stringified='No =?join?= for non-ew', + ew_indexes=[0], ), adjacent_ew_ignore_invalid_ew = C( '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew', stringified='No =?ascii?rot13?wbva= for invalid ew', + ew_indexes=[0], ), adjacent_ew_missing_space = C( '=?ascii?q?Joi?==?ascii?q?ned?=', stringified='Joined', - defects=[missing_whitespace_after_ew_defect], + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], ), ew_before_quoted_string_missing_space = C( @@ -3808,12 +3822,11 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value='disjointed', defects=[ # XXX XXX After refactoring there should be one 'after' defect - missing_whitespace_after_ew_defect, + #missing_whitespace_after_ew_defect, ew_inside_quoted_string_defect, ], - # XXX XXX this will change during refactoring. Currently only - # get_bare_quoted_string is adding indexes. 
- ew_indexes=[1], + # XXX XXX second index will change during refactoring + ew_indexes=[0, 1], ), ew_after_quoted_string_missing_space = C( @@ -3825,9 +3838,8 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): #missing_whitespace_after_ew_defect, ew_inside_quoted_string_defect, ], - # XXX XXX this will change during refactoring. Currently only - # get_bare_quoted_string is adding indexes. - ew_indexes=[1], + # XXX XXX second index will change during refactoring + ew_indexes=[1, 0], ), **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + '."')( @@ -3904,7 +3916,6 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): 'atext', 'ptext', 'fws', - 'vtext', 'misplaced-special', ) @@ -4068,13 +4079,12 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): # XXX XXX There should be exactly one ew whitespace defect # here, but the number generated will change during refactor, # until it is fixed when get_obs_local_part is refactored. - missing_whitespace_after_ew_defect, + #missing_whitespace_after_ew_defect, missing_dot_in_local_part_defect, ew_inside_quoted_string_defect, ], - # XXX XXX this will change during refactoring. Currently only - # get_bare_quoted_string is adding indexes. - ew_indexes=[1], + # XXX XXX second index will change during refactor + ew_indexes=[0, 1], ), less_invalid_ew_atoms = C( @@ -4083,6 +4093,8 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): value="foo . bar .bird", local_part="foo . bar.bird", comments=['test'], + # XXX XXX the indexes will change during refactor + ew_indexes=[0, 0], ), # XXX XXX Since we've decided to decode encoded words, this becomes a @@ -4096,9 +4108,11 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): local_part="foo . 
bar.bird", defects=[ # XXX XXX the whitespace defects will change during refactoring - missing_whitespace_after_ew_defect, - missing_whitespace_after_ew_defect, + #missing_whitespace_after_ew_defect, + #missing_whitespace_after_ew_defect, ], + # XXX XXX second index will change during refactor + ew_indexes=[0, 0], ), ) @@ -4172,7 +4186,11 @@ def adapt_get_obs_local_part_tests_for_get_local_part( else: defects.append(non_dot_atom_local_part_obs_defect) # XXX XXX delete this fixup when get_local_part is refactored. - if 'invalid_ew_atoms' in nl: + if nl.has_any( + 'invalid_ew_atoms', + 'less_invalid_ew_atoms', + 'sort_of_valid_ew_dot_atom', + ): kw.pop('ew_indexes') yield '', C(*args, defects=defects, **kw) From e919de5c1e864b6da81c2a2d74329af039de72d5 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Thu, 5 Feb 2026 17:03:00 -0500 Subject: [PATCH 146/152] Refactor get_dot_atom_text. Which adds EW support. Which cascades to partially fix EW support in get_dot_atom. --- Lib/email/_header_value_parser.py | 55 +++++++++--- .../test_email/test__header_value_parser.py | 87 ++++++++++++++----- 2 files changed, 109 insertions(+), 33 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index bd5ae1407db8b63..c1277f404b74e6f 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1821,24 +1821,53 @@ def get_atom(value, start): atom.append(token) return atom, start -def get_dot_atom_text(value): - """ dot-text = 1*atext *("." 1*atext) +@_deprecate_old_api +def get_dot_atom_text(value, start): + """ dot-atom-text = 1*atext *("." 1*atext) + + Return a DotAtomText containing all characters up to the next non-'.' + special or WSP outside of an encoded word or the end of value, and the + index of the special, WSP, or the len of value, decoding any encoded words. + All ValueTerminals returned should have the type 'atext'. '.' characters + should be returned as ValueTerminals of token_type 'dot'.
+ + Encoded words should be decoded even if there is non-whitespace around + them, and whether or not they contain any RFC invalid whitespace. Register + defects for any missing whitespace. + + Register defects if there are any non-printable or undecodable characters + in the non-whitespace tokens. """ + # The only legitimate way an encoded word can be in a dot-atom-text + # position is if it is the only thing there. Following our policy of + # generous decoding we accept them anywhere in the dot-atom-text. The only + # defects we're registering are the whitespace defects. An encoded word is + # legitimate here; it's the whitespace that's wrong. To get it right the + # text, including the dots, would end up inside the encoded word. dot_atom_text = DotAtomText() - if not value or value[0] in ATOM_ENDS: - raise errors.HeaderParseError("expected atom at a start of " - "dot-atom-text but found '{}'".format(value)) - while value and value[0] not in ATOM_ENDS: - token, value = get_atext(value) - dot_atom_text.append(token) - if value and value[0] == '.': + vlen = len(value) + if start >= vlen or value[start] in ATOM_ENDS: + raise errors.HeaderParseError( + f"expected atom at a start of dot-atom-text" + f" but found {value[start:]!r}" + ) + while start < vlen and value[start] not in ATOM_ENDS: + token, start = get_atext_sequence(value, start) + if token[0].token_type == 'encoded-word' and dot_atom_text: + dot_atom_text.defects.append(_MissingWhitespaceBeforeEWDefect) + dot_atom_text.extend(token) + if start < vlen and value[start] == '.': + if dot_atom_text[-1].token_type == 'encoded-word': + dot_atom_text.defects.append(_MissingWhitespaceAfterEWDefect) dot_atom_text.append(DOT) - value = value[1:] + start += 1 if dot_atom_text[-1] is DOT: - raise errors.HeaderParseError("expected atom at end of dot-atom-text " - "but found '{}'".format('.'+value)) - return dot_atom_text, value + raise errors.HeaderParseError( + f"expected atom at end of dot-atom-text" + f" but found 
{value[start-1:]!r}" + ) + return dot_atom_text, start def get_dot_atom(value): """ dot-atom = [CFWS] dot-atom-text [CFWS] diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 3182d073d5cf60b..51f4b8a2139d8a2 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3357,18 +3357,15 @@ def test_get_dot_atom_text(self, s, *args, **kw): return self.assertIsInstance(atom, parser.DotAtomText) self.assertEqual(atom.token_type, 'dot-atom-text') - self.verify_terminal_types(atom, 'dot', 'atext') + # There can be fws inside encoded words. + self.verify_terminal_types(atom, 'dot', 'atext', 'fws') - params_test_get_dot_atom_text = old_api_only( + params_test_get_dot_atom_text = for_each_api( # a bare atext is valid in a dot-atom, so we should pass all the # get_atext_sequence tests except the ones involving the dot. include_unless( - lambda n, *a, **k: 'full_stop' in n - # XXX XXX disable ew tests until get_dot_atom_text refactored - or 'ew_' in str(n) - # XXX XXX disable the test involving an escaped repr likewise. 
- or n.has_all('no_atext_before_special_or_wsp', 'HT'), + lambda n, *a, **k: 'full_stop' in n, label='from_test_get_atext', )(params_test_get_atext_sequence), @@ -3389,7 +3386,7 @@ def test_get_dot_atom_text(self, s, *args, **kw): **for_each_character(RFC_SPECIALS + RFC_WSP)( raises_on_leading_special_or_wsp = C( '{char}foo.bar', - exception=(errors.HeaderParseError, r'expected.*{echar}foo\.'), + exception=(errors.HeaderParseError, r'expected.*{erchar}foo\.'), ), ), @@ -3425,6 +3422,55 @@ def test_get_dot_atom_text(self, s, *args, **kw): ), ), + ew = C( + '=?UTF-8?q?foo?=', + stringified='foo', + ew_indexes=[0], + ), + + two_ew_two_atoms = C( + '=?UTF-8?q?foo?= =?UTF-8?q?bar?=', + stringified='foo', + remainder=' =?UTF-8?q?bar?=', + ew_indexes=[0], + ), + + # The tests above are the only RFC valid way for an encoded word to be + # in a dot-atom-text, but we're going to be generous. + + two_ew_with_dot = C( + '=?UTF-8?q?foo?=.=?UTF-8?q?bar?=', + stringified='foo.bar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 16], + ), + + two_ew_no_dot = C( + '=?UTF-8?q?foo?==?UTF-8?q?bar?=', + stringified='foobar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], + ), + + mixed_ews_and_atext = C( + 'foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar', + stringified='foo.bar foobar.foobar', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + # XXX XXX indexes will change during the refactor. + ew_indexes=[7, 27], + ), + ) @@ -3533,23 +3579,24 @@ def test_get_dot_atom(self, s, *args, **kw): comments=['hey'], ), - # XXX XXX These additional EW cases not already tested by the atom - # tests will be fully decoded after refactoring. 
- mixed_ews_and_atext = C( '(hey)foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar (hey)', - #stringified='(hey)foo.bar foobar.foobar (hey)', - value=' foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar ', - #value=' foo.bar foobar.foobar ', - #defects=[ - # missing_whitespace_before_ew_defect, - # missing_whitespace_after_ew_defect, - # missing_whitespace_before_ew_defect, - # missing_whitespace_after_ew_defect, - # ], + stringified='(hey)foo.bar foobar.foobar (hey)', + value=' foo.bar foobar.foobar ', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], comments=['hey', 'hey'], + # XXX XXX indexes will change during the refactor. + ew_indexes=[7, 27], ), + # XXX XXX This additional EW case not already tested by the atom + # tests will be fully decoded after refactoring. + two_ew_with_dot = C( '=?UTF-8?q?foo?=.=?UTF-8?q?bar?=(hey)', stringified='foo', From 578913f2eec3f2bd21ecd7bec6e29d7b269bfc69 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 3 May 2026 13:52:43 -0400 Subject: [PATCH 147/152] Finalize deprecation of get_atext. No code is using it at this point. 
--- Lib/email/_header_value_parser.py | 8 ++++---- Lib/test/test_email/test__header_value_parser.py | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index c1277f404b74e6f..09040a7f89ffef2 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1150,7 +1150,7 @@ def _(func): ) _deprecated__wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split -_non_atom_end_matcher = re.compile(r"[^{}]+".format( +_deprecated__non_atom_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(ATOM_ENDS)))).match _non_token_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(TOKEN_ENDS)))).match @@ -1605,14 +1605,14 @@ def get_atext_sequence(value, start): ) return atext, end -@_deprecate('get_atext_sequence') -def get_atext(value): +@_replaced_with('get_atext_sequence') +def _deprecated_get_atext(value): """atext = We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to the token's defects list if we find non-atext characters. """ - m = _non_atom_end_matcher(value) + m = _deprecated__non_atom_end_matcher(value) if not m: raise errors.HeaderParseError( "expected atext but found '{}'".format(value)) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 51f4b8a2139d8a2..7c2aa3ceab125d1 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -249,6 +249,7 @@ def _deprecated_bar(a): '_InvalidEwError', 'rfc2047_matcher', '_wsp_splitter', + '_non_atom_end_matcher', )) def test_deprecated_names(self, name): with check_all_warnings(( @@ -261,6 +262,7 @@ def test_deprecated_names(self, name): # XXX XXX make sure this is completely filled in with all the names # we've replaced. 
get_qp_ctext='get_ccontent_sequence', + get_atext='get_atext_sequence', )) def test_replaced_names(self, oldname, newname): with check_all_warnings(( @@ -2260,14 +2262,11 @@ def test_get_atext_sequence(self, s, *args, **kw): @params def test_get_atext(self, s, *args, **kw): - warnings = [(DeprecationWarning, '.*deprecated.*get_atext_sequence')] - if 'exception' not in kw: - warnings.append((DeprecationWarning, '.*deprecated')) atext = self._test_parse( - parser.get_atext, + parser._deprecated_get_atext, C(s), *args, - warnings=warnings, + warnings=..., test_start=False, **kw, ) From 1821da2143e51450cf67ef39d40be5181ced34c4 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sun, 8 Feb 2026 14:31:13 -0500 Subject: [PATCH 148/152] Refactor get_dot_atom. This completes the elimination of 'vtext' as a thing outside of direct calls to the old API. It also changes the treatment of the 'sort_of_valid_ew' case in the local part tests from an 'obs' case into a non-obs case, because now get_dot_atom is parsing it instead of raising an error. --- Lib/email/_header_value_parser.py | 45 +++++---- .../test_email/test__header_value_parser.py | 97 +++++++++---------- 2 files changed, 71 insertions(+), 71 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 09040a7f89ffef2..b8855ab5cf5b9b5 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1869,30 +1869,39 @@ def get_dot_atom_text(value, start): ) return dot_atom_text, start -def get_dot_atom(value): +@_deprecate_old_api +def get_dot_atom(value, start): """ dot-atom = [CFWS] dot-atom-text [CFWS] - Any place we can have a dot atom, we could instead have an rfc2047 encoded - word. + Return a DotAtom containing leading and trailing CFWSList tokens, if + appropriate, as well as a DotAtomText token, containing all of the + characters up to the next SPECIAL character or the end of value, + and a pointer to the special or the len of value. 
+ + Decode any encoded words, regardless of whitespace, registering defects + if the RFC required whitespace is missing. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ dot_atom = DotAtom() - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) dot_atom.append(token) - if value.startswith('=?'): - try: - token, value = get_encoded_word(value) - except errors.HeaderParseError: - # XXX: need to figure out how to register defects when - # appropriate here. - token, value = get_dot_atom_text(value) - else: - token, value = get_dot_atom_text(value) - dot_atom.append(token) - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + tl, start = get_dot_atom_text(value, start) + if (tl[0].token_type == 'encoded-word' + and dot_atom and not dot_atom[-1].endswith_fws() + ): + dot_atom.defects.append(_MissingWhitespaceBeforeEWDefect) + dot_atom.append(tl) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) + if tl[-1].token_type == 'encoded-word' and not token.startswith_fws(): + dot_atom.defects.append(_MissingWhitespaceAfterEWDefect) dot_atom.append(token) - return dot_atom, value + return dot_atom, start def get_word(value): """word = atom / quoted-string diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 7c2aa3ceab125d1..50d5d7a036410dd 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3466,7 +3466,6 @@ def test_get_dot_atom_text(self, s, *args, **kw): missing_whitespace_before_ew_defect, missing_whitespace_after_ew_defect, ], - # XXX XXX indexes will change during the refactor. 
ew_indexes=[7, 27], ), @@ -3482,16 +3481,14 @@ def test_get_dot_atom(self, s, *args, **kw): return self.assertIsInstance(atom, parser.DotAtom) self.assertEqual(atom.token_type, 'dot-atom') - self.verify_terminal_types(atom, 'dot', 'atext', 'ptext', 'fws', 'vtext') + self.verify_terminal_types(atom, 'dot', 'atext', 'ptext', 'fws') - params_test_get_dot_atom = old_api_only( + params_test_get_dot_atom = for_each_api( # Atom is a subset of dot atom, so get_dot_atom should pass any # get_atom test except those involving the dot (full_stop). include_unless( - lambda n, *a, **k: 'full_stop' in n - # XXX XXX disable the ew tests until get_dot_atom is refactored - or 'ew_' in str(n), + lambda n, *a, **k: 'full_stop' in n, label='from_test_get_atom', )(params_test_get_atom), @@ -3538,6 +3535,7 @@ def test_get_dot_atom(self, s, *args, **kw): rfc2047_atom = C( '=?utf-8?q?=20bob?=', stringified=' bob', + ew_indexes=[0], ), **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( @@ -3576,6 +3574,7 @@ def test_get_dot_atom(self, s, *args, **kw): value=' foo ', remainder='=?UTF-8?q?bar?=', comments=['hey'], + ew_indexes=[6], ), mixed_ews_and_atext = C( @@ -3589,26 +3588,20 @@ def test_get_dot_atom(self, s, *args, **kw): missing_whitespace_after_ew_defect, ], comments=['hey', 'hey'], - # XXX XXX indexes will change during the refactor. - ew_indexes=[7, 27], + ew_indexes=[12, 32], ), - # XXX XXX This additional EW case not already tested by the atom - # tests will be fully decoded after refactoring. 
- two_ew_with_dot = C( '=?UTF-8?q?foo?=.=?UTF-8?q?bar?=(hey)', - stringified='foo', - #stringified='foo.bar(hey)', - value='foo', - #value='foo.bar ', - remainder='.=?UTF-8?q?bar?=(hey)', + stringified='foo.bar(hey)', + value='foo.bar ', defects=[ missing_whitespace_after_ew_defect, - # missing_whitespace_before_ew_defect, - # missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, ], - #comments=['hey'], + comments=['hey'], + ew_indexes=[0, 16], ), ) @@ -4143,24 +4136,6 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): ew_indexes=[0, 0], ), - # XXX XXX Since we've decided to decode encoded words, this becomes a - # "valid" dot-atom, which it will be treated as after the refactoring. - # But if you clear up the whitespace defects by adding whitespace, it - # turns into an obs_local_part because of the whitespace. - sort_of_valid_ew_dot_atom = C( - '=?utf-8?q?foo_?=.=?utf-8?q?_bar?=.bird', - stringified='foo . bar.bird', - value="foo . bar.bird", - local_part="foo . bar.bird", - defects=[ - # XXX XXX the whitespace defects will change during refactoring - #missing_whitespace_after_ew_defect, - #missing_whitespace_after_ew_defect, - ], - # XXX XXX second index will change during refactor - ew_indexes=[0, 0], - ), - ) @@ -4177,7 +4152,6 @@ def test_get_local_part(self, s, *args, local_part=None, **kw): 'atext', 'ptext', 'fws', - 'vtext', 'misplaced-special', ) self.assertEqual(lp.local_part, local_part) @@ -4196,6 +4170,10 @@ def adapt_get_dot_atom_tests_for_get_local_part(nl, s, *args, **kw): # For those two ew tests the blank comes from inside the ew. local_part = local_part.removeprefix(' ').removesuffix(' ') kw['local_part'] = local_part + # XXX XXX indexes won't be right mid-refactor, remove when + # get_local_part refactored. + if 'ew_indexes' in kw: + kw['ew_indexes'] = ... 
yield '', C(s, *args, **kw) @params_map @@ -4210,10 +4188,8 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): kw['ew_indexes'] = ... yield '', C(*args, **kw) - # XXX XXX revert to no with_namelist when get_local_part is refactored - @params_map(with_namelist=True) + @params_map def adapt_get_obs_local_part_tests_for_get_local_part( - nl, *args, defects=[], **kw, @@ -4231,13 +4207,10 @@ def adapt_get_obs_local_part_tests_for_get_local_part( defects.append(not_even_obs_local_part_defect) else: defects.append(non_dot_atom_local_part_obs_defect) - # XXX XXX delete this fixup when get_local_part is refactored. - if nl.has_any( - 'invalid_ew_atoms', - 'less_invalid_ew_atoms', - 'sort_of_valid_ew_dot_atom', - ): - kw.pop('ew_indexes') + # XXX XXX indexes won't be right mid-refactor, remove when + # get_local_part refactored. + if 'ew_indexes' in kw: + kw['ew_indexes'] = ... yield '', C(*args, defects=defects, **kw) params_test_get_local_part = old_api_only( @@ -4257,10 +4230,10 @@ def adapt_get_obs_local_part_tests_for_get_local_part( 'two_dots_raises', 'trailing_dot_raises', 'space_ends_dot_atom', - # XXX XXX These tests should pass after the refactoring - # of get_dot_atom. - 'two_ew_with_dot', - 'multiple_ew_no_ws', + # XXX XXX These need a logic fix to whitespace handling + # in get_local_part itself. + 'ew_and_comments_no_ws', + 'ew_and_empty_comments_no_ws', ) or # get_local_part handles quoted strings (tested above), @@ -4388,10 +4361,28 @@ def adapt_get_obs_local_part_tests_for_get_local_part( defects=[ # XXX XXX there should be exactly one missing whitespace here, # but it will change until we refactor get_local_part. - missing_whitespace_after_ew_defect, + #missing_whitespace_after_ew_defect, # XXX XXX there should be a defect for there being an EW at all. ], local_part='exámple', + ew_indexes=[0], + ), + + # Since we've decided to decode encoded words, this is a "valid" + # dot-atom. 
But if you clear up the whitespace defects by adding whitespace, it + # turns into an obs_local_part because of the whitespace. + sort_of_valid_ew_dot_atom = C( + '=?utf-8?q?foo_?=.=?utf-8?q?_bar?=.bird', + stringified='foo . bar.bird', + value="foo . bar.bird", + local_part="foo . bar.bird", + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + # XXX XXX There should also be an ew in local part defect. + ], + ew_indexes=[0, 17], ), ) From 2755b6d0ea9cfbf49429d6ae309339dbf360b51d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 16 Feb 2026 16:59:45 -0500 Subject: [PATCH 149/152] Refactor get_word. --- Lib/email/_header_value_parser.py | 47 ++++++++++--------- .../test_email/test__header_value_parser.py | 19 ++------ 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index b8855ab5cf5b9b5..66e0bb5db3f5c58 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1903,39 +1903,40 @@ def get_dot_atom(value, start): dot_atom.append(token) return dot_atom, start -def get_word(value): +@_deprecate_old_api +def get_word(value, start): """word = atom / quoted-string - Either atom or quoted-string may start with CFWS. We have to peel off this - CFWS first to determine which type of word to parse. Afterward we splice - the leading CFWS, if any, into the parsed sub-token. - - If neither an atom or a quoted-string is found before the next special, a - HeaderParseError is raised. - - The token returned is either an Atom or a QuotedString, as appropriate. - This means the 'word' level of the formal grammar is not represented in the - parse tree; this is because having that extra layer when manipulating the - parse tree is more confusing than it is helpful.
+ Return either an Atom or a QuotedString, as appropriate, containing any + leading or trailing whitespace, up to the next non-whitespace + non-special character, and a pointer to the special or the len of value. + If no quoted string or atom is found, raise a HeaderParseError. """ - if value and value[0] in CFWS_LEADER: - leader, value = get_cfws(value) + # The 'word' level of the RFC grammar is not represented in the parse tree; + # having that extra layer when manipulating the parse tree is more + # confusing than it is helpful, and would not affect re-folding. + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + leader, start = get_cfws(value, start) else: leader = None - if not value: + if start >= vlen: raise errors.HeaderParseError( "Expected 'atom' or 'quoted-string' but found nothing.") - if value[0]=='"': - token, value = get_quoted_string(value) - elif value[0] in SPECIALS: - raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' " - "but found '{}'".format(value)) + if value[start]=='"': + token, start = get_quoted_string(value, start) + elif value[start] in SPECIALS: + raise errors.HeaderParseError( + f"Expected 'atom' or 'quoted-string' but found {value[start:]!r}" + ) else: - token, value = get_atom(value) + token, start = get_atom(value, start) if leader is not None: - token[:0] = [leader] - return token, value + if not leader.endswith_fws() and token[0].token_type == 'encoded-word': + token.defects.append(_MissingWhitespaceBeforeEWDefect) + token.push(leader) + return token, start def get_phrase(value): """ phrase = 1*word / obs-phrase diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 50d5d7a036410dd..238fb025c1aa0f7 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3634,19 +3634,12 @@ def adapt_get_atom_tests_for_get_word(*args, **kw): kw['tokenlisttype'] = parser.TokenList yield 
'', C(*args, **kw) - @params_map(with_namelist=True) - def adapt_get_quoted_string_tests_for_get_word(nl, *args, **kw): + @params_map + def adapt_get_quoted_string_tests_for_get_word(*args, **kw): kw['tokenlisttype'] = parser.QuotedString - # XXX XXX Compensate for the fact that get_word is currently peeling - # off the first cfws without copying the indexes, and is only passing - # get_quoted_string the truncated value. - if ('adapted_from_get_cfws' in nl - and (idxs := kw.get('ew_indexes')) - ): - kw['ew_indexes'] = [x + 6 for x in idxs[:len(idxs)//2]] yield '', C(*args, **kw) - params_test_get_word = old_api_only( + params_test_get_word = for_each_api( # A word can be an atom, so get_word should pass many of the atom tests. adapt_get_atom_tests_for_get_word( @@ -3659,9 +3652,7 @@ def adapt_get_quoted_string_tests_for_get_word(nl, *args, **kw): 'no_atom_before_special', 'no_atext_before_special_or_wsp', ) - and 'quotation_mark' in n - # XXX XXX disable the ew tests until get_word is refactored - or 'ew_' in str(n), + and 'quotation_mark' in n, label='from_test_get_atom', )(params_test_get_atom), ), @@ -4133,7 +4124,7 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): local_part="foo . bar.bird", comments=['test'], # XXX XXX the indexes will change during refactor - ew_indexes=[0, 0], + ew_indexes=[0, 2, 20], ), ) From 2a5b64eefb82df319771d0b70c06370a7b35793b Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sat, 14 Mar 2026 17:00:36 -0400 Subject: [PATCH 150/152] Refactor get_phrase. Deprecating the lack of raise if there are no words behavior. BUGFIX: Previously get_phrase would return a phrase containing nothing or only comment folding whitespace if that's all there was. However, a phrase per the RFC must contain at least one word. get_phrase now correctly raises a HeaderParseError in this case. When called via the old API, the existing behavior is maintained, with a deprecation warning. 
In addition to the basic value -> value, start refactor, this commit adds correct missing ew whitespace detection to get_phrase. For some definition of correct. This commit also updates the 'comment_without_atom_in_obs_phrase' defect to talk about 'cfws' rather than 'comment', since that is what is really accepted as a defect. The constant name changes accordingly. --- Lib/email/_header_value_parser.py | 79 ++++++++----- .../test_email/test__header_value_parser.py | 106 +++++++++--------- 2 files changed, 109 insertions(+), 76 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 66e0bb5db3f5c58..e68369c9641127a 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1938,53 +1938,80 @@ def get_word(value, start): token.push(leader) return token, start -def get_phrase(value): +@_deprecate_old_api_and_lack_of_raise_on_invalid_input +def get_phrase(value, start): """ phrase = 1*word / obs-phrase obs-phrase = word *(word / "." / CFWS) - This means a phrase can be a sequence of words, periods, and CFWS in any - order as long as it starts with at least one word. If anything other than - words is detected, an ObsoleteHeaderDefect is added to the token's defect - list. We also accept a phrase that starts with CFWS followed by a dot; - this is registered as an InvalidHeaderDefect, since it is not supported by - even the obsolete grammar. + Return a Phrase containing any sequence of words, periods, and CFWS in + any order up to the next unquoted character that is not allowed in a phrase + or obsolete phrase, and a pointer to that character or the len of value. + If periods or cfws without adjacent words are found, add an + ObsoleteHeaderDefect to the token's defect list. If one or more periods + are found before the first word (or if there are no words, only periods and + whitespace), add an InvalidHeaderDefect. If there are no words or periods, + raise a HeaderParseError. 
""" + origstart = start + found_content = False phrase = Phrase() + vlen = len(value) try: - token, value = get_word(value) + token, start = get_word(value, start) + found_content = True phrase.append(token) except errors.HeaderParseError: phrase.defects.append(errors.InvalidHeaderDefect( "phrase does not start with word")) - while value and value[0] not in PHRASE_ENDS: - if value[0]=='.': + while start < vlen and value[start] not in PHRASE_ENDS: + if value[start]=='.': phrase.append(DOT) + found_content = True phrase.defects.append(errors.ObsoleteHeaderDefect( "period in 'phrase'")) - value = value[1:] + start += 1 else: try: - token, value = get_word(value) - if (token[0].token_type == 'encoded-word' - and phrase - and phrase[-1].token_type == 'atom' - and len(phrase[-1]) > 1 - and phrase[-1][-2].token_type == 'encoded-word' - and phrase[-1][-1].token_type == 'cfws' - and not phrase[-1][-1].comments - ): - # linear ws between ews needs special handing... - phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1], 'fws') + token, start = get_word(value, start) + found_content = True except errors.HeaderParseError: - if value[0] in CFWS_LEADER: - token, value = get_cfws(value) + if value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) phrase.defects.append(errors.ObsoleteHeaderDefect( - "comment found without atom")) + "cfws found without atom")) else: raise + if phrase and phrase[-1].token_type == 'atom': + if phrase[-1][-1].token_type == 'encoded-word': + if not token.startswith_fws(): + phrase.defects.append(_MissingWhitespaceAfterEWDefect) + elif (token[0].token_type == 'encoded-word' + and len(phrase[-1]) > 1 + and phrase[-1][-2].token_type == 'encoded-word' + and phrase[-1][-1].token_type == 'cfws' + and not phrase[-1][-1].comments + ): + phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1], 'fws') + if (phrase + and token[0].token_type == 'encoded-word' + and not phrase.endswith_fws() + ): + phrase.defects.append(_MissingWhitespaceBeforeEWDefect) 
phrase.append(token) - return phrase, value + if found_content: + return phrase, start + # XXX POSTDEP: change this to raise the exception. + return ( + phrase, + start, + errors.HeaderParseError( + f"expected phrase but found {value[origstart:]!r}", + ), + "Calling get_phrase when there is not at least one word or" + " period in addition to whitespace is deprecated and will" + " raise an error in the future." + ) def get_obs_local_part(value): """ obs-local-part = word *("." word) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 238fb025c1aa0f7..2fa6bca7e63e61a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -141,9 +141,9 @@ def charset_defect(chars): "period in 'phrase'", ) -comment_without_atom_in_phrase_obs_defect = ( +cfws_without_atom_in_phrase_obs_defect = ( errors.ObsoleteHeaderDefect, - 'comment found without atom', + 'cfws found without atom', ) non_word_phrase_start_defect = ( @@ -3690,47 +3690,62 @@ def test_get_phrase(self, s, *args, obs_dots=0, **kw): self.verify_terminal_types(phrase, 'dot', 'atext', 'ptext', 'fws') @params_map(with_namelist=True) - def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): + def adapt_get_word_tests_for_get_phrase(nl, s, *args, **kw): kw.pop('tokenlisttype') kw.pop('quoted_value', None) kw.pop('content', None) - # XXX XXX A phrase has to have at least one word, but the current code - # does not enforce this. We'll fix this in the refactor, but for now - # we skip the parameters that expect a raise on a value with no - # content. - if nl.has_any( - 'empty', - 'no_atom_before_special', - 'no_atom', - 'no_atext_before_special_or_wsp', - 'cfws_only_raises', - 'empty_input', - ): - return - yield '', C(*args, **kw) + # XXX POSTDEP: delete from here... + if 'oldapi' in nl: + # A phrase has to have at least one word, but the old code did not + # enforce this. 
For backward compatibility we preserve that + # behavior in the old api, so for the parameters that expect a + # raise on no content we'll either skip them or adapt them. + if nl.has_any( + 'no_atom_before_special', + 'no_atom', + 'cfws_only_raises', + 'empty_input', + ): + return + # These two tests will serve to test the lack-of-raise deprecation. + if nl.has_any( + 'empty', + 'no_atext_before_special_or_wsp', + ): + kw.pop('exception') + kw['remainder'] = s + kw['warnings'] = kw.get('warnings', []) + [ + ( + DeprecationWarning, + r'(?i)(?=.*word)(?=.*whitespace)(?=.*raise)', + ) + ] + kw['defects'] = kw.get('defects', []) + [ + non_word_phrase_start_defect, + ] + # XXX POSTDEP: ...to here + yield '', C(s, *args, **kw) - params_test_get_phrase = old_api_only( + params_test_get_phrase = for_each_api( # A phrase is a series of words, and single words are valid, # so get_phrase should pass many of the get_word tests. adapt_get_word_tests_for_get_phrase( + # A phrase only ends at specials other than " and ., so skip + # get_word tests that expect parsing to stop on those characters. include_unless( lambda n, *a, remainder=False, **k: n.has_any( - # A phrase only ends at specials other than " and . 'atom_ends_at_noncfws', + 'no_atext_before_special_or_wsp', 'qs_ends_at_noncfws', 'ew_after_dquote', 'encoded_word_after_dquote_with_no_ws', 'end_dquote_mid_word', - # XXX XXX This test should pass after refactoring. - 'multiple_ew_no_ws', ) - # A phrase does *not* end at a period or a quotation mark. - or remainder and n.has_any( - 'full_stop', - 'quotation_mark', - ), + or n.has_any('atom_ends_at_special', 'up_to_special') + and n.has_any('full_stop', 'quotation_mark') + or n.has_all('no_atom_before_special', 'full_stop'), label='from_test_get_word', )(params_test_get_word), ), @@ -3753,7 +3768,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value='Fred A. 
.O Johnson', defects=[ *[period_in_phrase_obs_defect]*2, - comment_without_atom_in_phrase_obs_defect, + cfws_without_atom_in_phrase_obs_defect, ], comments=['weird'], obs_dots=2, @@ -3764,7 +3779,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value=' .name', defects=[ non_word_phrase_start_defect, - comment_without_atom_in_phrase_obs_defect, + cfws_without_atom_in_phrase_obs_defect, period_in_phrase_obs_defect, ], comments=['even weirder'], @@ -3776,7 +3791,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value='simple phrase. ', defects=[ period_in_phrase_obs_defect, - comment_without_atom_in_phrase_obs_defect, + cfws_without_atom_in_phrase_obs_defect, ], remainder=':boo', comments=['with trailing comment'], @@ -3789,22 +3804,19 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): adjacent_ew = C( '=?ascii?q?Joi?= \t =?ascii?q?ned?=', stringified='Joined', - # XXX XXX second index will change during refactor - ew_indexes=[0, 0], + ew_indexes=[0, 18], ), adjacent_ew_different_encodings = C( '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', stringified='Bérénice', - # XXX XXX second index will change during refactor - ew_indexes=[0, 0], + ew_indexes=[0, 21], ), adjacent_ew_encoded_spaces = C( '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', stringified='Encoded spaces preserved', - # XXX XXX second and third indexes will change during refactor - ew_indexes=[0, 0, 0], + ew_indexes=[0, 20, 41], ), adjacent_ew_comment_is_not_linear_white_space = C( @@ -3812,16 +3824,14 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): stringified='Comment (is not) linear-white-space', value='Comment linear-white-space', comments=['is not'], - # XXX XXX second index will change during refactor - ew_indexes=[0, 0], + ew_indexes=[0, 29], ), adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', stringified='Defect still joins', defects=[whitespace_inside_ew_defect], - # XXX XXX second index will 
change during refactor - ew_indexes=[0, 0], + ew_indexes=[0, 16], ), adjacent_ew_ignore_non_ew = C( @@ -3851,12 +3861,10 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): stringified='disjoin"ted"', value='disjointed', defects=[ - # XXX XXX After refactoring there should be one 'after' defect - #missing_whitespace_after_ew_defect, + missing_whitespace_after_ew_defect, ew_inside_quoted_string_defect, ], - # XXX XXX second index will change during refactoring - ew_indexes=[0, 1], + ew_indexes=[0, 20], ), ew_after_quoted_string_missing_space = C( @@ -3864,12 +3872,10 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): stringified='"disjoin"ted', value='disjointed', defects=[ - # XXX XXX After refactoring 'after' should become 'before' - #missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, ew_inside_quoted_string_defect, ], - # XXX XXX second index will change during refactoring - ew_indexes=[1, 0], + ew_indexes=[1, 21], ), **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + '."')( @@ -3900,7 +3906,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): value=' . 
', defects=[ non_word_phrase_start_defect, - *[comment_without_atom_in_phrase_obs_defect]*2, + *[cfws_without_atom_in_phrase_obs_defect]*2, period_in_phrase_obs_defect, ], obs_dots=1, @@ -3912,7 +3918,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): comments=['foo', 'bar'], defects=[ non_word_phrase_start_defect, - *[comment_without_atom_in_phrase_obs_defect]*2, + *[cfws_without_atom_in_phrase_obs_defect]*2, period_in_phrase_obs_defect, ], obs_dots=1, @@ -3924,7 +3930,7 @@ def adapt_get_word_tests_for_get_phrase(nl, *args, **kw): comments=['foo', 'bar'], defects=[ non_word_phrase_start_defect, - *[comment_without_atom_in_phrase_obs_defect]*2, + *[cfws_without_atom_in_phrase_obs_defect]*2, period_in_phrase_obs_defect, ], obs_dots=1, From c185843b3ca28872c72b72d45864dfd899675ceb Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 1 May 2026 20:43:29 -0400 Subject: [PATCH 151/152] Refactor get_obs_local_part. Along the way I've tidied up the 'dot' defects to all have the same format, and use 'local-part' instead of 'local part' to be consistent with the usage in other defects. --- Lib/email/_header_value_parser.py | 49 ++++++++++++------- .../test_email/test__header_value_parser.py | 14 +++--- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index e68369c9641127a..ec91dcb28eb0c9c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2013,39 +2013,54 @@ def get_phrase(value, start): " raise an error in the future." ) -def get_obs_local_part(value): +@_deprecate_old_api +def get_obs_local_part(value, start): """ obs-local-part = word *("." word) + + Return an ObsLocalPart containing a list of words and DOTs containing + all of the characters up to the next character not allowed in a phrase or + the end of the value, and a pointer to the SPECIAL or the len of value. 
+ + Decode any encoded words, registering a defect if any are found. + Missing whitespace defects may also be registered. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ obs_local_part = ObsLocalPart() + vlen = len(value) last_non_ws_was_dot = False - while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): - if value[0] == '.': + while start < vlen and ((c := value[start]) == '\\' or c not in PHRASE_ENDS): + if c == '.': if last_non_ws_was_dot: obs_local_part.defects.append(errors.InvalidHeaderDefect( - "invalid repeated '.'")) + "invalid repeated '.' in local-part") + ) obs_local_part.append(DOT) last_non_ws_was_dot = True - value = value[1:] + start += 1 continue - elif value[0]=='\\': + elif c == '\\': # RFC 5322 doesn't allow \, but the old email code parsed it. - obs_local_part.append(ValueTerminal(value[0], - 'misplaced-special')) - value = value[1:] + obs_local_part.append(ValueTerminal(c,'misplaced-special')) + start += 1 obs_local_part.defects.append(errors.InvalidHeaderDefect( "'\\' character outside of quoted-string/ccontent")) last_non_ws_was_dot = False continue if obs_local_part and obs_local_part[-1].token_type != 'dot': - obs_local_part.defects.append(errors.InvalidHeaderDefect( - "missing '.' between words")) + obs_local_part.defects.append( + errors.InvalidHeaderDefect("missing '.' between words"), + ) try: - token, value = get_word(value) + token, start = get_word(value, start) last_non_ws_was_dot = False except errors.HeaderParseError: - if value[0] not in CFWS_LEADER: + if value[start] not in CFWS_LEADER: raise - token, value = get_cfws(value) + # There will be a 'dot' defect; no need for no-word defect here. 
+ token, start = get_cfws(value, start) obs_local_part.append(token) if not obs_local_part: raise errors.HeaderParseError( @@ -2055,16 +2070,16 @@ def get_obs_local_part(value): len(obs_local_part) > 1 and obs_local_part[1].token_type=='dot'): obs_local_part.defects.append(errors.InvalidHeaderDefect( - "Invalid leading '.' in local part")) + "Invalid leading '.' in local-part")) if (obs_local_part[-1].token_type == 'dot' or obs_local_part[-1].token_type=='cfws' and len(obs_local_part) > 1 and obs_local_part[-2].token_type=='dot'): obs_local_part.defects.append(errors.InvalidHeaderDefect( - "Invalid trailing '.' in local part")) + "Invalid trailing '.' in local-part")) if obs_local_part.defects: obs_local_part.token_type = 'invalid-obs-local-part' - return obs_local_part, value + return obs_local_part, start def get_local_part(value): """ local-part = dot-atom / quoted-string / obs-local-part diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 2fa6bca7e63e61a..4f73efcbd69db50 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -168,17 +168,17 @@ def charset_defect(chars): trailing_dot_in_local_part_defect = ( errors.InvalidHeaderDefect, - "invalid trailing '.' in local part", + "invalid trailing '.' in local-part", ) leading_dot_in_local_part_defect = ( errors.InvalidHeaderDefect, - "invalid leading '.' in local part", + "invalid leading '.' in local-part", ) repeated_dot_in_local_part_defect = ( errors.InvalidHeaderDefect, - "invalid repeated '.'", + "invalid repeated '.' in local-part", ) misplaced_backslash_defect = ( @@ -3960,7 +3960,7 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): # not what it does with non-obs syntax. Anything else is "don't care". # The 'local_part' specs are checked by the get_local_part tests, since the # token list returned by get_obs_local_part doesn't have that attribute. 
- params_test_get_obs_local_part = old_api_only( + params_test_get_obs_local_part = for_each_api( simple_obsolete = C( 'Fred. A.Johnson@python.org', @@ -4119,8 +4119,7 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): missing_dot_in_local_part_defect, ew_inside_quoted_string_defect, ], - # XXX XXX second index will change during refactor - ew_indexes=[0, 1], + ew_indexes=[0, 17], ), less_invalid_ew_atoms = C( @@ -4129,8 +4128,7 @@ def test_get_obs_local_part(self, s, *args, local_part=None, **kw): value="foo . bar .bird", local_part="foo . bar.bird", comments=['test'], - # XXX XXX the indexes will change during refactor - ew_indexes=[0, 2, 20], + ew_indexes=[0, 20, 38], ), ) From 87c85abff9377638cfa236fa57cab63b9ae95df5 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 5 Jun 2026 11:25:37 -0400 Subject: [PATCH 152/152] Refactor get_local_part. BUGFIX: get_local_part now correctly registers defects if encoded words are found in the local part. --- Lib/email/_header_value_parser.py | 51 ++++--- .../test_email/test__header_value_parser.py | 142 +++++++++--------- 2 files changed, 108 insertions(+), 85 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ec91dcb28eb0c9c..ab652d945101ca1 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2081,40 +2081,55 @@ def get_obs_local_part(value, start): obs_local_part.token_type = 'invalid-obs-local-part' return obs_local_part, start -def get_local_part(value): +@_deprecate_old_api +def get_local_part(value, start): """ local-part = dot-atom / quoted-string / obs-local-part """ local_part = LocalPart() - orig_value = value + vlen = len(value) leader = None - if value and value[0] in CFWS_LEADER: - leader, value = get_cfws(value) - if not value: + if start < vlen and value[start] in CFWS_LEADER: + leader, start = get_cfws(value, start) + text_start = start + if start >= vlen: raise errors.HeaderParseError( 
"expected local-part but found '{}'".format(value)) try: - token, value = get_dot_atom(value) + token, start = get_dot_atom(value, start) except errors.HeaderParseError: try: - token, value = get_word(value) + token, start = get_word(value, start) except errors.HeaderParseError: - if value[0] != '\\' and value[0] in PHRASE_ENDS: + if value[start] != '\\' and value[start] in PHRASE_ENDS: + # XXX XXX should this be a separate message mentioning + # both dot atom and word? raise token = TokenList() - if leader is not None: - token[:0] = [leader] - local_part.append(token) - if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): - obs_local_part, value = get_obs_local_part(orig_value) - if obs_local_part.token_type == 'invalid-obs-local-part': + if start < vlen and (value[start]=='\\' or value[start] not in PHRASE_ENDS): + # Even if we started with valid text there is more, so start over as obs + token, start = get_obs_local_part(value, text_start) + if token.token_type == 'invalid-obs-local-part': local_part.defects.append(errors.InvalidHeaderDefect( "local-part is not dot-atom, quoted-string, or obs-local-part")) else: - local_part.defects.append(errors.ObsoleteHeaderDefect( - "local-part is not a dot-atom (contains CFWS)")) - local_part[0] = obs_local_part - return local_part, value + local_part.defects.append( + errors.ObsoleteHeaderDefect( + "local-part is not a valid dot-atom" + " (it contains internal CFWS)" + ) + ) + if leader is not None: + token.push(leader) + local_part.append(token) + if local_part.ew_indexes: + # XXX some day we'll put each index into its own defect. 
+ local_part.defects.extend( + [ + errors.InvalidHeaderDefect('encoded-word in local-part'), + ] * len(local_part.ew_indexes) + ) + return local_part, start def get_dtext(value): r""" dtext = / obs-dtext diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 4f73efcbd69db50..a57ca188b40954b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -153,7 +153,7 @@ def charset_defect(chars): non_dot_atom_local_part_obs_defect = ( errors.ObsoleteHeaderDefect, - r'local-part is not a dot-atom \(contains CFWS\)', + r'local-part is not a valid dot-atom \(it contains internal CFWS\)', ) not_even_obs_local_part_defect = ( @@ -186,6 +186,11 @@ def charset_defect(chars): r"'\\' character outside of quoted-string/ccontent", ) +ew_in_local_part_defect = ( + errors.InvalidHeaderDefect, + 'encoded-word in local-part', + ) + # ---> End Defect Expectations @@ -4151,6 +4156,12 @@ def test_get_local_part(self, s, *args, local_part=None, **kw): ) self.assertEqual(lp.local_part, local_part) + @params_map + def add_ew_defects(*args, ew_indexes=[], defects=[], **kw): + if ew_indexes: + defects = defects + [ew_in_local_part_defect] * len(ew_indexes) + yield '', C(*args, ew_indexes=ew_indexes, defects=defects, **kw) + @params_map(with_namelist=True) def adapt_get_dot_atom_tests_for_get_local_part(nl, s, *args, **kw): r = kw.get('remainder') @@ -4165,10 +4176,6 @@ def adapt_get_dot_atom_tests_for_get_local_part(nl, s, *args, **kw): # For those two ew tests the blank comes from inside the ew. local_part = local_part.removeprefix(' ').removesuffix(' ') kw['local_part'] = local_part - # XXX XXX indexes won't be right mid-refactor, remove when - # get_local_part refactored. - if 'ew_indexes' in kw: - kw['ew_indexes'] = ... 
yield '', C(s, *args, **kw) @params_map @@ -4177,10 +4184,6 @@ def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): kw['value'] = kw.pop('quoted_value') if 'exception' not in kw: kw['local_part'] = kw.pop('content') - # XXX XXX indexes won't be right mid-refactor, remove when - # get_local_part refactored. - if 'ew_indexes' in kw: - kw['ew_indexes'] = ... yield '', C(*args, **kw) @params_map @@ -4202,75 +4205,79 @@ def adapt_get_obs_local_part_tests_for_get_local_part( defects.append(not_even_obs_local_part_defect) else: defects.append(non_dot_atom_local_part_obs_defect) - # XXX XXX indexes won't be right mid-refactor, remove when - # get_local_part refactored. - if 'ew_indexes' in kw: - kw['ew_indexes'] = ... yield '', C(*args, defects=defects, **kw) - params_test_get_local_part = old_api_only( + params_test_get_local_part = for_each_api( # An RFC compliant local part can be a dot atom or a quoted string, so # it should pass some of the tests for those. - adapt_get_dot_atom_tests_for_get_local_part( - include_unless( - lambda n, *a, **k: - n.has_any( - # Get local part handles multiple atoms. - 'two_ew_two_atoms', - 'atom_ends_at_noncfws', - # There are some things get_dot_atom raises for that - # get_local_part treats as obs-local-part. - 'two_dots_raises', - 'trailing_dot_raises', - 'space_ends_dot_atom', - # XXX XXX These need a logic fix to whitespace handling - # in get_local_part itself. - 'ew_and_comments_no_ws', - 'ew_and_empty_comments_no_ws', - ) - or - # get_local_part handles quoted strings (tested above), - # and leading dots or \ are handled as obs-local-part. + add_ew_defects( + adapt_get_dot_atom_tests_for_get_local_part( + include_unless( + lambda n, *a, **k: n.has_any( - 'up_to_special', - 'leading_special_raises', - 'no_atom_before_special', - 'no_atext_before_special_or_wsp', - 'atom_ends_at_special', - 'ends_at_special_after_comment', - 'ends_at_special', + # Get local part handles multiple atoms. 
+ 'two_ew_two_atoms', + 'atom_ends_at_noncfws', + # There are some things get_dot_atom raises for + # that get_local_part treats as obs-local-part. + 'two_dots_raises', + 'trailing_dot_raises', + 'space_ends_dot_atom', + # XXX XXX These need a logic fix to whitespace + # handling in get_local_part itself. + 'ew_and_comments_no_ws', + 'ew_and_empty_comments_no_ws', ) - and n.has_any( - 'reverse_solidus', - 'quotation_mark', - 'full_stop', - ), - label='from_test_get_dot_atom', - )(params_test_get_dot_atom), + or + # get_local_part handles quoted strings (tested + # above), and leading dots or \ are handled as + # obs-local-part. + n.has_any( + 'up_to_special', + 'leading_special_raises', + 'no_atom_before_special', + 'no_atext_before_special_or_wsp', + 'atom_ends_at_special', + 'ends_at_special_after_comment', + 'ends_at_special', + ) + and n.has_any( + 'reverse_solidus', + 'quotation_mark', + 'full_stop', + ), + label='from_test_get_dot_atom', + )(params_test_get_dot_atom), + ), ), - adapt_get_quoted_string_tests_for_get_local_part( - include_unless( - lambda n, *a, **k: n.has_any( - # These tests have an atom first; get_quoted_string raises, - # but get_local_part parses the atom. Atoms are tested above. - 'no_quoted_string', - 'no_leading_dquote_before_non_ws', - # A local part only ends at specials other than " and . - 'qs_ends_at_noncfws', - 'ew_after_dquote', - 'encoded_word_after_dquote_with_no_ws', - 'end_dquote_mid_word', - ), - label='from_test_get_quoted_string', - )(params_test_get_quoted_string), + add_ew_defects( + adapt_get_quoted_string_tests_for_get_local_part( + include_unless( + lambda n, *a, **k: n.has_any( + # These tests have an atom first; get_quoted_string + # raises, but get_local_part parses the atom. Atoms + # are tested above. + 'no_quoted_string', + 'no_leading_dquote_before_non_ws', + # A local part only ends at specials other than " and . 
+ 'qs_ends_at_noncfws', + 'ew_after_dquote', + 'encoded_word_after_dquote_with_no_ws', + 'end_dquote_mid_word', + ), + label='from_test_get_quoted_string', + )(params_test_get_quoted_string), + ), ), - add_label('from_test_get_obs_local_part')( - adapt_get_obs_local_part_tests_for_get_local_part( - params_test_get_obs_local_part, + add_ew_defects( + add_label('from_test_get_obs_local_part')( + adapt_get_obs_local_part_tests_for_get_local_part( + params_test_get_obs_local_part, + ), ), ), @@ -4357,7 +4364,7 @@ def adapt_get_obs_local_part_tests_for_get_local_part( # XXX XXX there should be exactly one missing whitespace here, # but it will change until we refactor get_local_part. #missing_whitespace_after_ew_defect, - # XXX XXX there should be a defect for there being an EW at all. + ew_in_local_part_defect, ], local_part='exámple', ew_indexes=[0], @@ -4376,6 +4383,7 @@ def adapt_get_obs_local_part_tests_for_get_local_part( missing_whitespace_before_ew_defect, missing_whitespace_after_ew_defect, # XXX XXX There should also be an ew in local part defect. + *[ew_in_local_part_defect]*2, ], ew_indexes=[0, 17], ),