diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py index 05a34a4c1052336..65a6ddd2386f0ff 100644 --- a/Lib/email/_encoded_words.py +++ b/Lib/email/_encoded_words.py @@ -163,8 +163,8 @@ def decode(ew): the encoded_string decoded first from its Content Transfer Encoding and then from the resulting bytes into unicode using the specified charset. If the cte-decoded string does not successfully decode using the specified - character set, a defect is added to the defects list and the unknown octets - are replaced by the unicode 'unknown' character \\uFDFF. + character set, a defect is added to the defects list. If the charset + is invalid or not found, a defect is added to the defects list. The specified charset and language are returned. The default for language, which is rarely if ever encountered, is the empty string. @@ -172,6 +172,20 @@ def decode(ew): """ _, charset, cte, cte_string, _ = ew.split('?') charset, _, lang = charset.partition('*') + string, defects = _decode(charset, cte, cte_string) + return string, charset, lang, defects + + +def _decode(charset, cte, cte_string): + """Return cte_string decoded using cte and charset and a list of defects. + + Use cte to turn cte_string into bytes, then decode those bytes using + charset and the surrogateescape error handler. Return a possibly empty + list of defects: return a CharsetError if the charset name is invalid or + unknown, and an UndecodableBytesDefect if there are any bytes the charset + cannot decode. + + """ cte = cte.lower() # Recover the original bytes and do CTE decoding. bstring = cte_string.encode('ascii', 'surrogateescape') @@ -184,11 +198,13 @@ def decode(ew): f"contains bytes not decodable using {charset!r} charset")) string = bstring.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): + # In this context a UnicodeEncodeError results when the charset name is + # not a valid ASCII string. 
string = bstring.decode('ascii', 'surrogateescape') if charset.lower() != 'unknown-8bit': defects.append(errors.CharsetError(f"Unknown charset {charset!r} " f"in encoded word; decoded as unknown bytes")) - return string, charset, lang, defects + return string, defects _cte_encoders = { diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 792072ab9f6128a..ab652d945101ca1 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -70,20 +70,25 @@ import re import sys import urllib # For urllib.parse.unquote -from string import hexdigits from operator import itemgetter from email import _encoded_words as _ew from email import errors from email import utils +from functools import partial, wraps # # Useful constants and functions # +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 _WSP = ' \t' WSP = set(_WSP) +# This isn't an RFC concept but is useful for parsing. CFWS_LEADER = WSP | set('(') +# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 SPECIALS = set(r'()<>@,:;.\"[]') +# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 +# These are the characters that *can't* appear in an atom/dot-atom (non-atext). ATOM_ENDS = SPECIALS | WSP DOT_ATOM_ENDS = ATOM_ENDS - set('.') # '.', '"', and '(' do not end phrases in order to support obs-phrase @@ -114,7 +119,7 @@ def quote_string(value): # Match a RFC 2047 word, looks like =?utf-8?q?someword?= -rfc2047_matcher = re.compile(r''' +_deprecated_rfc2047_matcher = re.compile(r''' =\? # literal =? [^?]* # charset \? # literal ? 
@@ -138,6 +143,7 @@ class TokenList(list): def __init__(self, *args, **kw): super().__init__(*args, **kw) self.defects = [] + self.ew_indexes = [] def __str__(self): return ''.join(str(x) for x in self) @@ -146,6 +152,23 @@ def __repr__(self): return '{}({})'.format(self.__class__.__name__, super().__repr__()) + def append(self, value): + super().append(value) + if hasattr(value, 'ew_indexes'): + self.ew_indexes += value.ew_indexes + + def push(self, value): + super().insert(0, value) + if hasattr(value, 'ew_indexes'): + self.ew_indexes[:0] = value.ew_indexes + + def extend(self, value): + super().extend(value) + if hasattr(value, 'defects'): + self.defects.extend(value.defects) + if hasattr(value, 'ew_indexes'): + self.ew_indexes += value.ew_indexes + @property def value(self): return ''.join(x.value for x in self if x.value) @@ -155,7 +178,10 @@ def all_defects(self): return sum((x.all_defects for x in self), self.defects) def startswith_fws(self): - return self[0].startswith_fws() + return self and self[0].startswith_fws() + + def endswith_fws(self): + return self and self[-1].endswith_fws() as_ew_allowed = True @@ -946,6 +972,9 @@ def value(self): def startswith_fws(self): return self and self[0] in WSP + def endswith_fws(self): + return self and self[-1] in WSP + class ValueTerminal(Terminal): @@ -956,6 +985,9 @@ def value(self): def startswith_fws(self): return False + def endswith_fws(self): + return False + class EWWhiteSpaceTerminal(WhiteSpaceTerminal): @@ -967,7 +999,7 @@ def __str__(self): return '' -class _InvalidEwError(errors.HeaderParseError): +class _deprecated__InvalidEwError(errors.HeaderParseError): """Invalid encoded word found while parsing headers.""" @@ -980,31 +1012,146 @@ class _InvalidEwError(errors.HeaderParseError): ListSeparator.syntactic_break = False RouteComponentMarker = ValueTerminal('@', 'route-component-marker') + +# XXX POSTDEP: Remove from here... +# +# Temporary backward compatibility and deprecation support. 
Although this is +# an internal module and not a public API, and therefore we *will* eventually +# remove the backward compatibility support, we're still doing backward +# compatibility to minimize disruption for anyone who made use of these +# internal APIs. +# + +OLDAPIREMVER = (3, 18) + +_REPLACED_NAMES = dict( + ) + +def __getattr__(name): + from warnings import _deprecated, _DEPRECATED_MSG + if f'_deprecated_{name}' not in globals(): + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + if name in _REPLACED_NAMES: + _deprecated( + name, + _DEPRECATED_MSG + f", try {_REPLACED_NAMES[name]!r} instead", + remove=OLDAPIREMVER, + ) + else: + _deprecated(name, remove=OLDAPIREMVER) + return globals()[f'_deprecated_{name}'] + +def _replaced_with(name): + def _(func): + _REPLACED_NAMES[func.__name__.removeprefix('_deprecated_')] = name + return func + return _ + +_API_CHANGE_MSG = ( + "The API of the internal function {name!r} has changed; the backward" + " compatibility wrapper will be removed in {remove}" + ) + +def _deprecate_old_api(func): + @wraps(func) + def dispatch(value, *args, **kw): + if args and isinstance(args[0], int): + return func(value, *args, **kw) + # The runtime error is going to say the function should be removed, but + # it's only the decorator that needs to be removed. + from warnings import _deprecated + _deprecated(func.__name__, _API_CHANGE_MSG, remove=OLDAPIREMVER) + result, start, *other = func(value, 0, *args, **kw) + return result, value[start:], *other + return dispatch + +# A specialized deprecation for some functions that should be raising +# errors when handed input that is empty or doesn't contain the expected +# value, but currently return an empty object instead. The return signature +# of the wrapped function must be either (result, start) or (result, start, +# exception, warning). 
If present, 'exception' will be raised from the new +# api, and 'warning' will be passed to 'warn' as a DeprecationWarning for +# the old api. +def _deprecate_old_api_and_lack_of_raise_on_invalid_input(func): + @wraps(func) + def dispatch(value, *args, **kw): + if args and isinstance(args[0], int): + result, start, *error = func(value, *args, **kw) + if error: + raise error[0] + return result, start + from warnings import _deprecated, warn + _deprecated(func.__name__, _API_CHANGE_MSG, remove=OLDAPIREMVER) + result, start, *error = func(value, 0, *args, **kw) + if error: + warn(error[1], DeprecationWarning, stacklevel=2) + return result, value[start:] + return dispatch + +# XXX XXX By the end of the refactoring, calls to _deprecate will be replaced by +# renaming the functions with _deprecated_ in front and adding any new names to +# _REPLACED_NAMES. The deprecation testing will need to be adjusted. This +# decorator should not exist in the final version of the branch. + +from functools import singledispatch +from collections.abc import Callable + +def __deprecate(msg, new_name=None): + def _(func): + @wraps(func) + def deprecate(*args, **kw): + from warnings import _deprecated + _deprecated(func.__name__, msg, remove=OLDAPIREMVER) + return func(*args, **kw) + return deprecate + return _ + +@singledispatch +def _deprecate(new_name): + from warnings import _DEPRECATED_MSG + return __deprecate(_DEPRECATED_MSG + f", try {new_name} instead") + +@_deprecate.register(Callable) +def _(func): + from warnings import _DEPRECATED_MSG + return __deprecate(_DEPRECATED_MSG)(func) + +# XXX POSTDEP: ...to here. + + # # Parser # # Parse strings according to RFC822/2047/2822/5322 rules. # -# This is a stateless parser. Each get_XXX function accepts a string and -# returns either a Terminal or a TokenList representing the RFC object named -# by the method and a string containing the remaining unparsed characters -# from the input. 
Thus a parser method consumes the next syntactic construct -# of a given type and returns a token representing the construct plus the -# unparsed remainder of the input string. +# This is a stateless parser. Each get_XXX function accepts a string and a +# starting position and returns either a Terminal or a TokenList representing +# the RFC (or local concept) object named by the method and a pointer to +# remaining unparsed characters in the string. Thus a parser method consumes +# the next syntactic construct of a given type and returns a token representing +# the construct plus a pointer to the unparsed remainder of the input string. # # For example, if the first element of a structured header is a 'phrase', # then: # -# phrase, value = get_phrase(value) +# phrase, rest = get_phrase(value, start) # -# returns the complete phrase from the start of the string value, plus any -# characters left in the string after the phrase is removed. +# returns a complete 'phrase' from 'start' to 'rest' in the value. + +# Often used Defects. XXX These could become subclasses. 
-_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split -_non_atom_end_matcher = re.compile(r"[^{}]+".format( +_MissingWhitespaceBeforeEWDefect = errors.InvalidHeaderDefect( + "missing whitespace before encoded-word", + ) + +_MissingWhitespaceAfterEWDefect = errors.InvalidHeaderDefect( + "missing whitespace after encoded-word", + ) + +_deprecated__wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split +_deprecated__non_atom_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(ATOM_ENDS)))).match -_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall _non_token_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(TOKEN_ENDS)))).match _non_attribute_end_matcher = re.compile(r"[^{}]+".format( @@ -1012,6 +1159,25 @@ class _InvalidEwError(errors.HeaderParseError): _non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match +# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2 for non_printable. +_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall +def _make_xtext(text, terminal_class, token_type): + """Return text wrapped in terminal_class of token_type, with defects if any. + + If text contains non-printable ASCII or undecodable bytes, add those + defects to the returned terminal_class object. 
+ + """ + vt = terminal_class(text, token_type=token_type) + non_printables = _non_printable_finder(text) + if non_printables: + vt.defects.append(errors.NonPrintableDefect(non_printables)) + if utils._has_surrogates(text): + vt.defects.append(errors.UndecodableBytesDefect( + "Non-ASCII characters found in header token")) + return vt + +@_deprecate('_get_xtext') def _validate_xtext(xtext): """If input token contains ASCII non-printables, register a defect.""" @@ -1022,6 +1188,23 @@ def _validate_xtext(xtext): xtext.defects.append(errors.UndecodableBytesDefect( "Non-ASCII characters found in header token")) +# _make_non_match_re is for use by the callers of _get_xtext. +_make_non_match_re = lambda s: re.compile(rf'[^{re.escape(s)}]+') +def _get_xtext(value, start, regex, terminal_class, token_type, err=None): + """Return text matching regex via _make_xtext, raise err if no match. + + Use the regex 'match' to identify a substring. If there is no match, raise + err. If there is a match, pass it to _make_xtext to create a + terminal_class of terminal_type. Return the terminal and the index of the + end of the match. + + """ + m = regex.match(value, start) + if m is None: + raise err + return _make_xtext(m.group(), terminal_class, token_type), m.end() + +@_deprecate('content_getter') def _get_ptext_to_endchars(value, endchars): """Scan printables/quoted-pairs until endchars and return unquoted ptext. 
@@ -1033,7 +1216,7 @@ def _get_ptext_to_endchars(value, endchars): """ if not value: return '', '', False - fragment, *remainder = _wsp_splitter(value, 1) + fragment, *remainder = _deprecated__wsp_splitter(value, 1) vchars = [] escape = False had_qp = False @@ -1047,6 +1230,7 @@ def _get_ptext_to_endchars(value, endchars): continue if escape: escape = False + had_qp = True elif fragment[pos] in endchars: break vchars.append(fragment[pos]) @@ -1054,135 +1238,310 @@ def _get_ptext_to_endchars(value, endchars): pos = pos + 1 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp -def get_fws(value): +_wsp_matcher = re.compile(fr'[{_WSP}]+').match +@_deprecate_old_api_and_lack_of_raise_on_invalid_input +def get_fws(value, start): """FWS = 1*WSP - This isn't the RFC definition. We're using fws to represent tokens where - folding can be done, but when we are parsing the *un*folding has already - been done so we don't need to watch out for CRLF. + If start does not point to a WSP character in value, raise a HeaderParse + error. Otherwise, return a WhiteSpaceTerminal of token_type 'fws' + containing all of the WSP characters from start to the next non-WSP + character (or the end of value), and the index of the non-WSP character (or + the len of value). - """ - newvalue = value.lstrip() - fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') - return fws, newvalue + This is a subset of the RFC 5322 definition of FWS: the strings passed to + the parser should already have been unfolded, so there should be no + legitimate CRLF characters in value. -def get_encoded_word(value, terminal_type='vtext'): + """ + m = _wsp_matcher(value, start) + if m is None: + # XXX POSTDEP: change this to raise the exception. 
+ return ( + WhiteSpaceTerminal('', 'fws'), + start, + errors.HeaderParseError( + f'expected whitespace but found {value[start:]!r}' + ), + ( + "Calling get_fws when there is no whitespace at the start" + " is deprecated and will raise an error in the future." + ), + ) + fws = WhiteSpaceTerminal(m.group(), 'fws') + return fws, m.end() + +# We need a custom deprecation for this one because we want terminal_type to be +# required, a return of None instead of exceptions, and for the trailing +# whitespace defect addition to move elsewhere. +def _deprecate_old_encoded_word_api(func): + @wraps(func) + def dispatch(value, *args, **kw): + if args and isinstance(args[0], int): + return func(value, *args, **kw) + from warnings import _deprecated + _deprecated(func.__name__, _API_CHANGE_MSG, remove=OLDAPIREMVER) + kw.setdefault('terminal_type', args[0] if args else 'vtext') + result = func(value, 0, **kw) + if result is None: + raise _deprecated__InvalidEwError( + f"expected encoded word but found {value}", + ) + result, start = result + ew, value = result, value[start:] + if value and value[0] not in WSP: + ew.defects.append(_MissingWhitespaceAfterEWDefect) + return ew, value + return dispatch + +# This match is generous; defects are detected during ew parsing. +_ew_finder = re.compile(r''' + =\? # literal =? + ( # We might have 'charset' or 'charset*lang' next. + ( # First case: no * + (?P<csnolang>[^?*]*?) # non-greedy to next ? if no * is the charset + \? # literal ? + ) + | + ( # Second case: charset*lang + (?P<cslang>[^?*]*?) # non-greedy to * is the charset + \* # literal * + (?P<lang>[^?]*?) # non-greedy to next ? is the lang + \? # literal ? + ) + ) + (?P<cte>[^?]*?) # non-greedy up to the next ? is the CTE + \? # literal ? + (?P<encoded>.*?) 
# non-greedy to next ?= is the encoded string + \?= # literal ?= + ''', re.VERBOSE | re.DOTALL).match +_wsp_finder = re.compile(rf'[{_WSP}]+').search +_non_wsp_re = _make_non_match_re(_WSP) +@_deprecate_old_encoded_word_api +def get_encoded_word(value, start, terminal_type, *, decode_qp=False): """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" + If something interpretable as an encoded word occurs starting at start, + return an EncodedWord token list with the decoded text decomposed into + whitespace and non-whitespace value terminals, and the index of the last + character of the encoded word (the '=') plus one. Register a defect if + there is un-encoded whitespace inside the encoded word, and register + defects for any non-printable or invalid characters in the + non-whitespace ValueTerminals. + + If decode_qp is True, decode any quoted pairs in the payload of the encoded + word before decoding. + + If the characters starting at start are not interpretable as an encoded + word such that it can be decoded from the content transfer encoding, return + None. + """ + ew_match = _ew_finder(value, start) + if ew_match is None: + return ew = EncodedWord() - if not value.startswith('=?'): - raise errors.HeaderParseError( - "expected encoded word but found {}".format(value)) - tok, *remainder = value[2:].split('?=', 1) - if tok == value[2:]: - raise errors.HeaderParseError( - "expected encoded word but found {}".format(value)) - remstr = ''.join(remainder) - if (len(remstr) > 1 and - remstr[0] in hexdigits and - remstr[1] in hexdigits and - tok.count('?') < 2): - # The ? after the CTE was followed by an encoded word escape (=XX). 
- rest, *remainder = remstr.split('?=', 1) - tok = tok + '?=' + rest - if len(tok.split()) > 1: - ew.defects.append(errors.InvalidHeaderDefect( - "whitespace inside encoded word")) - ew.cte = value - value = ''.join(remainder) + csnolang, cslang, lang, cte, encoded = ew_match.group( + 'csnolang', 'cslang', 'lang', 'cte', 'encoded') + charset, lang = cslang or csnolang or '', lang or '' + ew.charset = charset.strip() + ew.lang = lang.strip() + encoded, _ = _qp_unquote(encoded) if decode_qp else (encoded, 0) try: - text, charset, lang, defects = _ew.decode('=?' + tok + '?=') - except (ValueError, KeyError): - raise _InvalidEwError( - "encoded word format invalid: '{}'".format(ew.cte)) - ew.charset = charset - ew.lang = lang + text, defects = _ew._decode(ew.charset, cte, encoded) + except KeyError: + # With an unknown CTE we can't decode the content. We could just + # return it, but that would be less clear than leaving the ew alone. + return None + if any(isinstance(x, errors.InvalidBase64LengthDefect) for x in defects): + return None ew.defects.extend(defects) - while text: - if text[0] in WSP: - token, text = get_fws(text) + if _wsp_finder(ew_match.group()): + ew.defects.append(errors.InvalidHeaderDefect( + "whitespace inside encoded-word")) + tptr, tlen = 0, len(text) + while tptr < tlen: + if text[tptr] in WSP: + token, tptr = get_fws(text, tptr) ew.append(token) continue - chars, *remainder = _wsp_splitter(text, 1) - vtext = ValueTerminal(chars, terminal_type) - _validate_xtext(vtext) - ew.append(vtext) - text = ''.join(remainder) - # Encoded words should be followed by a WS - if value and value[0] not in WSP: - ew.defects.append(errors.InvalidHeaderDefect( - "missing trailing whitespace after encoded-word")) - return ew, value + t, tptr = _get_xtext( + text, + tptr, + _non_wsp_re, + ValueTerminal, + terminal_type, + ) + ew.append(t) + return ew, ew_match.end() + +# In theory encoded words should only appear in certain places. 
In +# practice they tend to appear anywhere "normal text" tokens appear. This +# outside-the-rfc-grammar function-generator provides the tools to handle that. +_make_content_re = lambda s: re.compile(rf'[^{re.escape(s)}]*') +_make_qp_content_re = lambda s: re.compile( rf"([^{re.escape(s)}\\]|\\.)*") +_qp_finder = re.compile(r'\\(.)') +_qp_unquote = lambda s: _qp_finder.subn(r'\1', s) +def content_getter( + tl_class, + text_type, + end_chars='', + qp=False, + ): + """Return a function that can be used to parse up to certain end chars. + + The returned function has the following contract: + + new_function(value, start) + + Return a token list containing decoded text tokens and WSP. + + Process value from start until the first occurrence of any of the + characters in the iterable end_chars, breaking it up into whitespace and + non-whitespace tokens, and decoding encoded words wherever they are found + regardless of whitespace. Return the resulting list of tokens in an + instance of tl_class and the index of whichever end_char was found first + (or the len of value if none were found). Decoded encoded words should be + EncodedWord token lists, non-encoded word tokens should be of type + ValueTerminal with a token_type text_type, and whitespace tokens should be + WhiteSpaceTerminals or EWWhiteSpaceTerminals, as appropriate. + + Encoded word detection should take precedence over end_chars detection: an + end_char inside an encoded word should be treated as part of the encoded + word content rather than ending the processing. + + If qp is true, ignore end characters that are part of quoted pairs when + looking for the end of the parsable text, and unquote any quoted pairs in + the parsed text. + + If an encoded word is found, append its starting index to the `ew_indexes` + attribute of the returned token list. 
-def get_unstructured(value): + """ + end_chars = ''.join(list(end_chars)) + if qp: + pre_ew_re = _make_qp_content_re(end_chars + _WSP + '=') + post_ew_re = _make_qp_content_re(end_chars + _WSP) + else: + pre_ew_re = _make_content_re(end_chars + _WSP + '=') + post_ew_re = _make_content_re(end_chars + _WSP) + return partial( + _get_content, + tl_class=tl_class, + text_type=text_type, + qp=qp, + end_chars=end_chars, + pre_ew_re=pre_ew_re, + post_ew_re=post_ew_re, + ) + +def _get_content( + value, + start=0, + *, + tl_class, + text_type, + pre_ew_re, + post_ew_re, + end_chars, + qp, + ): + tl = tl_class() + vlen = len(value) + while start < vlen: + if value[start] in end_chars: + break + if value[start] in WSP: + token, start = get_fws(value, start) + tl.append(token) + continue + ew = None + m = pre_ew_re.match(value, start) + end = m.end() + if end < vlen: + if value[end] == '=': + res = get_encoded_word(value, end, text_type, decode_qp=qp) + if res: + # XXX save the index; some day the defects will use it + tl.ew_indexes.append(end) + ew, end = res + else: + m = post_ew_re.match(value, start) + ew, end = None, m.end() + text = m.group() + # At this point we have text, an ew, or both; we can't have neither. 
+ if tl and tl[-1].token_type == 'encoded-word': + tl.defects.append(_MissingWhitespaceAfterEWDefect) + if text: + text, _ = _qp_unquote(text) if qp else (text, 0) + tl.append(_make_xtext(text, ValueTerminal, text_type)) + if ew: + if tl: + if tl[-1].token_type == 'fws': + if len(tl) > 1 and tl[-2].token_type == 'encoded-word': + tl[-1] = EWWhiteSpaceTerminal(tl[-1], 'fws') + else: + tl.defects.append(_MissingWhitespaceBeforeEWDefect) + tl.append(ew) + start = end + return tl, start + +_get_unstructured_content = content_getter(UnstructuredTokenList, 'utext') +def parse_unstructured(value): """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS) obs-utext = %d0 / obs-NO-WS-CTL / LF / CR + obs-NO-WS-CTL = <control characters except WSP/CR/LF> - obs-NO-WS-CTL is control characters except WSP/CR/LF. + Return an UnstructuredTokenList containing whitespace and non-whitespace + tokens obtained from value, decoding any encoded words found, regardless of + whitespace, into EncodedWord token lists. Register defects if the encoded + words are not correctly surrounded by whitespace or the ends of the value + or have internal whitespace. Register defects if the non-whitespace tokens + contain any non-printable or invalid characters. All ValueTerminals + should have the token_type 'utext'. - So, basically, we have printable runs, plus control characters or nulls in - the obsolete syntax, separated by whitespace. Since RFC 2047 uses the - obsolete syntax in its specification, but requires whitespace on either - side of the encoded words, I can see no reason to need to separate the - non-printable-non-whitespace from the printable runs if they occur, so we - parse this into xtext tokens separated by WSP tokens. + """ + # We don't actually handle CR or LF in obs, instead we treat them as a + # non-printable defect. Normally they won't even appear in value, since + # the code that calls the parser will have done header unfolding. 
+ unstructured, _ = _get_unstructured_content(value, 0) + return unstructured - Because an 'unstructured' value must by definition constitute the entire - value, this 'get' routine does not return a remaining value, only the - parsed TokenList. +@_deprecate('parse_unstructured') +def get_unstructured(value): + return parse_unstructured(value) - """ - # XXX: but what about bare CR and LF? They might signal the start or - # end of an encoded word. YAGNI for now, since our current parsers - # will never send us strings with bare CR or LF. +_get_ccontent_content = content_getter( + TokenList, + 'ptext', + end_chars='()', + qp=True, + ) +def get_ccontent_sequence(value, start): + """ccontent_sequence = *([FWS] qp_ctext / encoded_word [FWS]) - unstructured = UnstructuredTokenList() - while value: - if value[0] in WSP: - token, value = get_fws(value) - unstructured.append(token) - continue - valid_ew = True - if value.startswith('=?'): - try: - token, value = get_encoded_word(value, 'utext') - except _InvalidEwError: - valid_ew = False - except errors.HeaderParseError: - # XXX: Need to figure out how to register defects when - # appropriate here. - pass - else: - have_ws = True - if len(unstructured) > 0: - if unstructured[-1].token_type != 'fws': - unstructured.defects.append(errors.InvalidHeaderDefect( - "missing whitespace before encoded word")) - have_ws = False - if have_ws and len(unstructured) > 1: - if unstructured[-2].token_type == 'encoded-word': - unstructured[-1] = EWWhiteSpaceTerminal( - unstructured[-1], 'fws') - unstructured.append(token) - continue - tok, *remainder = _wsp_splitter(value, 1) - # Split in the middle of an atom if there is a rfc2047 encoded word - # which does not have WSP on both sides. The defect will be registered - # the next time through the loop. - # This needs to only be performed when the encoded word is valid; - # otherwise, performing it on an invalid encoded word can cause - # the parser to go in an infinite loop. 
- if valid_ew and rfc2047_matcher.search(tok): - tok, *remainder = value.partition('=?') - vtext = ValueTerminal(tok, 'utext') - _validate_xtext(vtext) - unstructured.append(vtext) - value = ''.join(remainder) - return unstructured + This bridges the RFC ctext, ccontent, and comment into something that + makes recovery from errors in the input easier. + + Return a (possibly empty) TokenList containing all characters up to the + next unquoted open or close parenthesis outside of an encoded word (or the + end of value if there isn't one) and the index of that parenthesis (or the + len of value), unquoting any quoted pairs and decoding any encoded words. + All ValueTerminals returned should have the token_type 'ptext'. + + Encoded words should be decoded even if there is non-whitespace around + them, and whether or not they contain any RFC invalid whitespace. Register + defects for any internal or missing whitespace. + + Register defects if there are any non-printable or undecodable characters + in the non-whitespace tokens. -def get_qp_ctext(value): + """ + return _get_ccontent_content(value, start) + +@_replaced_with('get_ccontent_sequence') +def _deprecated_get_qp_ctext(value): r"""ctext = This is not the RFC ctext, since we are handling nested comments in comment @@ -1199,6 +1558,7 @@ def get_qp_ctext(value): _validate_xtext(ptext) return ptext, value +@_deprecate('get_bare_quoted_string') def get_qcontent(value): """qcontent = qtext / quoted-pair @@ -1214,13 +1574,45 @@ def get_qcontent(value): _validate_xtext(ptext) return ptext, value -def get_atext(value): +_get_atext_content = content_getter(TokenList, 'atext', end_chars=ATOM_ENDS) +def get_atext_sequence(value, start): + """atext = Printable US-ASCII characters not including specials + + This augments the RFC atext by handling encoded words at a level that makes + it easier to recover from errors in the input. 
+ + Return a TokenList containing all characters up to the next special or WSP + outside of an encoded word (or the end of value), and the index of the + special or WSP (or the len of value), decoding any encoded words. + + Raise a HeaderParseError if no characters are found before the special, + WSP, or end of value. + + Encoded words should be decoded even if there is non-whitespace around + them, and whether or not they contain any RFC invalid whitespace. Register + internal or missing whitespace defects. + + Register defects if there are any non-printable or undecodable characters + in the non-whitespace tokens. + + All ValueTerminals returned should have the type 'atext'. + + """ + atext, end = _get_atext_content(value, start) + if not atext: + raise errors.HeaderParseError( + f"expected atext but found {value[start:]!r}", + ) + return atext, end + +@_replaced_with('get_atext_sequence') +def _deprecated_get_atext(value): """atext = We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to the token's defects list if we find non-atext characters. """ - m = _non_atom_end_matcher(value) + m = _deprecated__non_atom_end_matcher(value) if not m: raise errors.HeaderParseError( "expected atext but found '{}'".format(value)) @@ -1230,323 +1622,445 @@ def get_atext(value): _validate_xtext(atext) return atext, value -def get_bare_quoted_string(value): +_get_bare_quoted_string_content = content_getter( + BareQuotedString, + 'ptext', + end_chars='"', + qp=True, + ) +@_deprecate_old_api +def get_bare_quoted_string(value, start): """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE - A quoted-string without the leading or trailing white space. Its - value is the text between the quote marks, with whitespace - preserved and quoted pairs decoded. + This is a subset of the RFC 5322 quoted-string: the quoted string without + any of the CFWS that might come before or after the '"'s. + + If start does not point to a double quote in value, raise an error. 
+ Otherwise return a (possibly empty) BareQuotedString incorporating all + characters up to the next unquoted double quote (or the end of value if + there is no double quote) and the index of the character after the double + quote (or the len of value), unquoting any quoted pairs. The returned + BareQuotedString should not contain any ValueTerminals for the double quote + marks, but when stringified the quotes should be added, whether the + trailing quote was present in value or not. If the trailing quote is not + present register a defect. + + If the content after quoted pair decoding contains any RFC 2047 encoded + words, decode them, whether they are correctly bracketed by whitespace + or not, and whether they contain internal whitespace or not. Register + a defect for the presence of any such word, as well as defects for + any whitespace issues. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ - if not value or value[0] != '"': + # This implementation bypasses the RFC qcontent BNF element in favor of + # using our generic content_getter to decode (RFC invalid) encoded words. + vlen = len(value) + if start >= vlen or value[start] != '"': raise errors.HeaderParseError( - "expected '\"' but found '{}'".format(value)) - bare_quoted_string = BareQuotedString() - value = value[1:] - if value and value[0] == '"': - return bare_quoted_string, value[1:] - while value and value[0] != '"': - if value[0] in WSP: - token, value = get_fws(value) - elif value[:2] == '=?': - valid_ew = False - try: - token, value = get_encoded_word(value) - bare_quoted_string.defects.append(errors.InvalidHeaderDefect( - "encoded word inside quoted string")) - valid_ew = True - except errors.HeaderParseError: - token, value = get_qcontent(value) - # Collapse the whitespace between two encoded words that occur in a - # bare-quoted-string. 
- if valid_ew and len(bare_quoted_string) > 1: - if (bare_quoted_string[-1].token_type == 'fws' and - bare_quoted_string[-2].token_type == 'encoded-word'): - bare_quoted_string[-1] = EWWhiteSpaceTerminal( - bare_quoted_string[-1], 'fws') - else: - token, value = get_qcontent(value) - bare_quoted_string.append(token) - if not value: - bare_quoted_string.defects.append(errors.InvalidHeaderDefect( - "end of header inside quoted string")) - return bare_quoted_string, value - return bare_quoted_string, value[1:] - -def get_comment(value): + f"expected '\"' but found {value[start:]!r}" + ) + start += 1 + bare_quoted_string, start = _get_bare_quoted_string_content(value, start) + if bare_quoted_string.ew_indexes: + # XXX some day we'll put each index into its own defect. + bare_quoted_string.defects.extend( + [ + errors.InvalidHeaderDefect('encoded-word inside quoted string'), + ] * len(bare_quoted_string.ew_indexes) + ) + if start < vlen: + return bare_quoted_string, start + 1 + bare_quoted_string.defects.append( + errors.InvalidHeaderDefect("end of header inside quoted string"), + ) + return bare_quoted_string, start + +@_deprecate_old_api +def get_comment(value, start): """comment = "(" *([FWS] ccontent) [FWS] ")" - ccontent = ctext / quoted-pair / comment + ccontent = ctext / quoted-pair / encoded_word / comment + + If start does not point to an open parenthesis, raise an error. Otherwise + return a (possibly empty) Comment that incorporates all characters up to + the corresponding close parenthesis (or the end of the value if there is no + corresponding close parenthesis) and the index to the character after that + closing parenthesis (or the len of input), unquoting any quoted printables, + and decoding any encoded words. The Comment should be a nested token list + structure containing any nested comments. 
The Comment should not contain + any ValueTerminals for the parentheses, but when stringified the + parentheses should be added, whether the trailing parenthesis was present + or not. If the trailing parenthesis is not present register a defect. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. - We handle nested comments here, and quoted-pair in our qp-ctext routine. """ - if value and value[0] != '(': + vlen = len(value) + if start >= vlen or value[start] != '(': raise errors.HeaderParseError( - "expected '(' but found '{}'".format(value)) + f"expected '(' but found {value[start:]!r}" + ) comment = Comment() - value = value[1:] - while value and value[0] != ")": - if value[0] in WSP: - token, value = get_fws(value) - elif value[0] == '(': - token, value = get_comment(value) + start += 1 + while start < vlen: + if (c := value[start]) == ")": + break + elif c == '(': + token, start = get_comment(value, start) + comment.append(token) else: - token, value = get_qp_ctext(value) - comment.append(token) - if not value: - comment.defects.append(errors.InvalidHeaderDefect( - "end of header inside comment")) - return comment, value - return comment, value[1:] - -def get_cfws(value): + tl, start = get_ccontent_sequence(value, start) + comment.extend(tl) + else: + comment.defects.append( + errors.InvalidHeaderDefect("end of header inside comment"), + ) + return comment, start + return comment, start + 1 + +@_deprecate_old_api_and_lack_of_raise_on_invalid_input +def get_cfws(value, start): """CFWS = (1*([FWS] comment) [FWS]) / FWS + Raise an error if start does not point to either whitespace or an open + parenthesis in value. Otherwise return a CFWSList containing any + whitespace or comments up to the next non-CFWS character outside of a + comment (or the end of value), and the index of that next character (or the + len of value). 
+ """ cfws = CFWSList() - while value and value[0] in CFWS_LEADER: - if value[0] in WSP: - token, value = get_fws(value) + vlen = len(value) + while start < vlen: + if (c := value[start]) in WSP: + token, start = get_fws(value, start) + elif c == '(': + token, start = get_comment(value, start) else: - token, value = get_comment(value) + break cfws.append(token) - return cfws, value + if not cfws: + # XXX POSTDEP: change this to raise the exception. + return ( + cfws, + start, + errors.HeaderParseError( + f'expected cfws but found {value[start:]!r}' + ), + ( + "Calling get_cfws when there is no whitespace or comment at" + " the start is deprecated and will raise an error in the" + " future." + ), + ) + return cfws, start + +@_deprecate_old_api +def get_quoted_string(value, start): + """quoted-string = [CFWS] bare-quoted-string [CFWS] + + Return a QuotedString containing the leading CFWSList (if any), the + BareQuotedString, and the trailing CFWSList (if any), plus the index of the + character after the parsed text (or the len of value if there is no text + left unparsed). + + If no bare-quoted-string is found raise a HeaderParseError. -def get_quoted_string(value): - """quoted-string = [CFWS] [CFWS] - - 'bare-quoted-string' is an intermediate class defined by this - parser and not by the RFC grammar. It is the quoted string - without any attached CFWS. 
""" quoted_string = QuotedString() - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) quoted_string.append(token) - token, value = get_bare_quoted_string(value) + token, start = get_bare_quoted_string(value, start) quoted_string.append(token) - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) quoted_string.append(token) - return quoted_string, value + return quoted_string, start -def get_atom(value): +@_deprecate_old_api +def get_atom(value, start): """atom = [CFWS] 1*atext [CFWS] - An atom could be an rfc2047 encoded word. + Return an Atom containing the leading and trailing CFWSList tokens + if appropriate, as well as ValueTerminals of token_type atext, containing + all characters up to the next SPECIAL character or the end of value, and a + pointer to the special or the len of value. + + Decode any encoded words, regardless of whitespace, registering defects + if the RFC required whitespace is missing. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ + # We decode encoded words mixed in to atext without whitespace to in-total + # comprise the body of the atom. This might qualify as a separate defect. 
atom = Atom() - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) atom.append(token) - if value and value[0] in ATOM_ENDS: + if start >= vlen or value[start] in ATOM_ENDS: raise errors.HeaderParseError( - "expected atom but found '{}'".format(value)) - if value.startswith('=?'): - try: - token, value = get_encoded_word(value) - except errors.HeaderParseError: - # XXX: need to figure out how to register defects when - # appropriate here. - token, value = get_atext(value) - else: - token, value = get_atext(value) - atom.append(token) - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + "expected atom but found '{}'".format(value[start:])) + tl, start = get_atext_sequence(value, start) + if (tl[0].token_type == 'encoded-word' + and atom and not atom[-1].endswith_fws() + ): + atom.defects.append(_MissingWhitespaceBeforeEWDefect) + atom.extend(tl) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) + if tl[-1].token_type == 'encoded-word' and not token.startswith_fws(): + atom.defects.append(_MissingWhitespaceAfterEWDefect) atom.append(token) - return atom, value + return atom, start + +@_deprecate_old_api +def get_dot_atom_text(value, start): + """ dot-atom-text = 1*atext *("." 1*atext) + + Return a DotAtomText containing all characters up to the next non-'.' + special or WSP outside of an encoded word or the end of value, and the + index of the special, WSP, or the len of value, decoding any encoded words. + All ValueTerminals returned should have the type 'atext'. '.' characters + should be returned as ValueTerminals of token_type 'dot'. + + Encoded words should be decoded even if there is non-whitespace around + them, and whether or not they contain any RFC invalid whitespace. Register + defects for any missing whitespace.
-def get_dot_atom_text(value): - """ dot-text = 1*atext *("." 1*atext) + Register defects if there are any non-printable or undecodable characters + in the non-whitespace tokens. """ + # The only legitimate way an encoded word can be in a dot-atom-text + # position is if it is the only thing there. Following our policy of + # generous decoding we accept them anywhere in the dot-atom-text. The only + # defects we're registering are the whitespace defects. An encoded word is + # legitimate here; it's the whitespace that's wrong. To get it right the + # text, including the dots, would end up inside the encoded word. dot_atom_text = DotAtomText() - if not value or value[0] in ATOM_ENDS: - raise errors.HeaderParseError("expected atom at a start of " - "dot-atom-text but found '{}'".format(value)) - while value and value[0] not in ATOM_ENDS: - token, value = get_atext(value) - dot_atom_text.append(token) - if value and value[0] == '.': + vlen = len(value) + if start >= vlen or value[start] in ATOM_ENDS: + raise errors.HeaderParseError( + f"expected atom at a start of dot-atom-text" + f" but found {value[start:]!r}" + ) + while start < vlen and value[start] not in ATOM_ENDS: + token, start = get_atext_sequence(value, start) + if token[0].token_type == 'encoded-word' and dot_atom_text: + dot_atom_text.defects.append(_MissingWhitespaceBeforeEWDefect) + dot_atom_text.extend(token) + if start < vlen and value[start] == '.': + if dot_atom_text[-1].token_type == 'encoded-word': + dot_atom_text.defects.append(_MissingWhitespaceAfterEWDefect) dot_atom_text.append(DOT) - value = value[1:] + start += 1 if dot_atom_text[-1] is DOT: - raise errors.HeaderParseError("expected atom at end of dot-atom-text " - "but found '{}'".format('.'+value)) - return dot_atom_text, value + raise errors.HeaderParseError( + f"expected atom at end of dot-atom-text" + f" but found {value[start-1:]!r}" + ) + return dot_atom_text, start -def get_dot_atom(value): +@_deprecate_old_api +def 
get_dot_atom(value, start): """ dot-atom = [CFWS] dot-atom-text [CFWS] - Any place we can have a dot atom, we could instead have an rfc2047 encoded - word. + Return a DotAtom containing leading and trailing CFWSList tokens, if + appropriate, as well as a DotAtomText token, containing all of the + characters up to the next SPECIAL character or the end of value, + and a pointer to the special or the len of value. + + Decode any encoded words, regardless of whitespace, registering defects + if the RFC required whitespace is missing. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. + """ dot_atom = DotAtom() - if value[0] in CFWS_LEADER: - token, value = get_cfws(value) + vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) dot_atom.append(token) - if value.startswith('=?'): - try: - token, value = get_encoded_word(value) - except errors.HeaderParseError: - # XXX: need to figure out how to register defects when - # appropriate here. - token, value = get_dot_atom_text(value) - else: - token, value = get_dot_atom_text(value) - dot_atom.append(token) - if value and value[0] in CFWS_LEADER: - token, value = get_cfws(value) + tl, start = get_dot_atom_text(value, start) + if (tl[0].token_type == 'encoded-word' + and dot_atom and not dot_atom[-1].endswith_fws() + ): + dot_atom.defects.append(_MissingWhitespaceBeforeEWDefect) + dot_atom.append(tl) + if start < vlen and value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) + if tl[-1].token_type == 'encoded-word' and not token.startswith_fws(): + dot_atom.defects.append(_MissingWhitespaceAfterEWDefect) dot_atom.append(token) - return dot_atom, value + return dot_atom, start -def get_word(value): +@_deprecate_old_api +def get_word(value, start): """word = atom / quoted-string - Either atom or quoted-string may start with CFWS. 
We have to peel off this - CFWS first to determine which type of word to parse. Afterward we splice - the leading CFWS, if any, into the parsed sub-token. - - If neither an atom or a quoted-string is found before the next special, a - HeaderParseError is raised. - - The token returned is either an Atom or a QuotedString, as appropriate. - This means the 'word' level of the formal grammar is not represented in the - parse tree; this is because having that extra layer when manipulating the - parse tree is more confusing than it is helpful. + Return either an Atom or a QuotedString, as appropriate, containing any + leading or trailing whitespace, up to the next non-whitespace + non-special character, and a pointer to the special or the len of value. + If no quoted string or atom is found, raise a HeaderParseError. """ - if value[0] in CFWS_LEADER: - leader, value = get_cfws(value) + # The 'word' level of the RFC grammar is not represented in the parse tree; + # having that extra layer when manipulating the parse tree is more + # confusing than it is helpful, and would not affect re-folding. 
+ vlen = len(value) + if start < vlen and value[start] in CFWS_LEADER: + leader, start = get_cfws(value, start) else: leader = None - if not value: + if start >= vlen: raise errors.HeaderParseError( "Expected 'atom' or 'quoted-string' but found nothing.") - if value[0]=='"': - token, value = get_quoted_string(value) - elif value[0] in SPECIALS: - raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' " - "but found '{}'".format(value)) + if value[start]=='"': + token, start = get_quoted_string(value, start) + elif value[start] in SPECIALS: + raise errors.HeaderParseError( + f"Expected 'atom' or 'quoted-string' but found {value[start:]!r}" + ) else: - token, value = get_atom(value) + token, start = get_atom(value, start) if leader is not None: - token[:0] = [leader] - return token, value + if not leader.endswith_fws() and token[0].token_type == 'encoded-word': + token.defects.append(_MissingWhitespaceBeforeEWDefect) + token.push(leader) + return token, start -def get_phrase(value): +@_deprecate_old_api_and_lack_of_raise_on_invalid_input +def get_phrase(value, start): """ phrase = 1*word / obs-phrase obs-phrase = word *(word / "." / CFWS) - This means a phrase can be a sequence of words, periods, and CFWS in any - order as long as it starts with at least one word. If anything other than - words is detected, an ObsoleteHeaderDefect is added to the token's defect - list. We also accept a phrase that starts with CFWS followed by a dot; - this is registered as an InvalidHeaderDefect, since it is not supported by - even the obsolete grammar. + Return a Phrase containing any sequence of words, periods, and CFWS in + any order up to the next unquoted character that is not allowed in a phrase + or obsolete phrase, and a pointer to that character or the len of value. + If periods or cfws without adjacent words are found, add an + ObsoleteHeaderDefect to the token's defect list.
If one or more periods + are found before the first word (or if there are no words, only periods and + whitespace), add an InvalidHeaderDefect. If there are no words or periods, + raise a HeaderParseError. """ + origstart = start + found_content = False phrase = Phrase() + vlen = len(value) try: - token, value = get_word(value) + token, start = get_word(value, start) + found_content = True phrase.append(token) except errors.HeaderParseError: phrase.defects.append(errors.InvalidHeaderDefect( "phrase does not start with word")) - while value and value[0] not in PHRASE_ENDS: - if value[0]=='.': + while start < vlen and value[start] not in PHRASE_ENDS: + if value[start]=='.': phrase.append(DOT) + found_content = True phrase.defects.append(errors.ObsoleteHeaderDefect( "period in 'phrase'")) - value = value[1:] + start += 1 else: try: - token, value = get_word(value) - if (token[0].token_type == 'encoded-word' - and phrase - and phrase[-1].token_type == 'atom' - and len(phrase[-1]) > 1 - and phrase[-1][-2].token_type == 'encoded-word' - and phrase[-1][-1].token_type == 'cfws' - and not phrase[-1][-1].comments - ): - # linear ws between ews needs special handing... 
- phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1], 'fws') + token, start = get_word(value, start) + found_content = True except errors.HeaderParseError: - if value[0] in CFWS_LEADER: - token, value = get_cfws(value) + if value[start] in CFWS_LEADER: + token, start = get_cfws(value, start) phrase.defects.append(errors.ObsoleteHeaderDefect( - "comment found without atom")) + "cfws found without atom")) else: raise + if phrase and phrase[-1].token_type == 'atom': + if phrase[-1][-1].token_type == 'encoded-word': + if not token.startswith_fws(): + phrase.defects.append(_MissingWhitespaceAfterEWDefect) + elif (token[0].token_type == 'encoded-word' + and len(phrase[-1]) > 1 + and phrase[-1][-2].token_type == 'encoded-word' + and phrase[-1][-1].token_type == 'cfws' + and not phrase[-1][-1].comments + ): + phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1], 'fws') + if (phrase + and token[0].token_type == 'encoded-word' + and not phrase.endswith_fws() + ): + phrase.defects.append(_MissingWhitespaceBeforeEWDefect) phrase.append(token) - return phrase, value + if found_content: + return phrase, start + # XXX POSTDEP: change this to raise the exception. + return ( + phrase, + start, + errors.HeaderParseError( + f"expected phrase but found {value[origstart:]!r}", + ), + "Calling get_phrase when there is not at least one word or" + " period in addition to whitespace is deprecated and will" + " raise an error in the future." + ) + +@_deprecate_old_api +def get_obs_local_part(value, start): + """ obs-local-part = word *("." word) -def get_local_part(value): - """ local-part = dot-atom / quoted-string / obs-local-part + Return an ObsLocalPart containing a list of words and DOTs containing + all of the characters up to the next character not allowed in a phrase or + the end of the value, and a pointer to the SPECIAL or the len of value. 
- """ - local_part = LocalPart() - leader = None - if value and value[0] in CFWS_LEADER: - leader, value = get_cfws(value) - if not value: - raise errors.HeaderParseError( - "expected local-part but found '{}'".format(value)) - try: - token, value = get_dot_atom(value) - except errors.HeaderParseError: - try: - token, value = get_word(value) - except errors.HeaderParseError: - if value[0] != '\\' and value[0] in PHRASE_ENDS: - raise - token = TokenList() - if leader is not None: - token[:0] = [leader] - local_part.append(token) - if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): - obs_local_part, value = get_obs_local_part(str(local_part) + value) - if obs_local_part.token_type == 'invalid-obs-local-part': - local_part.defects.append(errors.InvalidHeaderDefect( - "local-part is not dot-atom, quoted-string, or obs-local-part")) - else: - local_part.defects.append(errors.ObsoleteHeaderDefect( - "local-part is not a dot-atom (contains CFWS)")) - local_part[0] = obs_local_part - return local_part, value + Decode any encoded words, registering a defect if any are found. + Missing whitespace defects may also be registered. + + Register defects if there are any non-printable or invalid characters in + the non-whitespace tokens. -def get_obs_local_part(value): - """ obs-local-part = word *("." word) """ obs_local_part = ObsLocalPart() + vlen = len(value) last_non_ws_was_dot = False - while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): - if value[0] == '.': + while start < vlen and ((c := value[start]) == '\\' or c not in PHRASE_ENDS): + if c == '.': if last_non_ws_was_dot: obs_local_part.defects.append(errors.InvalidHeaderDefect( - "invalid repeated '.'")) + "invalid repeated '.' 
in local-part") + ) obs_local_part.append(DOT) last_non_ws_was_dot = True - value = value[1:] + start += 1 continue - elif value[0]=='\\': - obs_local_part.append(ValueTerminal(value[0], - 'misplaced-special')) - value = value[1:] + elif c == '\\': + # RFC 5322 doesn't allow \, but the old email code parsed it. + obs_local_part.append(ValueTerminal(c,'misplaced-special')) + start += 1 obs_local_part.defects.append(errors.InvalidHeaderDefect( "'\\' character outside of quoted-string/ccontent")) last_non_ws_was_dot = False continue if obs_local_part and obs_local_part[-1].token_type != 'dot': - obs_local_part.defects.append(errors.InvalidHeaderDefect( - "missing '.' between words")) + obs_local_part.defects.append( + errors.InvalidHeaderDefect("missing '.' between words"), + ) try: - token, value = get_word(value) + token, start = get_word(value, start) last_non_ws_was_dot = False except errors.HeaderParseError: - if value[0] not in CFWS_LEADER: + if value[start] not in CFWS_LEADER: raise - token, value = get_cfws(value) + # There will be a 'dot' defect; no need for no-word defect here. + token, start = get_cfws(value, start) obs_local_part.append(token) if not obs_local_part: raise errors.HeaderParseError( @@ -1556,16 +2070,66 @@ def get_obs_local_part(value): len(obs_local_part) > 1 and obs_local_part[1].token_type=='dot'): obs_local_part.defects.append(errors.InvalidHeaderDefect( - "Invalid leading '.' in local part")) + "Invalid leading '.' in local-part")) if (obs_local_part[-1].token_type == 'dot' or obs_local_part[-1].token_type=='cfws' and len(obs_local_part) > 1 and obs_local_part[-2].token_type=='dot'): obs_local_part.defects.append(errors.InvalidHeaderDefect( - "Invalid trailing '.' in local part")) + "Invalid trailing '.' 
in local-part")) if obs_local_part.defects: obs_local_part.token_type = 'invalid-obs-local-part' - return obs_local_part, value + return obs_local_part, start + +@_deprecate_old_api +def get_local_part(value, start): + """ local-part = dot-atom / quoted-string / obs-local-part + + """ + local_part = LocalPart() + vlen = len(value) + leader = None + if start < vlen and value[start] in CFWS_LEADER: + leader, start = get_cfws(value, start) + text_start = start + if start >= vlen: + raise errors.HeaderParseError( + "expected local-part but found '{}'".format(value)) + try: + token, start = get_dot_atom(value, start) + except errors.HeaderParseError: + try: + token, start = get_word(value, start) + except errors.HeaderParseError: + if value[start] != '\\' and value[start] in PHRASE_ENDS: + # XXX XXX should this be a separate message mentioning + # both dot atom and word? + raise + token = TokenList() + if start < vlen and (value[start]=='\\' or value[start] not in PHRASE_ENDS): + # Even if we started with valid text there is more, so start over as obs + token, start = get_obs_local_part(value, text_start) + if token.token_type == 'invalid-obs-local-part': + local_part.defects.append(errors.InvalidHeaderDefect( + "local-part is not dot-atom, quoted-string, or obs-local-part")) + else: + local_part.defects.append( + errors.ObsoleteHeaderDefect( + "local-part is not a valid dot-atom" + " (it contains internal CFWS)" + ) + ) + if leader is not None: + token.push(leader) + local_part.append(token) + if local_part.ew_indexes: + # XXX some day we'll put each index into its own defect. 
+ local_part.defects.extend( + [ + errors.InvalidHeaderDefect('encoded-word in local-part'), + ] * len(local_part.ew_indexes) + ) + return local_part, start def get_dtext(value): r""" dtext = / obs-dtext @@ -2214,8 +2778,7 @@ def parse_message_ids(value): # XXX: As I begin to add additional header parsers, I'm realizing we probably # have two level of parser routines: the get_XXX methods that get a token in # the grammar, and parse_XXX methods that parse an entire field value. So -# get_address_list above should really be a parse_ method, as probably should -# be get_unstructured. +# get_address_list above should really be a parse_ method. # def parse_mime_version(value): @@ -2916,7 +3479,9 @@ def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy): continue tstr = str(part) if not want_encoding: - if part.token_type in ('ptext', 'vtext'): + # XXX At the end of the old API deprecation period 'vtext' can + # be removed from this list as it will no longer exist at all. + if part.token_type in ('ptext', 'atext', 'vtext'): # Encode if tstr contains special characters. want_encoding = not SPECIALSNL.isdisjoint(tstr) else: diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 859307dd85be111..a836bf6efeb891b 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -2,6 +2,15 @@ # Author: Barry Warsaw # Contact: email-sig@python.org +def __getattr__(name): + if name == "NonASCIILocalPartDefect": + import warnings + warnings._deprecated( + "email.errors.NonASCIILocalPartDefect", + remove=(3, 17), + ) + return _NonASCIILocalPartDefect + """email package exception classes.""" @@ -108,7 +117,7 @@ def __str__(self): class ObsoleteHeaderDefect(HeaderDefect): """Header uses syntax declared obsolete by RFC 5322""" -class NonASCIILocalPartDefect(HeaderDefect): +class _NonASCIILocalPartDefect(HeaderDefect): """Unused. Note: this error is deprecated and may be removed in the future.""" # RFC 6532 permits a non-ASCII local-part. 
_header_value_parser previously # treated this as a parse-time defect (when parsing Unicode, but not bytes). diff --git a/Lib/test/temp b/Lib/test/temp new file mode 100644 index 000000000000000..e69de29bb2d1d64 diff --git a/Lib/test/test_email/__init__.py b/Lib/test/test_email/__init__.py index 5d708e6e97efe7b..be433c5a88617d3 100644 --- a/Lib/test/test_email/__init__.py +++ b/Lib/test/test_email/__init__.py @@ -1,11 +1,15 @@ -import os -import unittest import collections import email +import os +import re +import unicodedata +import unittest +from curses.ascii import controlnames from email.message import Message from email._policybase import compat32 from test.support import load_package_tests from test.test_email import __file__ as landmark +from test.test_email.params import C, params_map, ParamsMixin # Load all tests in package def load_tests(*args): @@ -18,9 +22,63 @@ def openfile(filename, *args, **kws): path = os.path.join(os.path.dirname(landmark), 'data', filename) return open(path, *args, **kws) +def charname(c): + try: + n = unicodedata.name(c).lower().replace(' ', '_').replace('-', '_') + except ValueError: + try: + n = controlnames[ord(c)] + except IndexError: + assert c == '\x7F' + return 'DEL' + return n + +def for_each_character(chars, skip=''): + """Create a filter that expands each input into a test per character. + + chars should be an iterable of characters (eg a string), as should skip. + + For each character in chars that is not in skip, the filter should process + all arguments and keywords, creating a new call spec. For any objects and + (recursively) sub-objects found that have a 'format' attribute, replace the + object in the new call spec with the results of calling the object's format + method, passing the method three keyword arguments: 'char', set to the + character, 'echar', set to the character passed through re.escape, and + 'erchar', set to the repr of the character (without the quotes) passed + through re.escape.
+ + Process any dictionary object's values, but not its keys. Assume that any + other object that is an iterator can be recreated by passing its type a + list of objects. + + Return the character name as derived from unicodedata or the curses ascii + module as the name string to be added to the test name. + + """ + chars = {charname(v): v for v in chars if v not in skip} + @params_map + def for_each_character_in(*args, **kw): + for name, c in chars.items(): + subs = dict( + char=c, + echar=re.escape(c), + erchar=re.escape(repr(c)[1:-1]), + ) + yield name, C(*args, **kw).fmtall(**subs) + return for_each_character_in + # Base test class -class TestEmailBase(unittest.TestCase): +class TestEmailBase(ParamsMixin, unittest.TestCase): + + # XXX XXX Delete this at end of refactor. We will be putting in temporary + # empty parameter lists during the refactoring process. + paramsRequired = False + + # XXX XXX temporary usability hack, edit this out before publishing PR. + def __str__(self): + from unittest.util import strclass + return "%s.%s" % (strclass(self.__class__), self._testMethodName) maxDiff = None # Currently the default policy is compat32. By setting that as the default @@ -65,13 +123,90 @@ def assertBytesEqual(self, first, second, msg): """Our byte strings are really encoded strings; improve diff output""" self.assertEqual(self._bytes_repr(first), self._bytes_repr(second)) - def assertDefectsEqual(self, actual, expected): - self.assertEqual(len(actual), len(expected), actual) - for i in range(len(actual)): - self.assertIsInstance(actual[i], expected[i], - 'item {}'.format(i)) - - + def assertDefectsMatch(self, actual, expected): + """Assert list of defects matches a list of expected defect patterns + + actual should be a list of actual defect instances. expected should + be a list of patterns. Match the patterns against the actual list, + and report any defects that do not match a pattern or any patterns + that do not match a defect.
Matching must be one to one: if there + are two identical defects in the actual list, it should be an error + if there are not two patterns that match those defects in the + expected list. + + A pattern can be one of three things: + 1) a defect class (eg: InvalidHeaderDefect) + 2) a tuple of (defect_class, regex), where the regex must + match the message produced by calling str on the actual defect + 3) a tuple of (callable, *args) where calling the callable + with the args must produce a tuple as in (2). + + """ + aleft = list(actual) + eleft = [] + for x in expected: + p = None + while not p: + if type(x) is type: + p = (x, '.*') + elif not hasattr(x, '__getitem__'): + raise ValueError(f'invalid defect pattern: {x!r}') + elif type(x[0]) is type: + p = x + elif callable(x[0]): + x = x[0](*x[1:]) + else: + raise ValueError(f'invalid defect pattern: {x!r}') + eleft.append(p) + for t, s in list(eleft): + for a in aleft: + if type(a) == t and re.search(s, str(a), flags=re.I): + eleft.remove((t, s)) + aleft.remove(a) + break + if eleft or aleft: + areprs = [repr((type(a), str(a))) for a in aleft] + ereprs = [repr(e) for e in eleft] + matched = f"{len(actual) - len(aleft)} defects matched" + if len(eleft) == len(aleft): + raise self.failureException( + f"{matched}, {len(aleft)} defects did not match:" + f"\n unmatched expected:\n {'\n '.join(ereprs)}" + f"\n unmatched actual:\n {'\n '.join(areprs)}" + ) + if len(eleft) == 0: + raise self.failureException( + f"{matched}, {len(aleft)} extra defects:" + f"\n {'\n '.join(areprs)}" + ) + if len(aleft) == 0: + raise self.failureException( + f"{matched}, {len(eleft)} missing defects:" + f"\n {'\n '.join(ereprs)}" + ) + else: + raise self.failureException( + f"Expected {len(expected)} defects but got {len(actual)};" + f" {matched}, {len(eleft)} missing, {len(aleft)} extra:" + f"\n unmatched actual:\n {'\n '.join(areprs)}" + f"\n unmatched expected:\n {'\n '.join(ereprs)}" + ) + + # XXX assertDefectsEqual can go away when it is 
no longer used. + assertDefectsEqual = assertDefectsMatch + + +# A more stringent version of the test.support check_warnings helper. +from contextlib import contextmanager +from test.support.warnings_helper import _filterwarnings +@contextmanager +def check_all_warnings(*filters): + """Raise an error if the generated warnings do not exactly match filters.""" + return _filterwarnings(filters) + + +# XXX Don't use this for new tests, use params instead. @parameterized will be +# deprecated and removed eventually. def parameterize(cls): """A test method parameterization class decorator. diff --git a/Lib/test/test_email/params.py b/Lib/test/test_email/params.py new file mode 100644 index 000000000000000..6c3723850574b4f --- /dev/null +++ b/Lib/test/test_email/params.py @@ -0,0 +1,384 @@ +"""Support for Parameterized Tests""" + +import collections +from functools import wraps +from string import Formatter + + +class SafeFormatter(Formatter): + + def format(self, format_string, *args, **kw): + self.args = args + self.kw = kw + return super().format(format_string, *args, **kw) + + def parse(self, format_string): + for text, varname, spec, conv in super().parse(format_string): + if varname and varname not in self.kw: + spec = ':' + spec if spec else '' + conv = '!'
+ conv if conv else '' + text = text + '{' + varname + spec + conv + '}' + varname, spec, conv = None, None, None + yield text, varname, spec, conv + +safe_format = SafeFormatter().format + +def _fmt(fmtfunc, obj, subs): + if hasattr(obj, 'format'): + return safe_format(obj, **subs) + try: + i = iter(obj) + except TypeError: + return obj + if hasattr(obj, 'items'): + return type(obj)({k: fmtfunc(v, subs) for k, v in obj.items()}) + return type(obj)(fmtfunc(x, subs) for x in i) + +def fmt(obj, subs): + return _fmt( + lambda obj, subs: + safe_format(obj, **subs) if hasattr(obj, 'format') else obj, + obj, + subs, + ) + +def fmtall(obj, subs): + return _fmt(fmtall, obj, subs) + + +class C: + + """Call specification""" + + def __init__(self, *args, **kw): + """Return object holding a concrete set of arguments for a callable. + + Store any positional arguments as a tuple in self.args, and any + keyword arguments in as a dict in self.kw. + + """ + self.args = args + self.kw = kw + + def __call__(self, func): + """Call func using the concrete arguments from self.args and self.kw""" + return func(*self.args, **self.kw) + + def __eq__(self, other): + try: + return self.args == other.args and self.kw == other.kw + except AttributeError: + return False + + def _repr(self, fname): + args = ', '.join(repr(arg) for arg in self.args) + kw = ', '.join(f'{k}={repr(v)}' for k, v in self.kw.items()) + return f"{fname}({', '.join(filter(None, (args, kw)))})" + + def __repr__(self): + return self._repr(type(self).__name__) + + def repr_call(self, func): + return self._repr(func.__name__) + + def fmt(self, **subs): + return C(*fmt(self.args, subs), **fmt(self.kw, subs)) + + def fmtall(self, **subs): + return C(*fmtall(self.args, subs), **fmtall(self.kw, subs)) + + +class Params(collections.UserDict): + + def __init__(self, *args, **kw): + super().__init__() + self.update(*args, **kw) + + def __repr__(self): + items = ', '.join(f'{k}={repr(v)}' for k, v in self.items()) + return 
f'{type(self).__name__}({items})' + + def update(self, *args, **kw): + for arg in args: + if not isinstance(arg, self.__class__): + raise TypeError( + f"Invalid argument {arg!r}, arguments" + f" must be of type {type(self).__name__}," + f" not {type(arg).__name__!r}" + ) + super().update(arg) + super().update(kw) + + def __setitem__(self, name, value): + if not name.isidentifier(): + raise ValueError( + f"parameter names must be identifiers, {name!r} is invalid", + ) + if name in self: + raise ValueError( + f"cannot add {name}={value!r}, a callspec with that name" + f" already exists" + ) + if not isinstance(value, C): + value = C(value) + super().__setitem__(name, value) + + +debug = False +_tupify = lambda x: x.items() if isinstance(x, Params) else [('', x)] +def _params_map(func, *, with_name=False, with_namelist=False, debug=False): + if not callable(func): + raise TypeError( + f"argument must be callable, not {type(func).__name__!r}" + ) + if with_name and with_namelist: + raise ValueError("with_name and with_namelist cannot both be True") + @wraps(func) + def params_mapper(*args, **kw): + if __debug__ and debug: print(f'flattening using {func.__name__}') + if __debug__ and debug > 1: print(f'{args=} {kw=}') + param_set = Params() + for an, av in [t for a in [*args, Params(**kw)] for t in _tupify(a)]: + if debug: print(f'{an=} {av=}') + cs = C(*av.args, **av.kw) if isinstance(av, C) else C(av) + if with_name: + cs.args = (an, *cs.args) + elif with_namelist: + cs.args = (NameList(an), *cs.args) + try: + for n, v in cs(func): + try: + name = '__'.join(filter(None, (an, n))) + except Exception: + raise ValueError(f"Invalid label: {n!r}") from None + if __debug__ and debug: print(f'{n=} {v=} {name=}') + if not name: + raise ValueError('missing test label') + param_set[name] = v + except Exception as ex: + val = f'{an}={av!r}' if an else repr(av) + raise ValueError( + f'{func.__name__} failed on {val}', + ) from ex + return param_set + return params_mapper + +def 
params_map(*args, **kw): + if not kw: + return _params_map(*args) + else: + def _(func): + return _params_map(func, *args, **kw) + return _ + + +def only(*args): + yield ('', args[0]) if len(args) == 1 else args + + +@params_map +def as_value(name): + yield name, C(name) + + +@params_map(with_name=True) +def with_names(name, *args, **kw): + yield '', C(name, *args, **kw) + + +def for_each_name(*names): + @params_map + def for_each_name(*args, **kw): + for name in names: + yield name, C(name, *args, **kw) + return for_each_name + + +def add_label(label): + @params_map + def add_label(*args, **kw): + yield label, C(*args, **kw) + return add_label + + +def for_each_function(*functions): + @params_map + def for_each_function(*args, **kw): + for function in functions: + yield function.__name__, C(function, *args, **kw) + return for_each_function + + +# We could factor a common core out of these next two, but the error +# messages when the selection function fails would be more confusing. + +def include_if(include, *, label=''): + @params_map(with_name=True) + def include_if(name, *args, **kw): + if include(NameList(name), *args, **kw): + yield label, C(*args, **kw) + return include_if + +def include_unless(omit, *, label=''): + @params_map(with_name=True) + def include_unless(name, *args, **kw): + if omit(NameList(name), *args, **kw): + return + yield label, C(*args, **kw) + return include_unless + + +class NameList(list): + + def __new__(cls, name): + """Return a specialized list facilitating operations on test names. + + Split the test name into a list at '__' characters, so that it is at + base a list of the name components that were used to construct the name + by one or more param_maps, assuming that '__' has only been used in + names via param_maps concatenation. Calling string on the returned + object should yield the original name. 
+ + """ + return super().__new__(cls) + + def __init__(self, name): + super().__init__(name.split('__') if name else []) + + def __str__(self): + return '__'.join(self) + + def has_any(self, *names): + """Return True if any name is an element of the name list + + names may be passed as a single tuple or a list of arguments. + """ + if len(names) == 1 and not hasattr(names[0], 'encode'): + names = names[0] + return any(name and name in self for name in names) + + + def has_all(self, *names): + """Return True if all of the names are elements of the name list""" + names = [n for n in names if n] + if not names: + return False + return all(name in self for name in names) + + +def params(*args, **kw): + """Mark decorated func so that it is called using specified C instances. + + If one or more dictionaries, and/or any keyword arguments are supplied, + raise an error if the dictionary keys are not unique across all arguments. + Combine these dictionaries, adding the name of the test followed by '__' as + a prefix to each of the dictionary keys. If called with the function as + the only argument, create an empty dictionary instead. If the class + contains attributes whose names starts with 'params_' plus the name of the + decorated function, add the attribute name with 'params_' removed from the + front and '__' added to the end as a prefix to each key in the parameter's + value. If the resultant names duplicate the names in the existing combined + dictionary, raise an error. Otherwise add them to the combined dictionary. + + For each element of the combined dictionary, create a function whose name + is the key, where the result of calling the function is to call the wrapped + function with the arguments specified by the C instance (or equivalent) + that must be the value of each dictionary entry. 
+ + """ + if len(args) == 1 and not kw and callable( (func:= args[0]) ): + func._params_ = False + return func + def params_decorator(func): + func._params_ = Params(*args, **kw) + return func + return params_decorator + + +class ParamsMixin: + + """XXX docstring goes here once I write the docs.""" + + paramsAttributePrefix = 'params_' + paramsDebug = False + paramsRequired = True + + @classmethod + def __init_subclass__(cls, *args, **kwargs): + """Turn each test decorated with @params into a series of tests. + + """ + super().__init_subclass__(*args, **kwargs) + params_func_attrs = {} + params_attrs = {} + for name, attr in cls.__dict__.items(): + if hasattr(attr, '_params_'): + params_func_attrs[name] = attr + if __debug__ and cls.paramsDebug: + print(f'@params method {name!r}') + elif name.startswith(cls.paramsAttributePrefix): + if __debug__ and cls.paramsDebug: + print(f'{cls.paramsAttributePrefix} attribute {name!r}') + params_attrs[name] = attr + # Associate the params_ with the function with the matching name. + params = collections.defaultdict(list) + for pname, paramset in params_attrs.items(): + if not isinstance(paramset, Params): + raise ValueError( + f'value of params constant {pname} must be a Params' + f' dictionary, not {type(paramset)}' + ) + n = pname.removeprefix(cls.paramsAttributePrefix) + # Loop, in case the test name has one or more '__' in it... 
+ tn = [] + while n: + if n in params_func_attrs: + break + if '__' not in n: + raise ValueError(f'No @params test found for {pname!r}') + n, t = n.rsplit('__', 1) + if cls.paramsDebug: + print(f'{n=} {t=}') + tn.insert(0, t) + params[n].append(('__'.join(tn), paramset)) + for fname, func in params_func_attrs.items(): + if __debug__ and cls.paramsDebug: + print( + f"{fname!r} has{'' if func._params_ else ' no'} decorator" + f" params and {(n := len(params[fname]))}" + f" {cls.paramsAttributePrefix}" + f" attribute{'' if n == 1 else 's'}", + ) + all_params = func._params_ + if all_params is False: + if not params[fname]: + raise ValueError(f'No params found for {fname!r}') + all_params = Params() + for pn, ps in params[fname]: + try: + pr = pn + '__' if pn else pn + all_params.update( + **{f'{pr}{n}' if pn else n: v for n, v in ps.items()} + ) + except Exception as ex: + raise ValueError( + f"error combining '{cls.paramsAttributePrefix}" + f"{fname}{'__' if pn else ''}{pn}'" + f" with existing params" + ) from ex + if cls.paramsRequired and not all_params: + raise ValueError( + f"paramsRequired is set and {fname!r} has no params", + ) + impl_name = '__' + fname + delattr(cls, fname) + setattr(cls, impl_name, func) + for (test_name, callspec) in all_params.items(): + test = ( + lambda self, impl_name=impl_name, callspec=callspec: + callspec(getattr(self, impl_name)) + ) + test.__name__ = fname + '__' + test_name + setattr(cls, test.__name__, test) + if __debug__ and cls.paramsDebug: + print(f'generated {callspec.repr_call(test)}') diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index 1713962f94caef2..bcbdce9ac4ecffd 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -2,205 +2,321 @@ from email import _encoded_words as _ew from email import errors from test.test_email import TestEmailBase +from test.test_email.params import C, params, Params + + +class 
class TestDecoders(TestEmailBase):

    # Each @params test below is expanded into one test per entry of the
    # matching params_<test name> table.

    def _test(self, function, source, result, defects=()):
        # Run a (bytes -> (bytes, defects)) CTE decoder and check both parts
        # of its return value.
        actual_result, actual_defects = function(source)
        self.assertEqual(actual_result, result)
        self.assertDefectsEqual(actual_defects, defects)


    @params
    def test_decode_q(self, *args, **kw):
        return self._test(_ew.decode_q, *args, **kw)

    params_test_decode_q = Params(
        no_encoded = C(b'foobar', b'foobar'),
        encoded_spaces = C(b'foo=20bar=20', b'foo bar '),
        underline_space = C(b'foo_bar_', b'foo bar '),
        # Two consecutive =20 escapes decode to two spaces.
        run_of_encoded = C(b'foo=20=20=21=2Cbar', b'foo  !,bar'),
        )


    @params
    def test_decode_b(self, *args, **kw):
        return self._test(_ew.decode_b, *args, **kw)

    params_test_decode_b = Params(

        simple = C(
            b'Zm9v',
            b'foo',
            ),

        missing_1_padding_char = C(
            b'dmk',
            b'vi',
            defects=[errors.InvalidBase64PaddingDefect],
            ),

        missing_2_padding_char = C(
            b'dg',
            b'v',
            defects=[errors.InvalidBase64PaddingDefect],
            ),

        invalid_character = C(
            b'dm\x01k===',
            b'vi',
            defects=[errors.InvalidBase64CharactersDefect],
            ),

        invalid_character_and_bad_padding = C(
            b'dm\x01k',
            b'vi',
            defects=[
                errors.InvalidBase64CharactersDefect,
                errors.InvalidBase64PaddingDefect,
                ],
            ),

        invalid_length = C(
            b'abcde',
            b'abcde',
            defects=[errors.InvalidBase64LengthDefect],
            ),

        )


    @params
    def test_decode_raises_if_value(self, value, exception=ValueError):
        with self.assertRaises(exception):
            _ew.decode(value)

    params_test_decode_raises_if_value = Params(
        missing_middle = C('=?badone?='),
        beginning_only = C('=?'),
        empty_string = C(''),
        invalid_encoding = C('=?utf-8?X?somevalue?=', exception=KeyError),
        )


    @params
    def test__decode(
            self,
            cte_encoded,
            *,
            cte,
            charset='us-ascii',
            result,
            defects=(),
            ):
        actual, actual_defects = _ew._decode(charset, cte, cte_encoded)
        self.assertEqual(actual, result)
        self.assertDefectsEqual(actual_defects, defects)

    @params
    def test_decode(
            self,
            cte_encoded,
            *,
            cte,
            charset='us-ascii',
            lang='',
            result,
            defects=(),
            ):
        # Assemble the full encoded word; '*lang' is present only if lang is.
        ew = f'=?{charset}{lang and "*"}{lang}?{cte}?{cte_encoded}?='
        actual, actual_charset, actual_lang, actual_defects = _ew.decode(ew)
        self.assertEqual(actual, result)
        self.assertEqual(actual_charset, charset)
        self.assertEqual(actual_lang, lang)
        self.assertDefectsEqual(actual_defects, defects)

    # The same table drives both the _decode and the decode tests.
    params_test__decode = params_test_decode = Params(

        simple_q = C(
            'foo',
            cte='q',
            result='foo',
            ),

        simple_b = C(
            'dmk=',
            cte='b',
            result='vi',
            ),

        q_case_ignored = C(
            'foo',
            cte='Q',
            result='foo',
            ),

        b_case_ignored = C(
            'dmk=',
            cte='B',
            result='vi',
            ),

        non_trivial_q = C(
            '=20F=fcr=20Elise=20',
            cte='q',
            result=' Für Elise ',
            charset='latin-1',
            ),

        q_escaped_bytes_preserved = C(
            b'=20\xACfoo'.decode('us-ascii', 'surrogateescape'),
            cte='q',
            result=' \uDCACfoo',
            defects=[errors.UndecodableBytesDefect],
            ),

        b_undecodable_bytes_ignored_with_defect = C(
            b'dm\xACk'.decode('us-ascii', 'surrogateescape'),
            cte='b',
            result='vi',
            defects=[
                errors.InvalidBase64CharactersDefect,
                errors.InvalidBase64PaddingDefect,
                ],
            ),

        b_invalid_bytes_ignored_with_defect = C(
            'dm\x01k===',
            cte='b',
            result='vi',
            defects=[errors.InvalidBase64CharactersDefect],
            ),

        b_invalid_bytes_incorrect_padding = C(
            'dm\x01k',
            cte='b',
            result='vi',
            defects=[
                errors.InvalidBase64CharactersDefect,
                errors.InvalidBase64PaddingDefect,
                ],
            ),

        b_padding_defect = C(
            'dmk',
            cte='b',
            result='vi',
            defects=[errors.InvalidBase64PaddingDefect],
            ),

        unknown_8bit_charset = C(
            'foo=ACbar',
            cte='q',
            result=b'foo\xacbar'.decode('ascii', 'surrogateescape'),
            charset='unknown-8bit',
            defects=[],
            ),

        unknown_charset = C(
            'foo=ACbar',
            cte='q',
            result=b'foo\xacbar'.decode('ascii', 'surrogateescape'),
            charset='foobar',
            # XXX Should this be a new Defect instead?
            defects=[errors.CharsetError],
            ),

        invalid_character_in_charset = C(
            'foo=ACbar',
            cte='q',
            result=b'foo\xacbar'.decode('ascii', 'surrogateescape'),
            charset='utf-8\udce2\udc80\udc9d',
            # XXX Should this be a new Defect instead?
            defects=[errors.CharsetError],
            ),

        q_nonascii = C(
            '=C3=89ric',
            cte='q',
            result='Éric',
            charset='utf-8',
            ),

        )

    # Only decode() parses a language tag, so this table is decode-only.
    params_test_decode__lang = Params(

        nonnull_lang = C(
            'test',
            cte='q',
            result='test',
            lang='jive',
            ),

        )


class TestEncoders(TestEmailBase):

    def _test(self, function, source, expected):
        self.assertEqual(function(source), expected)

    @params(
        all_safe = C(b'foobar', 'foobar'),
        spaces = C(b'foo bar ', 'foo_bar_'),
        # Two spaces encode to two underscores.
        run_of_encodables = C(b'foo  ,,bar', 'foo__=2C=2Cbar'),
        )
    def test_encode_q(self, *args, **kw):
        return self._test(_ew.encode_q, *args, **kw)


    @params(
        simple = C(b'foo', 'Zm9v'),
        padding = C(b'vi', 'dmk='),
        )
    def test_encode_b(self, *args, **kw):
        return self._test(_ew.encode_b, *args, **kw)


    @params
    def test_encode(self, callspec, expected):
        # The inner C holds the arguments to pass to _ew.encode.
        self.assertEqual(callspec(_ew.encode), expected)

    params_test_encode = Params(

        q = C(
            C('foo', 'utf-8', 'q'),
            '=?utf-8?q?foo?=',
            ),

        b = C(
            C('foo', 'utf-8', 'b'),
            '=?utf-8?b?Zm9v?=',
            ),

        auto_q = C(
            C('foo', 'utf-8'),
            '=?utf-8?q?foo?=',
            ),

        auto_q_if_short_mostly_safe = C(
            C('vi.', 'utf-8'),
            '=?utf-8?q?vi=2E?=',
            ),

        auto_b_if_enough_unsafe = C(
            C('.....', 'utf-8'),
            '=?utf-8?b?Li4uLi4=?=',
            ),

        auto_b_if_long_unsafe = C(
            C('vi.vi.vi.vi.vi.', 'utf-8'),
            '=?utf-8?b?dmkudmkudmkudmkudmku?=',
            ),

        auto_q_if_long_mostly_safe = C(
            C('vi vi vi.vi ', 'utf-8'),
            '=?utf-8?q?vi_vi_vi=2Evi_?=',
            ),

        utf8_default = C(
            C('foo'),
            '=?utf-8?q?foo?=',
            ),

        lang = C(
            C('foo', lang='jive'),
            '=?utf-8*jive?q?foo?=',
            ),

        unknown_8bit = C(
            C('foo\uDCACbar', charset='unknown-8bit'),
            '=?unknown-8bit?q?foo=ACbar?=',
            ),

        )
# Character classes used to generate test inputs.

# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2
# Printable US-ASCII: code points 33 through 126.
RFC_PRINTABLES = bytes(range(33, 127)).decode('ascii')

# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2
# Space and horizontal tab.
RFC_WSP = chr(32) + chr(9)

# https://datatracker.ietf.org/doc/html/rfc5322#section-2.2
# Code points 0 through 32 plus DEL (127).
RFC_NONPRINTABLES = bytes([*range(0, 33), 127]).decode('ascii')

# https://datatracker.ietf.org/doc/html/rfc2978#section-2.3
# Except that like
#   https://datatracker.ietf.org/doc/html/rfc8187#section-3.2.1
# we omit the "'" character as otherwise it is difficult to correctly parse
# extended parameters values absent a complete registry.  In any case charset
# names generally do not include special characters in practice.
RFC_CHARSET_CHARS = string.ascii_letters + string.digits + "!#$%&+-^_`{}~"

# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3
RFC_ATEXT = string.ascii_letters + string.digits + "!#$%&'*+-/=?^_`{|}~"

# https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3
RFC_SPECIALS = r'()<>[]:;@\,."'

# This isn't an RFC concept, but it is as useful in tests as it is in the code.
CFWS_LEADER = RFC_WSP + '('

ALL_ASCII = bytes(range(0, 128)).decode('ascii')


# ---> Defect Expectations
#
# Each expectation is a (defect class, message regex) pair; the functions
# build such a pair for a message that embeds the given characters.

undecodable_bytes_defect = (
    errors.UndecodableBytesDefect,
    'Non-ASCII characters found in header token',
    )

def undecodable_bytes_in_ew_defect(chars):
    message = (
        f"Encoded word contains bytes not decodable using '{chars}' charset"
        )
    return errors.UndecodableBytesDefect, message

def nonprintable_defect(chars):
    # The message shows the repr of a list, which is full of regex specials.
    listed = re.escape(repr(list(chars)))
    message = (
        'the following ASCII non-printables found in header:'
        f' {listed}'
        )
    return errors.NonPrintableDefect, message

whitespace_inside_ew_defect = (
    errors.InvalidHeaderDefect,
    'whitespace inside encoded-word',
    )

missing_whitespace_before_ew_defect = (
    errors.InvalidHeaderDefect,
    'missing whitespace before encoded-word',
    )

missing_whitespace_after_ew_defect = (
    errors.InvalidHeaderDefect,
    'missing whitespace after encoded-word',
    )

def charset_defect(chars):
    message = (
        f"Unknown charset '{chars}' in encoded word; decoded as unknown bytes"
        )
    return errors.CharsetError, message

invalid_base64_padding_defect = (errors.InvalidBase64PaddingDefect, '')

invalid_base64_characters_defect = (errors.InvalidBase64CharactersDefect, '')

invalid_base64_length_defect = (errors.InvalidBase64LengthDefect, '')

end_inside_quoted_string_defect = (
    errors.InvalidHeaderDefect,
    'end of header inside quoted string',
    )

ew_inside_quoted_string_defect = (
    errors.InvalidHeaderDefect,
    'encoded-word inside quoted string',
    )

end_inside_comment_defect = (
    errors.InvalidHeaderDefect,
    'end of header inside comment',
    )

period_in_phrase_obs_defect = (
    errors.ObsoleteHeaderDefect,
    "period in 'phrase'",
    )

cfws_without_atom_in_phrase_obs_defect = (
    errors.ObsoleteHeaderDefect,
    'cfws found without atom',
    )

non_word_phrase_start_defect = (
    errors.InvalidHeaderDefect,
    "phrase does not start with word",
    )

non_dot_atom_local_part_obs_defect = (
    errors.ObsoleteHeaderDefect,
    r'local-part is not a valid dot-atom \(it contains internal CFWS\)',
    )

not_even_obs_local_part_defect = (
    errors.InvalidHeaderDefect,
    'local-part is not dot-atom, quoted-string, or obs-local-part',
    )

missing_dot_in_local_part_defect = (
    errors.InvalidHeaderDefect,
    "missing '.' between words",
    )

trailing_dot_in_local_part_defect = (
    errors.InvalidHeaderDefect,
    "invalid trailing '.' in local-part",
    )

leading_dot_in_local_part_defect = (
    errors.InvalidHeaderDefect,
    "invalid leading '.' in local-part",
    )

repeated_dot_in_local_part_defect = (
    errors.InvalidHeaderDefect,
    "invalid repeated '.' in local-part",
    )

misplaced_backslash_defect = (
    errors.InvalidHeaderDefect,
    r"'\\' character outside of quoted-string/ccontent",
    )

ew_in_local_part_defect = (
    errors.InvalidHeaderDefect,
    'encoded-word in local-part',
    )

# ---> End Defect Expectations


# XXX POSTDEP: Delete this test case, from here...
class TestDeprecations(TestEmailBase):

    # Tests for the parser module's handling of deprecated names and of the
    # decorators it uses to deprecate old call signatures.
    # NOTE(review): presumably the module implements this via a module level
    # __getattr__ that looks up '_deprecated_<name>' -- confirm against
    # _header_value_parser.

    def test___getattr___attribute_error(self):
        # An unknown attribute must still raise AttributeError naming it.
        nonsense = 'this_does_not_exist'
        with self.assertRaisesRegex(AttributeError, nonsense):
            getattr(parser, nonsense)

    def test___getattr___deprecation(self):
        # Work on a freshly imported copy so the names added here do not
        # leak into the module used by the other tests.
        with CleanImport(parser.__name__):
            foo = import_module(parser.__name__)
        foo._deprecated_foo = lambda: 42
        foo._deprecated_bar = 1
        with check_all_warnings((
                r"(?=.*'bar')(?=.*is deprecated)",
                DeprecationWarning,
                )):
            self.assertEqual(foo.bar, 1)
        self.assertEqual(foo.foo(), 42)

    def test___getattr___replacement(self):
        # Names listed in _REPLACED_NAMES must mention their replacement in
        # the deprecation warning.
        with CleanImport(parser.__name__):
            foo = import_module(parser.__name__)
        a_func = lambda: 42
        foo._deprecated_foo = a_func
        foo._REPLACED_NAMES['foo'] = 'bird'
        foo._deprecated_bar = 2
        foo._REPLACED_NAMES['bar'] = 'brain'
        with check_all_warnings((
                r"(?=.*'foo')(?=.*is deprecated)(?=.*'bird')",
                DeprecationWarning,
                )):
            self.assertEqual(foo.foo, a_func)
        self.assertEqual(foo.foo(), 42)
        with check_all_warnings((
                r"(?=.*'bar')(?=.*is deprecated)(?=.*'brain')",
                DeprecationWarning,
                )):
            self.assertEqual(foo.bar, 2)

    def test__replaced_with(self):
        # The _replaced_with decorator should make the warning for 'bar'
        # name 'foo' as the replacement.
        with CleanImport(parser.__name__):
            p = import_module(parser.__name__)
        @p._replaced_with('foo')
        def _deprecated_bar(a):
            return a
        p._deprecated_bar = _deprecated_bar
        with check_all_warnings((
                r"(?=.*'bar')(?=.*is deprecated)(?=.*'foo')",
                DeprecationWarning,
                )):
            self.assertEqual(p.bar(2), 2)

    @params(as_value(
        # XXX XXX make sure this is completely filled in with all the
        # names we expect to be deprecated.
        '_InvalidEwError',
        'rfc2047_matcher',
        '_wsp_splitter',
        '_non_atom_end_matcher',
        ))
    def test_deprecated_names(self, name):
        # Accessing each listed name must emit a DeprecationWarning that
        # mentions the name.
        with check_all_warnings((
                rf'(?=.*{name})(?=.*is.*deprecated)',
                DeprecationWarning,
                )):
            getattr(parser, name)

    @params(with_names(
        # XXX XXX make sure this is completely filled in with all the names
        # we've replaced.
        get_qp_ctext='get_ccontent_sequence',
        get_atext='get_atext_sequence',
        ))
    def test_replaced_names(self, oldname, newname):
        # with_names passes the keyword (old name) as the first argument and
        # the value (new name) as the second.
        with check_all_warnings((
                rf'(?=.*{oldname!r}.*is deprecated)(?=.*{newname})',
                DeprecationWarning,
                )):
            getattr(parser, oldname)

    @params(
        old_simple = C('foo x', '', res=('f', 'oo x', 9), warn=True),
        old_with_arg = C('foo x', ' ', res=('fo', 'o x', 9), warn=True),
        old_with_kw = C('foo x', '', b=2, res=('foo', ' x', 9), warn=True),
        new_with_zero = C('foo x', 0, '', res=('f', 1, 9)),
        new_with_nonzero = C('foo x', 3, '', res=(' ', 4, 9)),
        new_with_arg = C('foo x', 1, ' ', res=('oo', 3, 9)),
        new_with_kw = C('foo x', 2, '', b=2, res=('o x', 5, 9)),
        )
    def test__deprecate_old_api(self, value, *args, b=0, warn=False, res):
        # The 'old_' cases omit the start index and expect the unparsed
        # remainder back as a string plus a warning; the 'new_' cases pass a
        # start index and expect an end index back with no warning.
        @parser._deprecate_old_api
        def t(value, start, a, b=0):
            end = start + 1 + len(a) + b
            return value[start:end], end, 9
        warnings = []
        if warn:
            warnings += [(
                r"(?=.*'t')(?=.*API)(?=.*has changed)",
                DeprecationWarning,
                )]
        with check_all_warnings(*warnings):
            self.assertEqual(t(value, *args, b=b), res)

    @params(
        old_api_no_error = C(C('abc')),
        new_api_no_error = C(C('abc', 0)),
        old_api_error = C(C(''), warning=True),
        new_api_error = C(C('', 0), exception=True),
        new_api_no_error_with_non_zero_start = C(C('abc', 2)),
        new_api_error_with_non_zero_start = C(C('abc', 3), exception=True),
        )
    def test__deprecate_old_api_and_lack_of_raise_on_invalid_input(
            self,
            callspec,
            exception=False,
            warning=False,
            ):
        # foo reports invalid input by returning two extra values: the
        # exception and a warning message.
        # NOTE(review): the decorator appears to raise the exception for new
        # API calls and to warn with the message for old API calls -- the
        # expectations below encode that; confirm against the decorator.
        @parser._deprecate_old_api_and_lack_of_raise_on_invalid_input
        def foo(value, start):
            if not value[start:]:
                return parser.TokenList(['']), start, Exception('bar'), 'bird'
            return parser.TokenList([value]), start + len(value)
        value, *start = callspec.args
        warnings = []
        # No start argument means the call uses the old API.
        if start == []:
            warnings += [
                (r"(?=.*'foo')(?=.*API)(?=.*has changed)", DeprecationWarning)
                ]
        if warning:
            warnings += [('bird', DeprecationWarning)]
        if exception:
            exceptioncheck = self.assertRaisesRegex(Exception, 'bar')
        else:
            # ExitStack() serves as a do-nothing context manager.
            exceptioncheck = ExitStack()
        with exceptioncheck:
            with check_all_warnings(*warnings):
                tl, rest = callspec(foo)
        if exception:
            return
        start = start[0] if start else 0
        self.assertEqual(tl, parser.TokenList([value]))
        # A string remainder (old API) is converted to the equivalent index.
        rest = (len(value) - len(rest)) if hasattr(rest, 'encode') else rest
        self.assertEqual(rest, start + len(tl[0]))

    # XXX XXX _deprecate will go away by the end of refactoring.

    def test__deprecate_no_arg(self):
        # Used bare, _deprecate warns but still calls through.
        @parser._deprecate
        def t(a, b):
            return a, b
        with self.assertWarnsRegex(
                DeprecationWarning,
                r"(?=.*'t'.*is deprecated)",
                ):
            self.assertEqual(t(1, 2), (1, 2))

    def test__deprecate_with_arg(self):
        # Given an argument, the warning must also mention that argument.
        @parser._deprecate('t2')
        def t(a, b):
            return a, b
        with self.assertWarnsRegex(
                DeprecationWarning,
                r"(?=.*'t'.*is deprecated)(?=.*t2)",
                ):
            self.assertEqual(t(1, 2), (1, 2))

# XXX POSTDEP: ...to here


class TestTokenList(TestEmailBase):

    # Tests of how TokenList combines the defects and ew_indexes lists of
    # the token lists added to it.

    @params(
        none_none = C([], []),
        one_none = C([errors.InvalidHeaderDefect('a')], []),
        none_one = C([], [errors.InvalidHeaderDefect('b')]),
        one_one = C(
            [errors.InvalidHeaderDefect('a')],
            [errors.InvalidHeaderDefect('b')],
            ),
        two_two = C(
            [errors.InvalidHeaderDefect('a'), errors.NonPrintableDefect('y')],
            [errors.NonPrintableDefect('b'), errors.InvalidHeaderDefect('z')],
            ),
        )
    def test_extend_copies_defects(self, existing, new):
        # After extend the defects are the existing ones followed by those
        # of the list that was added.
        tl1 = parser.TokenList()
        tl1.defects.extend(existing)
        tl2 = parser.TokenList(['fake', 'values'])
        tl2.defects.extend(new)
        tl1.extend(tl2)
        self.assertEqual(tl1.defects, existing + new)

    def test_extend_with_non_token_list_leaves_defects_unchanged(self):
        tl = parser.TokenList()
        defects = [errors.InvalidHeaderDefect('a')]
        tl.defects.extend(defects)
        tl.extend(['fake', 'values'])
        self.assertEqual(tl.defects, defects)

    # Mapper that runs each callspec once for each TokenList method name.
    for_each_method = for_each_name('append', 'extend', 'push')

    @params(
        for_each_method(
            none_none = C([], []),
            one_none = C([1], []),
            none_one = C([], [20]),
            one_one = C([1], [20]),
            two_two = C([1, 20], [27, 40]),
            )
        )
    def test_ew_indexes(self, method, existing, new):
        # push is expected to put the new indexes first; append and extend
        # are expected to put them last.
        expected = new + existing if method == 'push' else existing + new
        tl1 = parser.TokenList()
        tl1.ew_indexes = list(existing)
        tl2 = parser.TokenList(['fake', 'values'])
        tl2.ew_indexes = list(new)
        getattr(tl1, method)(tl2)
        self.assertEqual(tl1.ew_indexes, expected)

    @params(for_each_method(C([1, 20])))
    def test_non_token_list_leaves_ew_indexes_unchanged(self, method, idxs):
        tl1 = parser.TokenList()
        tl1.ew_indexes = idxs
        getattr(tl1, method)(['fake', parser.Terminal('values', 'fake')])
        self.assertEqual(tl1.ew_indexes, idxs)
+ + If exception has a value, assert that using callspec to call method + raises the exception that must be the first element of the value tuple with + a string value that matches the regex that must be the second element + of the value tuple. + + Otherwise use the (possibly modified) callspec to call the method, + capturing its return value, which should either be a single Terminal or + TokenList, or a tuple whose first element is a Terminal or TokenList. + + If no_end is True, assert that the return value was not a tuple or its + second value was not an integer. + + If warnings has a value, use it as the argument value to a + check_all_warnings assert around the callspec call. + + If pprint is true, print the result of calling the ppstr method of the returned object. + + If the return value is not a singleton and the second element of + the return value is an integer, use it, modified by the length of + the prefix if test_start is true, to assert that the unparsed + remainder matches the value of 'remainder'. + + Assert that str called on the returned object matches the value + of stringified, or the characters from start to end or the end + of the string if stringified is None. + + Assert that the value attribute of the returned object matches + value, or stringified if value is None. + + Assert that the comments attribute of the returned object matches + comments. + + If commenttree is not None, assert that the comment tree of the + returned object matches it. XXX commenttree is an internal testing + hack, a real API is needed some day. + + Assert that the defects attribute of the returned object matches + defects. + + Assert that the ew_indexes attribute of the returned object matches + ew_indexes. + + Return whatever the called method returned. + + """ + s, *args = callspec.args + base = s[:-len(remainder)] if remainder else s + prefix_len = 0 + if test_start: + # XXX I'm not at all sure the overhead of this randomization is worth + # it. We do at least need to test having a prefix though... 
+ prefix_len = randint(1, 20) + prefix = ''.join(choices(ALL_ASCII, k=prefix_len)) + kw = dict(callspec.kw) + callspec = C(prefix + s, prefix_len, *args, **kw) + # XXX POSTDEP: Change this if to do only what's in the else clause. + if warnings is ...: + warningscheck = ExitStack() + else: + warnings = [(x[1], x[0]) for x in warnings] if warnings else [] + warningscheck = check_all_warnings(*warnings) + if exception: + with warningscheck: + with self.assertRaisesRegex(exception[0], exception[1]): + callspec(method) + return + stringified = base if stringified is None else stringified + value = stringified if value is None else value + comments = [] if comments is None else comments + defects = [] if defects is None else defects + with warningscheck: + result = callspec(method) + if result is None: + return + if isinstance(result, (parser.TokenList, parser.Terminal)): + other = [] + else: + result, *other = result + if pprint: + print(f'\n{result.ppstr()}') + # XXX POSTDEP: remove str from this 'if' + if other and isinstance(other[0], (int, str)): + if no_end: + self.fail( + "It looks like the function incorrectly returned an" + " end of parsing pointer" + ) + # a get_x method that returns a remainder or pointer. + actual_remainder, *other = other + if isinstance(actual_remainder, int): + if test_start: + actual_remainder -= prefix_len + actual_remainder = s[actual_remainder:] + self.assertEqual(actual_remainder, remainder) + self.assertEqual(str(result), stringified) + if isinstance(result, parser.TokenList): + self.assertEqual(result.value, value) + self.assertDefectsMatch(result.all_defects, defects) + # XXX XXX at the end of the refactor get rid of this conditional. 
+ if comments != ...: + self.assertEqual(result.comments, comments) + if commenttree is not None: + self.assertEqual(self.ctree(result), commenttree) + if ew_indexes is not ...: + self.assertEqual( + [x - prefix_len for x in result.ew_indexes], + ew_indexes, + ) + return (result, *other) if other else result + + def verify_terminal_types(self, tl, *text_types): + """Raise error if token_type of any Terminal is not in text_types.""" + self.assertIsInstance(tl, (parser.Terminal, parser.TokenList)) + if isinstance(tl, parser.Terminal): + self.assertIn(tl.token_type, text_types, repr(tl)) + elif isinstance(tl, parser.TokenList): + for t in tl: + # Some functions return a TokenList, but there should never be + # a plain TokenList anywhere deeper. This will catch failures + # to use 'extend' when consuming a returned TokenList. + self.assertIsNotNone(t.token_type, t) + self.verify_terminal_types(t, *text_types) + + def ctree(self, tl, cnt=0): + """Return a testing-adequate depiction of the nested comments""" + if isinstance(tl, parser.Comment): + return self._ctree(tl) + comments = [] + for t in tl: + if isinstance(t, parser.Comment): + comments.append(self._ctree(t)) + elif isinstance(t, parser.TokenList): + comments.extend(self.ctree(t)) + return comments + + def _ctree(self, tl): + comments = [] + empty = True + text = '' + for t in tl: + if isinstance(t, parser.Comment): + if text: + comments.append(text) + text = '' + comments.append(self._ctree(t)) + empty = False + else: + text += str(t) + if text or empty: + comments.append(text) + return comments + + +# XXX XXX temporary step-wise refactoring tool, goes away at end of refactor. +@params_map(with_namelist=True) +def old_api_only(nl, *args, **kw): + if 'newapi' in nl: + return + kw['warnings'] = ... # Ignore pre-refactoring warnings. + kw.setdefault('test_start', False) + yield '' if 'oldapi' in nl else 'oldapionly', C(*args, **kw) + +# XXX POSTDEP: Delete this params_map and replace calls to it with params_set. 
+@params_map(with_namelist=True) +def for_each_api(nl, *args, **kw): + if nl.has_any('oldapi', 'newapi'): + # Reused tests; they've been through here before. + yield '', C(*args, **kw) + return + yield 'newapi', C(*args, **kw) + kw['warnings'] = kw.get('warnings', []) + [ + (DeprecationWarning, r'.*API.*has changed') + ] + yield 'oldapi', C(*args, **kw, test_start=False) -class TestParser(TestParserMixin, TestEmailBase): - # _wsp_splitter +class TestParser(TestParserMixin, TestEmailBase): rfc_printable_ascii = bytes(range(33, 127)).decode('ascii') - rfc_atext_chars = (string.ascii_letters + string.digits + - "!#$%&\'*+-/=?^_`{}|~") rfc_dtext_chars = rfc_printable_ascii.translate(str.maketrans('','',r'\[]')) - def test__wsp_splitter_one_word(self): - self.assertEqual(parser._wsp_splitter('foo', 1), ['foo']) + # XXX POSTDEP: delete from here... + # + # _wsp_splitter + + @params + def test__wsp_splitter(self, s, res): + self.assertEqual(parser._deprecated__wsp_splitter(s, 1), res) + + params_test__wsp_splitter = Params( + one_word = C('foo', ['foo']), + two_words = C('foo def', ['foo', ' ', 'def']), + ws_runs = C('foo \t def jik', ['foo', ' \t ', 'def jik']), + ) + + # XXX POSTDEP: ...to here + + + # _make_xtext + + @params + def test__make_xtext( + self, + s, + terminal_class=parser.ValueTerminal, + token_type='test', + **kw, + ): + vt = self._test_parse( + parser._make_xtext, + C(s, terminal_class, token_type), + stringified=('' if terminal_class.__name__.startswith('EW') + else None), + value=' ' if terminal_class.__name__.startswith('White') else None, + test_start=False, + **kw, + ) + self.assertEqual(vt.token_type, token_type) + + @params_map + def for_each_terminal_type(*args, **kw): + vt_types = ( + parser.ValueTerminal, + parser.WhiteSpaceTerminal, + parser.EWWhiteSpaceTerminal, + ) + for vt_type in vt_types: + yield vt_type.__name__, C(*args, **kw, terminal_class=vt_type) + + params_test__make_xtext = for_each_terminal_type( + + token_type = C('foo', 
token_type='bar'), + + # XXX POSTDEP: delete from here... + + ) + + + # _validate_xtext + + @params + def test__validate_xtext(self, s, defects=[]): + vt = parser.ValueTerminal(s, 'test') + parser._validate_xtext(vt) + self.assertDefectsMatch(vt.defects, defects) + + params_test__validate_xtext = Params( + + # XXX POSTDEP: ...to here + + valid = C('foo'), + + # Although it looks a bit odd for unicode to be acceptable when we have + # a non-ascii error, the parser in fact handles unicode. + unicode = C('föö'), + + # The non-ascii error arises only if the input was supposed to be 7-bit + # ASCII but in fact had non-ascii in it, in which case those bytes end + # up as surrogates. Thus the name of the defect. + surrogates = C( + 'föö'.encode().decode('ascii', 'surrogateescape'), + # "Non-ASCII characters found in header token" + defects=[undecodable_bytes_defect], + ), + + multiple_nps = C( + 'a\ttab spaces and\rcarriage return', + defects=[(nonprintable_defect, '\t \r ')], + ), + + nps_and_surrogates = C( + 'föö\t'.encode().decode('ascii', 'surrogateescape'), + defects=[undecodable_bytes_defect, nonprintable_defect('\t')], + ), + + **for_each_character(RFC_NONPRINTABLES)( + non_printable = C( + 'f{char}o', + defects=[(nonprintable_defect, '{char}')], + ), + ), - def test__wsp_splitter_two_words(self): - self.assertEqual(parser._wsp_splitter('foo def', 1), - ['foo', ' ', 'def']) + ) - def test__wsp_splitter_ws_runs(self): - self.assertEqual(parser._wsp_splitter('foo \t def jik', 1), - ['foo', ' \t ', 'def jik']) + # XXX POSTDEP: delete from here... + params_test__make_xtext.update( + add_label('from_test_validate_xtext')( + for_each_terminal_type(params_test__validate_xtext), + ), + ) + # XXX POSTDEP: ...to here. 
+ + + # _get_xtext + + @params + def test__get_xtext( + self, + s, + # DOTALL allows the LF in our first test set to pass...in the + # normal use of _get_xtext LF will terminate the matches we use, + # leaving the LF (which shouldn't normally happen) for later code. + regex=re.compile('.*', re.DOTALL), + terminal_class=parser.ValueTerminal, + token_type='test', + err=None, + **kw, + ): + vt = self._test_parse( + parser._get_xtext, + C(s, regex, terminal_class, token_type, err=err), + stringified=('' if terminal_class.__name__.startswith('EW') + else None), + value=' ' if terminal_class.__name__.startswith('White') else None, + **kw, + ) + if 'exception' in kw: + return + self.assertEqual(vt.token_type, token_type) + + params_test__get_xtext__regex = Params( + + params_test__make_xtext, + + raises_on_no_match = C( + 'foo bar', + regex=re.compile(r'x'), + err=Exception('foo'), + exception=(Exception, 'foo'), + ), + + returns_match = C( + 'foo bar', + regex=re.compile(r'[^ ]+'), + remainder=' bar', + ), + + ignores_non_printable_after_match = C( + 'foobar\x00', + regex=re.compile(r'[^b]+'), + remainder='bar\x00', + ), + + **for_each_character(RFC_WSP + '()')( + regex_from_make_non_match_re = C( + 'foo{char}bar', + regex=parser._make_non_match_re(RFC_WSP + '()'), + remainder='{char}bar', + ), + ), + + ) # get_fws - def test_get_fws_only(self): - fws = self._test_get_x(parser.get_fws, ' \t ', ' \t ', ' ', [], '') + @params + def test_get_fws(self, s, *args, **kw): + fws = self._test_parse(parser.get_fws, C(s), *args, value=' ', **kw) + if 'exception' in kw: + return + self.assertIsInstance(fws, parser.WhiteSpaceTerminal) self.assertEqual(fws.token_type, 'fws') - def test_get_fws_space(self): - self._test_get_x(parser.get_fws, ' foo', ' ', ' ', [], 'foo') + # XXX POSTDEP: delete from here... 
+ @params_map + def deprecate_oldapi_no_raise_behavior(*args, **kw): + kw['warnings'] = kw.get('warnings', []) + [ + (DeprecationWarning, r'.*API.*has changed'), + (DeprecationWarning, r'(?i).*raise'), + ] + yield 'oldapi', C(*args, **kw, test_start=False) + # XXX POSTDEP: ...to here. - def test_get_fws_ws_run(self): - self._test_get_x(parser.get_fws, ' \t foo ', ' \t ', ' ', [], 'foo ') + params_test_get_fws = for_each_api( - # get_encoded_word + wsp_run = C(' \t '), - def test_get_encoded_word_missing_start_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('abc') + **for_each_character(RFC_WSP)( + ends_at_non_wsp_after_wsp = C('{char}foo', remainder='foo'), + ), - def test_get_encoded_word_missing_end_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('=?abc') + **for_each_character(RFC_PRINTABLES)( + ends_at_non_wsp_after_wsp_run = C(' \t{char} ', remainder='{char} '), + ), - def test_get_encoded_word_missing_middle_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('=?abc?=') + # XXX POSTDEP: delete from here... + ) + + # These ought to error, but get_fws should never be called this way. + # We'll deprecate the lack of raise during the refactor. + params_test_get_fws.update( + deprecate_oldapi_no_raise_behavior( + empty = C(''), + no_wsp = C('foo', remainder='foo'), + no_leading_wsp = C('foo bar', remainder='foo bar'), + ), + ) + + params_test_get_fws.update( + add_label('newapi')( + # XXX POSTDEP: ... to here. And fix the indentation below. 
+ **params_map( + lambda s, **k: only( + C(s, exception=(errors.HeaderParseError, '(?i)expected')) + ) + )( + empty='', + no_wsp='foo', + no_leading_wsp='foo bar', + ) + ), # XXX POSTDEP: delete this line + + ) + + + # get_encoded_word + + @params + def test_get_encoded_word( + self, + s, + *args, + charset='us-ascii', + lang='', + # XXX POSTDEP: delete the following line: + terminal_type=None, + # XXX POSTDEP: uncomment following line: + # terminal_type='ttext', + prefix=None, + expect_none=False, + decode_qp=False, + **kw, + ): + # XXX POSTDEP: delete from here... + if 'warnings' in kw: + # old api + callspec = C(s) if terminal_type is None else C(s, terminal_type) + terminal_type = terminal_type or 'vtext' + if (r := kw.get('remainder')) and r[0] not in RFC_WSP: + kw['defects'] = kw.get('defects', []) + [ + missing_whitespace_after_ew_defect, + ] + else: + terminal_type = terminal_type or 'ttext' + callspec = C(s, terminal_type) + callspec.kw['decode_qp'] = decode_qp + # XXX POSTDEP: ...to here + # XXX POSTDEP: uncomment the following line: + #callspec = C(s, terminal_type, decode_qp=decode_qp) + ew = self._test_parse(parser.get_encoded_word, callspec, *args, **kw) + if 'exception' in kw: + return + if expect_none: + self.assertIsNone(ew) + return + self.assertEqual(ew.charset, charset) + self.assertEqual(ew.lang, lang) + self.verify_terminal_types(ew, terminal_type, 'fws') + + # This params_map will handle either single strings or C objects. + @params_map + def invalid_encoded_words(v, *args, **kw): + # XXX POSTDEP: change 'newapi' to '' in the next line. + yield 'newapi', C(v, expect_none=True) + # XXX POSTDEP: delete from here... 
+ newspec = C( + v, + *args, + # "expected encoded word but found '...'" + exception=(errors.HeaderParseError, re.escape(v)), + warnings=[(DeprecationWarning, r"(?=.*API)(?=.*has changed)")], + test_start=False, + **kw, + ) + yield 'oldapi', newspec + # XXX POSTDEP: ...to here + + params_test_get_encoded_word__invalid_input = invalid_encoded_words( + null_string = '', + no_chrome = 'content', + eq_only = '=content', + start_chrome_only = '=?', + start_and_charset_only = '=?UTF-8', + start_charset_qm_only = '=?UTF-8?', + start_charset_qm_cte_only = '=?UTF-8?q', + start_charset_qm_cte_qm_only = '=?UTF-8?q?', + start_charset_qm_cte_qm_content_only = '=?UTF-8?q?content', + start_charset_qm_cte_qm_content_qm_only = '=?UTF-8?q?content?', + end_eq_only = 'content=', + end_chrome_only = '?=', + end_and_content_only = 'content?=', + end_content_eq_only = '?content?=', + end_content_eq_cte_only = 'q?content?=', + end_content_eq_cte_eq_only = '?q?content?=', + end_content_eq_cte_eq_charset_only = 'UTF-8?q?content?=', + end_content_eq_cte_eq_charset_eq_only = '?UTF-8?q?content?=', + missing_both_middle = '=?content?=', + missing_one_middle = '=?q?content?=', + empty_cte = '=UTF-8??content?=', + empty_charset_and_cte = '=???content?=', + empty_everything = '=????=', + unknown_cte = '=?UTF-8?X?content?=', + invalid_base64_length = '=?utf-8?b?abcde?=', + multicharacter_cte = '=?UTF-8?qq?content?=', + empty_lang = '=?UTF-8*??q?content?=', + lang_with_empty_charset = '=?*foo??q?content?=', + **for_each_character(ALL_ASCII)( + character_before_valid_ew = C('{char}=?us-ascii?q?test?='), + ), + ) + + # XXX POSTDEP: delete from here... + def test_get_encoded_word_old_api_supports_keywords(self): + self._test_parse( + parser.get_encoded_word, + C('=?UTF-8?q?foo?=', terminal_type='a'), + stringified='foo', + warnings=[(DeprecationWarning, r"(?=.*API)(?=.*has changed)")], + test_start=False, + ) + # XXX POSTDEP: ...to here. 
+ + params_test_get_encoded_word = for_each_api( + + valid_ew = C( + '=?us-ascii?q?this_is_a_test?= bird', + stringified='this is a test', + remainder=' bird', + ), + + **for_each_character(ALL_ASCII)( + ew_followed_by = C( + '=?us-ascii?q?foo?={char}', + stringified='foo', + remainder='{char}', + ), + ), + + # XXX some of these characters should result in defects depending on + # the context from which get_encoded_word is called (ex: ()s are + # illegal in comment encoded words), but at least at the moment + # it isn't worth the effort to implement that. + **for_each_character(RFC_PRINTABLES, skip='_')( + q_content_may_contain = C( + '=?us-ascii?q?foo_{char}_bar_{char}?=', + stringified='foo {char} bar {char}', + ) + ), + + internal_spaces = C( + '=?us-ascii?q?this is a test?= bird', + stringified='this is a test', + # 'whitespace inside encoded word' + defects=[whitespace_inside_ew_defect], + remainder=' bird', + ), + + only_gets_first_ew = C( + '=?us-ascii?q?first?= =?utf-8?q?second?=', + stringified='first', + remainder=' =?utf-8?q?second?=', + ), + + only_gets_first_ew_even_if_no_space = C( + '=?us-ascii?q?first?==?utf-8?q?second?=', + stringified='first', + remainder='=?utf-8?q?second?=', + ), + + lang_set = C( + '=?us-ascii*jive?q?first_second?=', + stringified='first second', + lang='jive', + ), + + utf8_charset = C( + '=?utf-8?q?first_second?=', + stringified='first second', + charset='utf-8', + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_defect = C( + '=?us-ascii?q?first{char}second?=', + stringified='first{char}second', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + # Note that other characters may work as well, but these *must* work. 
+ **for_each_character(RFC_CHARSET_CHARS)( + char_valid_in_charset_name = C( + '=?a_bad_{char}set_name?q?foo?=', + stringified='foo', + defects=[(charset_defect('a_bad_{echar}set_name'))], + charset='a_bad_{char}set_name', + ), + ), + + leading_internal_encoded_space = C( + '=?us-ascii?q?=20foo?=', + stringified=' foo', + ), + + leading_internal_unencoded_space = C( + '=?us-ascii?q? foo?=', + stringified=' foo', + defects=[whitespace_inside_ew_defect], + ), + + trailing_internal_encoded_space = C( + '=?us-ascii?q?foo=20_?= bird', + stringified='foo ', + value='foo ', + remainder=' bird', + ), + + trailing_internal_unencoded_space = C( + '=?us-ascii?q?foo _ ?= bird', + stringified='foo ', + value='foo ', + defects=[whitespace_inside_ew_defect], + remainder=' bird', + ), - def test_get_encoded_word_invalid_cte(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_encoded_word('=?utf-8?X?somevalue?=') - - def test_get_encoded_word_valid_ew(self): - self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?this_is_a_test?= bird', - 'this is a test', - 'this is a test', - [], - ' bird') - - def test_get_encoded_word_internal_spaces(self): - self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?this is a test?= bird', - 'this is a test', - 'this is a test', - [errors.InvalidHeaderDefect], - ' bird') - - def test_get_encoded_word_gets_first(self): - self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?first?= =?utf-8?q?second?=', - 'first', - 'first', - [], - ' =?utf-8?q?second?=') - - def test_get_encoded_word_gets_first_even_if_no_space(self): - self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?first?==?utf-8?q?second?=', - 'first', - 'first', - [errors.InvalidHeaderDefect], - '=?utf-8?q?second?=') - - def test_get_encoded_word_sets_extra_attributes(self): - ew = self._test_get_x(parser.get_encoded_word, - '=?us-ascii*jive?q?first_second?=', - 'first second', - 'first second', - [], - '') - self.assertEqual(ew.charset, 'us-ascii') - 
self.assertEqual(ew.lang, 'jive') - - def test_get_encoded_word_lang_default_is_blank(self): - ew = self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?first_second?=', - 'first second', - 'first second', - [], - '') - self.assertEqual(ew.charset, 'us-ascii') - self.assertEqual(ew.lang, '') - - def test_get_encoded_word_non_printable_defect(self): - self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?first\x02second?=', - 'first\x02second', - 'first\x02second', - [errors.NonPrintableDefect], - '') - - def test_get_encoded_word_leading_internal_space(self): - self._test_get_x(parser.get_encoded_word, - '=?us-ascii?q?=20foo?=', - ' foo', - ' foo', - [], - '') - - def test_get_encoded_word_quopri_utf_escape_follows_cte(self): # Issue 18044 - self._test_get_x(parser.get_encoded_word, - '=?utf-8?q?=C3=89ric?=', - 'Éric', - 'Éric', - [], - '') - - # get_unstructured - - def _get_unst(self, value): - token = parser.get_unstructured(value) - return token, '' - - def test_get_unstructured_null(self): - self._test_get_x(self._get_unst, '', '', '', [], '') - - def test_get_unstructured_one_word(self): - self._test_get_x(self._get_unst, 'foo', 'foo', 'foo', [], '') - - def test_get_unstructured_normal_phrase(self): - self._test_get_x(self._get_unst, 'foo bar bird', - 'foo bar bird', - 'foo bar bird', - [], - '') - - def test_get_unstructured_normal_phrase_with_whitespace(self): - self._test_get_x(self._get_unst, 'foo \t bar bird', - 'foo \t bar bird', - 'foo bar bird', - [], - '') - - def test_get_unstructured_leading_whitespace(self): - self._test_get_x(self._get_unst, ' foo bar', - ' foo bar', - ' foo bar', - [], - '') - - def test_get_unstructured_trailing_whitespace(self): - self._test_get_x(self._get_unst, 'foo bar ', - 'foo bar ', - 'foo bar ', - [], - '') - - def test_get_unstructured_leading_and_trailing_whitespace(self): - self._test_get_x(self._get_unst, ' foo bar ', - ' foo bar ', - ' foo bar ', - [], - '') - - def 
test_get_unstructured_one_valid_ew_no_ws(self): - self._test_get_x(self._get_unst, '=?us-ascii?q?bar?=', - 'bar', - 'bar', - [], - '') - - def test_get_unstructured_one_ew_trailing_ws(self): - self._test_get_x(self._get_unst, '=?us-ascii?q?bar?= ', - 'bar ', - 'bar ', - [], - '') - - def test_get_unstructured_one_valid_ew_trailing_text(self): - self._test_get_x(self._get_unst, '=?us-ascii?q?bar?= bird', - 'bar bird', - 'bar bird', - [], - '') - - def test_get_unstructured_phrase_with_ew_in_middle_of_text(self): - self._test_get_x(self._get_unst, 'foo =?us-ascii?q?bar?= bird', - 'foo bar bird', - 'foo bar bird', - [], - '') - - def test_get_unstructured_phrase_with_two_ew(self): - self._test_get_x(self._get_unst, + quopri_utf_escape_follows_cte = C( + '=?utf-8?q?=C3=89ric?=', + stringified='Éric', + charset='utf-8', + ), + + unknown_charset_leads_to_undecodable_bytes_with_non_ascii = C( + '=?invalid?q?=C3=89ric?=', + stringified='\udcc3\udc89ric', + charset='invalid', + defects=[charset_defect('invalid'), undecodable_bytes_defect], + ), + + empty_charset = C( + '=??q?content?=', + stringified='content', + charset='', + defects=[charset_defect('')], + ), + + missing_base64_padding = C( + '=?us-ascii?b?dmk?=', + stringified='vi', + defects=[invalid_base64_padding_defect], + ), + + + invalid_base64_character = C( + '=?us-ascii?b?dm\x01k===?=', + stringified='vi', + defects=[invalid_base64_characters_defect], + ), + + invalid_base64_character_and_bad_padding = C( + '=?us-ascii?b?dm\x01k?=', + stringified='vi', + defects=[ + invalid_base64_padding_defect, + invalid_base64_characters_defect, + ], + ), + + ws_only_charset_leads_to_undecodable_bytes_with_non_ascii = C( + '=? 
* ?q?=C3=89ric?=', + stringified='\udcc3\udc89ric', + charset='', + defects=[ + charset_defect(''), + undecodable_bytes_defect, + whitespace_inside_ew_defect, + ], + ), + + eq_is_only_special_with_two_digits_after_it = C( + '=?UTF-8?q?=C3=89ric_=_?=', + stringified='Éric = ', + charset='UTF-8', + ), + + ws_around_charset_and_lang = C( + '=? us-ascii\t* jive\t ?q?test?= bird', + stringified='test', + lang='jive', + defects=[whitespace_inside_ew_defect], + remainder=' bird', + ), + + set_terminal_type_on_single_word_content = C( + '=?us-ascii?q?text?=', + stringified='text', + terminal_type='test', + ), + + set_terminal_type_on_multiple_word_content = C( + '=?us-ascii?q?text_and_more_text?=', + stringified='text and more text', + terminal_type='test', + ), + + qp_true_no_qp = C( + r'=?us-ascii?q?test?=', + decode_qp=True, + stringified=r'test', + ), + + qp_true_with_qp = C( + r'=?us-ascii?q?tes\t?=', + decode_qp=True, + stringified='test', + ), + + qp_false_with_qp = C( + r'=?us-ascii?q?tes\t?=', + decode_qp=False, + stringified=r'tes\t', + ), + + ) + + + # content_getter + + @params + def test_content_getter( + self, + s, + *args, + start=0, + tl_class=parser.TokenList, + text_type='ttext', + end_chars='', + qp=False, + **kw, + ): + result = self._test_parse( + parser.content_getter( + tl_class, + text_type, + end_chars=end_chars, + qp=qp, + ), + C(s, start), + *args, + test_start=False, + **kw, + ) + if 'exception' in kw: + return + self.assertIsInstance(result, tl_class) + self.verify_terminal_types(result, text_type, 'fws') + + @params_map + def for_each_endchar_set(*args, **kw): + # The function is general, but these are the ones we actually use. 
+ endchar_sets = dict( + quoted_string='"', + comment='()', + domain_literal='[]', + ) + for name, end_chars in endchar_sets.items(): + yield name, C(*args, end_chars=end_chars, **kw) + + @params_map + def for_each_endchar(*args, **kw): + return for_each_character(kw['end_chars'])(C(*args, **kw)).items() + + # This params_map is used on exactly one expression, which has to contain a + # list of characters with no repeats. + @params_map + def stops_at_first_endchar_found(s): + for i in range(len(s)): + end_chars = ''.join(sample((r := s[i:]), len(r))) + ec = charname(s[i]) + yield f'stops_at_first_endchar_found__string__{ec}', C( + s, + end_chars=end_chars, + remainder=r, + ) + yield f'stops_at_first_endchar_found__set__{ec}', C( + s, + end_chars=set(end_chars), + remainder=r, + ) + + params_test_content_getter = Params( + + specified_tl_class = C( + 'word', + stringified='"word"', + value='word', + tl_class=parser.BareQuotedString, + ), + + text_type_ew = C( + 'A test =?UTF-8?q?foo?= ', + stringified='A test foo ', + text_type='fake', + ew_indexes = [7], + ), + + text_type_ew_missing_ws = C( + 'Never=?utf8?q?_foo_bar_?=do this', + stringified='Never foo bar do this', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + text_type='fake', + ew_indexes = [5], + ), + + text_type_no_ew_unicode = C( + 'A test Éric', + text_type='fake', + ), + + **for_each_character(ALL_ASCII)( + char_after_end_char = C( + '" a test "{char}', + start=1, + end_chars='"', + stringified=' a test ', + remainder='"{char}', + ), + ), + + start_in_middle_of_ew = C( + '=?UTF-8?q?foo?=', + start=3, + stringified='=?UTF-8?q?foo?='[3:], + ), + + end_in_middle_of_ew = C( + 'foo =?UTF-8?q?foo', + ), + + end_char = C( + '"foo"', + start=1, + end_chars='"', + stringified='foo', + remainder='"', + ), + + end_char_at_start = C( + '"foo"', + start=0, + end_chars='"', + stringified='', + remainder='"foo"', + ), + + no_end_char = C( + 'foo bar', + start=0, + 
end_chars='"', + stringified='foo bar', + ), + + end_char_inside_ew = C( + '"quoted =?UTF-8?q?q"?=" not', + start=1, + end_chars='"', + stringified='quoted q"', + remainder='" not', + ew_indexes = [8], + ), + + first_end_char_ends_parse = C( + "(a comment)bar", + start=1, + end_chars="()", + stringified="a comment", + remainder=')bar', + ), + + second_end_char_ends_parse = C( + "(a comment(nested))", + start=1, + end_chars="()", + stringified="a comment", + remainder='(nested))', + ), + + endchar_inside_ew_preserved = C( + r'"foo =?UTF-8?q?"bar?="', + start=1, + end_chars='"', + stringified='foo "bar', + remainder='"', + ew_indexes = [5], + ), + + qp_decoded_with_qp_true = C( + r"\fo\o", + qp=True, + stringified="foo", + ), + + qp_quoted_endchar_preserved_with_qp_true = C( + r'"foo\"bar"', + start=1, + end_chars='"', + qp=True, + stringified='foo"bar', + remainder='"', + ), + + qp_quoted_endchar_inside_ew_preserved_and_unquoted_with_qp_true = C( + r'"\foo =?UTF-8?q?\"bar?="', + start=1, + end_chars='"', + qp=True, + stringified='foo "bar', + remainder='"', + ew_indexes = [6], + ), + + qp_remains_quoted_if_qp_false = C( + r'"\foo\ =?UTF-8?q?\"bar?="', + start=1, + end_chars='"', + stringified=r'\foo\ \"bar', + qp=False, + remainder='"', + ew_indexes = [7], + ), + + # XXX POSTDEP: delete from here... + + ) + + + # _get_ptext_to_endchars + + # These tests are also passed by the replacement function, content_getter. 
+ + @params + def test__get_ptext_to_endchars(self, s, end_chars, qp=False, **kw): + ptext, had_qp = self._test_parse( + parser._get_ptext_to_endchars, + C(s, end_chars), + warnings=[ + (DeprecationWarning, '.*deprecated.*content_getter'), + ], + test_start=False, + **kw, + ) + self.assertEqual(had_qp, qp) + + params_test__get_ptext_to_endchars = Params( + + # XXX POSTDEP: ...to here + + **for_each_endchar( + wsp_can_be_legal_endchars = C( + 'foo{char}bar"', + end_chars='()' + RFC_WSP, + remainder='{char}bar"', + ), + ), + + **stops_at_first_endchar_found('(random?{})'), + + **for_each_endchar_set( + + one_word_no_wsp = C( + 'foo', + ), + + escaped_letter = C( + r'bar\s', + stringified='bars', + qp=True, + ), + + escaped_escape_char = C( + r'foo\\bar', + stringified=r'foo\bar', + qp=True, + ), + + any_printable_may_be_quoted = C( + ''.join(rf'\{c}' for c in RFC_PRINTABLES), + stringified=RFC_PRINTABLES, + qp=True, + ), + + ), + + **for_each_endchar( + for_each_endchar_set( + + stops_at_endchar = C( + 'foo{char}bar"', + remainder='{char}bar"', + ), + + quoted_endchar_no_actual_endchar = C( + r'foo\{char}bar', + stringified=r'foo{char}bar', + qp=True, + ), + + quoted_endchar_before_actual_endchar = C( + r'foo\{char}bar{char}', + stringified='foo{char}bar', + remainder='{char}', + qp=True, + ), + + multiple_qp = C( + r'\{char}\foo\\\{char}\a{char}', + stringified=r'{char}foo\{char}a', + remainder=r'{char}', + qp=True, + ), + + ), + ), + + ) + + # XXX POSTDEP: delete from here... + # As the replacement function for _get_ptext_to_endchars (among other + # things) content_getter needs to pass the _get_ptext_to_endchars tests, + # which test somewhat different scenarios than the other content_getter + # tests. + params_test_content_getter.update(params_test__get_ptext_to_endchars) + # XXX POSTDEP: ...to here + + + # parse_unstructured + + @params + def test_parse_unstructured(self, s, *args, **kw): + # We ignore kw_indexes, that's for content_getter. 
+ result = self._test_parse( + parser.parse_unstructured, + C(s), + *args, + test_start=False, + no_end=True, + **kw, + ) + self.assertIsInstance(result, parser.UnstructuredTokenList) + self.verify_terminal_types(result, 'utext', 'fws') + + # XXX POSTDEP: delete from here... + @params + def test_get_unstructured(self, s, *args, **kw): + result = self._test_parse( + parser.get_unstructured, + C(s), + *args, + test_start=False, + no_end=True, + warnings=[ + (DeprecationWarning, r".*is.*deprecated.*parse_unstructured"), + ], + **kw, + ) + self.assertIsInstance(result, parser.UnstructuredTokenList) + self.verify_terminal_types(result, 'utext', 'fws') + # XXX POSTDEP: ...to here + + # parse_unstructured should correctly decode anything get_encoded_word does, + # so it should correctly handle most get_encoded_word parameters. + @params_map(with_namelist=True) + def adapt_get_encoded_word_tests_for_parse_unstructured(nl, *args, **kw): + kw.pop('test_start', None) + kw.pop('charset', None) + kw.pop('terminal_type', None) + kw.pop('lang', None) + # parse_unstructured parses all of its input, so it will also parse and + # return anything get_encoded_word treats as a remainder. + remainder = kw.pop('remainder', '') + if '=?' in remainder or 'ew_followed_by' in nl: + # The remainder includes something parse_unstructured would decode, + # or might contain something it would treat as a defect. Either + # way, parse_unstructured isn't expected to handle those parameters. + return + if kw.pop('decode_qp', False): + # parse_unstructured does not unquote quoted printables, so skip + # the tests where they are decoded. 
+ return + if 'stringified' in kw: + stringified = kw['stringified'] + kw['stringified'] = stringified + remainder + rstripped = remainder.lstrip(RFC_WSP) + if remainder != rstripped: + kw['value'] = kw.get('value', stringified) + ' ' + rstripped + if 'oldapi' in nl: + # get_encoded_word is checking for warnings about its old api being + # deprecated, but parse_unstructured doesn't have an API change. + kw.pop('warnings') + yield 'from_test_get_encoded_word', C(*args, **kw) + + @params_map(with_namelist=True) + def adapt_get_encoded_word_invalid_input_for_parse_unstructured(nl, s, **kw): + # Get unstructured should return the inputs unaltered, + # except for the ones where the ew itself is valid. + if 'character_before_valid_ew' in nl: + return + yield 'from_test_get_encoded_word_invalid_input', C(s) + + @params_map + def add_unstructured_prefix_and_suffix(s, *args, **kw): + # Make sure the reused parameters are correctly interpreted when + # intermixed with other text by adding some text. + prefix = 'pre fix ' + pad = lambda s: f'{prefix}{s} suf fix' + if not s: + # null value is a special case, and we already have a test for it. + return + s = pad(s) + kw = {n: (pad(v) if n in ('stringified', 'value') else v) + for n, v in kw.items() + } + ew_indexes, len_prefix = [], len(prefix) + if s != kw.get('stringified', s): + ew_indexes = [len_prefix] + yield '', C(s, *args, ew_indexes=ew_indexes, **kw) + + # XXX POSTDEP: remove 'params_test_get_unstructured' from next line. 
+ params_test_get_unstructured = params_test_parse_unstructured = Params( + + add_unstructured_prefix_and_suffix( + adapt_get_encoded_word_tests_for_parse_unstructured( + params_test_get_encoded_word, + ), + adapt_get_encoded_word_invalid_input_for_parse_unstructured( + params_test_get_encoded_word__invalid_input, + ), + ), + + null = C( + '', + ), + + one_word = C( + 'foo', + ), + + normal_phrase = C( + 'foo bar bird', + ), + + normal_phrase_with_whitespace = C( + 'foo \t bar bird', + value='foo bar bird', + ), + + leading_whitespace = C( + ' foo bar', + value=' foo bar', + ), + + trailing_whitespace = C( + 'foo bar ', + value='foo bar ', + ), + + leading_and_trailing_whitespace = C( + ' foo bar ', + value=' foo bar ', + ), + + one_valid_ew_no_ws = C( + '=?us-ascii?q?bar?=', + stringified='bar', + value='bar', + ew_indexes = [0], + ), + + one_ew_trailing_ws = C( + '=?us-ascii?q?bar?= ', + stringified='bar ', + value='bar ', + ew_indexes = [0], + ), + + one_valid_ew_trailing_text = C( + '=?us-ascii?q?bar?= bird', + stringified='bar bird', + ew_indexes = [0], + ), + + phrase_with_ew_in_middle_of_text = C( + 'foo =?us-ascii?q?bar?= bird', + stringified='foo bar bird', + ew_indexes = [4], + ), + + phrase_with_two_ew = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?=', - 'foo barbird', - 'foo barbird', - [], - '') + stringified='foo barbird', + ew_indexes = [4, 23], + ), - def test_get_unstructured_phrase_with_two_ew_trailing_ws(self): - self._test_get_x(self._get_unst, + phrase_with_two_ew_trailing_ws = C( 'foo =?us-ascii?q?bar?= =?us-ascii?q?bird?= ', - 'foo barbird ', - 'foo barbird ', - [], - '') + stringified='foo barbird ', + value='foo barbird ', + ew_indexes = [4, 23], + ), - def test_get_unstructured_phrase_with_ew_with_leading_ws(self): - self._test_get_x(self._get_unst, + phrase_with_ew_with_leading_ws = C( ' =?us-ascii?q?bar?=', - ' bar', - ' bar', - [], - '') + stringified=' bar', + value=' bar', + ew_indexes = [2], + ), - def 
test_get_unstructured_phrase_with_two_ew_extra_ws(self): - self._test_get_x(self._get_unst, + phrase_with_two_ew_extra_ws = C( 'foo =?us-ascii?q?bar?= \t =?us-ascii?q?bird?=', - 'foo barbird', - 'foo barbird', - [], - '') + stringified='foo barbird', + ew_indexes = [4, 26], + ), - def test_get_unstructured_two_ew_extra_ws_trailing_text(self): - self._test_get_x(self._get_unst, + two_ew_extra_ws_trailing_text = C( '=?us-ascii?q?test?= =?us-ascii?q?foo?= val', - 'testfoo val', - 'testfoo val', - [], - '') + stringified='testfoo val', + value='testfoo val', + ew_indexes = [0, 22], + ), - def test_get_unstructured_ew_with_internal_ws(self): - self._test_get_x(self._get_unst, + ew_with_internal_ws = C( '=?iso-8859-1?q?hello=20world?=', - 'hello world', - 'hello world', - [], - '') + stringified='hello world', + ew_indexes = [0], + ), - def test_get_unstructured_ew_with_internal_leading_ws(self): - self._test_get_x(self._get_unst, + ew_with_internal_leading_ws = C( ' =?us-ascii?q?=20test?= =?us-ascii?q?=20foo?= val', - ' test foo val', - ' test foo val', - [], - '') + stringified=' test foo val', + value=' test foo val', + ew_indexes = [3, 28], + ), - def test_get_unstructured_invalid_ew(self): - self._test_get_x(self._get_unst, - '=?test val', - '=?test val', + invalid_ew = C( '=?test val', - [], - '') + ), - def test_get_unstructured_undecodable_bytes(self): - self._test_get_x(self._get_unst, + undecodable_bytes = C( b'test \xACfoo val'.decode('ascii', 'surrogateescape'), - 'test \uDCACfoo val', - 'test \uDCACfoo val', - [errors.UndecodableBytesDefect], - '') + stringified='test \uDCACfoo val', + value='test \uDCACfoo val', + defects=[undecodable_bytes_defect], + ), - def test_get_unstructured_undecodable_bytes_in_EW(self): - self._test_get_x(self._get_unst, + undecodable_bytes_in_EW = C( (b'=?us-ascii?q?=20test?= =?us-ascii?q?=20\xACfoo?=' b' val').decode('ascii', 'surrogateescape'), - ' test \uDCACfoo val', - ' test \uDCACfoo val', - 
[errors.UndecodableBytesDefect]*2, - '') - - def test_get_unstructured_missing_base64_padding(self): - self._test_get_x(self._get_unst, - '=?utf-8?b?dmk?=', - 'vi', - 'vi', - [errors.InvalidBase64PaddingDefect], - '') + stringified=' test \uDCACfoo val', + value=' test \uDCACfoo val', + defects=[ + undecodable_bytes_defect, + (undecodable_bytes_in_ew_defect, 'us-ascii'), + ], + ew_indexes = [0, 25], + ), - def test_get_unstructured_invalid_base64_character(self): - self._test_get_x(self._get_unst, - '=?utf-8?b?dm\x01k===?=', - 'vi', - 'vi', - [errors.InvalidBase64CharactersDefect], - '') - def test_get_unstructured_invalid_base64_character_and_bad_padding(self): - self._test_get_x(self._get_unst, - '=?utf-8?b?dm\x01k?=', - 'vi', - 'vi', - [errors.InvalidBase64CharactersDefect, - errors.InvalidBase64PaddingDefect], - '') - - def test_get_unstructured_invalid_base64_length(self): - # bpo-27397: Return the encoded string since there's no way to decode. - self._test_get_x(self._get_unst, - '=?utf-8?b?abcde?=', - 'abcde', - 'abcde', - [errors.InvalidBase64LengthDefect], - '') - - def test_get_unstructured_no_whitespace_between_ews(self): - self._test_get_x(self._get_unst, + no_whitespace_between_ews = C( '=?utf-8?q?foo?==?utf-8?q?bar?=', - 'foobar', - 'foobar', - [errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect], - '') - - def test_get_unstructured_ew_without_leading_whitespace(self): - self._test_get_x( - self._get_unst, + stringified='foobar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes = [0, 15], + ), + + ew_without_leading_whitespace = C( 'nowhitespace=?utf-8?q?somevalue?=', - 'nowhitespacesomevalue', - 'nowhitespacesomevalue', - [errors.InvalidHeaderDefect], - '') + stringified='nowhitespacesomevalue', + defects=[missing_whitespace_before_ew_defect], + ew_indexes = [12], + ), - def test_get_unstructured_ew_without_trailing_whitespace(self): - self._test_get_x( - self._get_unst, + 
ew_without_trailing_whitespace = C( '=?utf-8?q?somevalue?=nowhitespace', - 'somevaluenowhitespace', - 'somevaluenowhitespace', - [errors.InvalidHeaderDefect], - '') + stringified='somevaluenowhitespace', + defects=[missing_whitespace_after_ew_defect], + ew_indexes = [0], + ), - def test_get_unstructured_without_trailing_whitespace_hang_case(self): - self._test_get_x(self._get_unst, + # bpo-37764 + without_trailing_whitespace_hang_case = C( '=?utf-8?q?somevalue?=aa', - 'somevalueaa', - 'somevalueaa', - [errors.InvalidHeaderDefect], - '') - - def test_get_unstructured_invalid_ew2(self): - self._test_get_x(self._get_unst, - '=?utf-8?q?=somevalue?=', - '=?utf-8?q?=somevalue?=', + stringified='somevalueaa', + defects=[missing_whitespace_after_ew_defect], + ew_indexes = [0], + ), + + # Although this is technically invalid (unencoded =) we handle it anyway + # XXX there should be a defect, which is currently missing. + invalid_ew2 = C( '=?utf-8?q?=somevalue?=', - [], - '') + '=somevalue', + ew_indexes = [0], + ), + + **for_each_character(RFC_PRINTABLES)( + printable_around_and_between_ews = C( + '{char} =?utf-8?q?foo?= {char} =?utf-8?q?bar?= {char}', + stringified='{char} foo {char} bar {char}', + ew_indexes = [2, 20], + ), + ), + + **for_each_character(RFC_PRINTABLES, skip='_')( + printable_inside_ews = C( + '=?utf-8?q?rock{char}?= =?utf-8?q?{char}hard_place?=', + stringified='rock{char}{char}hard place', + ew_indexes = [0, 18], + ), + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_wsp_non_printable = C( + 'some {char} text', + stringified='some {char} text', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_wsp_non_printable_inside_ew = C( + '=?utf-8?q?some{char}?= text', + stringified='some{char} text', + defects=[(nonprintable_defect, '{char}')], + ew_indexes = [0], + ), + ), + + unicode = C( + '📦', + ), + + non_ascii_bytes = C( + '📦'.encode().decode('ascii', 'surrogateescape'), 
+ defects=[undecodable_bytes_defect], + ), + + invalid_ew_charset = C( + 'a =?invalid?q?=C3=89ric?= b', + stringified='a \udcc3\udc89ric b', + defects=[charset_defect('invalid'), undecodable_bytes_defect], + ew_indexes = [2], + ), + + ew_start_chrome_before_real_ew = C( + 'z=?xx =?UTF-8?Q?foo?=', + stringified='z=?xx foo', + ew_indexes = [6], + ), + + ) + + # content_getter and parse_unstructured must behave identically for all the + # data parse_unstructured handles. + params_test_content_getter__with_parse_unstructured_params = ( + params_test_parse_unstructured + ) + + + # get_ccontent_sequence + + @params + def test_get_ccontent_sequence(self, s, *args, **kw): + tl = self._test_parse( + parser.get_ccontent_sequence, + C(s), + *args, + **kw, + ) + self.assertIsInstance(tl, parser.TokenList) + self.verify_terminal_types(tl, 'ptext', 'fws') + + params_test_get_ccontent_sequence = Params( + + **for_each_character(RFC_WSP)( + two_words = C( + 'foo{char}de', + value='foo de', + ), + ), + + wsp_before_close_paren = C( + 'foo \t)', + value='foo ', + remainder=')', + ), + + up_to_open_paren_only = C( + 'foo(', + remainder='(', + ), + + wsp_before_open_paren = C( + 'foo \t(', + value='foo ', + remainder='(', + ), + + ew = C( + '=?UTF-8?q?test?=', + stringified='test', + ew_indexes=[0], + ), + + ws_around_ew = C( + ' =?UTF-8?q?test?= ', + stringified=' test ', + ew_indexes=[1], + ), + + ws_inside_ew = C( + '=?UTF-8?q? 
Test ?=', + stringified=' Test ', + defects=[whitespace_inside_ew_defect], + ew_indexes=[0], + ), + + non_ws_around_ew = C( + 'foo=?UTF-8?q?bar_?=bird', + stringified='foobar bird', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[3], + ), + + multiple_ew = C( + 'foo =?UTF-8?q?a?= =?UTF-8?q?t?=', + stringified='foo at', + ew_indexes=[4, 18], + ), + + ew_missing_whitespace_between_ews = C( + 'foo =?UTF-8?q?a?==?UTF-8?q?t?=', + stringified='foo at', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[4, 17], + ), + + **for_each_character(RFC_WSP)( + inter_ew_whitespace_handled_correctly = C( + '{char}=?UTF-8?q?_foo_?={char}{char}=?UTF-8?q?bar_?= ', + stringified='{char} foo bar ', + value=' foo bar ', + ew_indexes=[1, 20], + ), + ), + + qp_inside_ew = C( + r'=?UTF-8?q?\test\)_?= =?UTF-8?q?\(test?=', + stringified=r'test) (test', + ew_indexes=[0, 21], + ), + + unquoted_parens_inside_ew = C( + '=?UTF-8?q?test)_?= =?UTF-8?q?(test?=) foo', + stringified=r'test) (test', + remainder=') foo', + ew_indexes=[0, 19], + ), + + # XXX POSTDEP: delete from here... + + ) - def test_get_unstructured_invalid_ew_cte(self): - self._test_get_x(self._get_unst, - '=?utf-8?X?=somevalue?=', - '=?utf-8?X?=somevalue?=', - '=?utf-8?X?=somevalue?=', - [], - '') # get_qp_ctext - def test_get_qp_ctext_only(self): - ptext = self._test_get_x(parser.get_qp_ctext, - 'foobar', 'foobar', ' ', [], '') + @params + def test_get_qp_ctext(self, s, *args, value=' ', **kw): + ptext = self._test_parse( + parser._deprecated_get_qp_ctext, + C(s), + *args, + value=value, + warnings=..., + test_start=False, + **kw, + ) + self.assertIsInstance(ptext, parser.Terminal) self.assertEqual(ptext.token_type, 'ptext') - def test_get_qp_ctext_all_printables(self): - with_qp = self.rfc_printable_ascii.replace('\\', '\\\\') - with_qp = with_qp. 
replace('(', r'\(') - with_qp = with_qp.replace(')', r'\)') - ptext = self._test_get_x(parser.get_qp_ctext, - with_qp, self.rfc_printable_ascii, ' ', [], '') + params_test_get_qp_ctext__wsp_cases = Params( - def test_get_qp_ctext_two_words_gets_first(self): - self._test_get_x(parser.get_qp_ctext, - 'foo de', 'foo', ' ', [], ' de') + two_words_gets_first = C( + 'foo de', + remainder=' de', + ), - def test_get_qp_ctext_following_wsp_preserved(self): - self._test_get_x(parser.get_qp_ctext, - 'foo \t\tde', 'foo', ' ', [], ' \t\tde') + following_wsp_preserved = C( + 'foo \t\tde', + remainder=' \t\tde', + ), - def test_get_qp_ctext_up_to_close_paren_only(self): - self._test_get_x(parser.get_qp_ctext, - 'foo)', 'foo', ' ', [], ')') + wsp_before_close_paren_preserved = C( + 'foo )', + remainder=' )', + ), - def test_get_qp_ctext_wsp_before_close_paren_preserved(self): - self._test_get_x(parser.get_qp_ctext, - 'foo )', 'foo', ' ', [], ' )') + wsp_before_open_paren_preserved = C( + 'foo (', + remainder=' (', + ), - def test_get_qp_ctext_close_paren_mid_word(self): - self._test_get_x(parser.get_qp_ctext, - 'foo)bar', 'foo', ' ', [], ')bar') + ) - def test_get_qp_ctext_up_to_open_paren_only(self): - self._test_get_x(parser.get_qp_ctext, - 'foo(', 'foo', ' ', [], '(') + params_test_get_qp_ctext = Params( - def test_get_qp_ctext_wsp_before_open_paren_preserved(self): - self._test_get_x(parser.get_qp_ctext, - 'foo (', 'foo', ' ', [], ' (') + # XXX POSTDEP: ...to here. - def test_get_qp_ctext_open_paren_mid_word(self): - self._test_get_x(parser.get_qp_ctext, - 'foo(bar', 'foo', ' ', [], '(bar') + value_ends_at_input_end = C( + 'foobar', + ), + + all_printables = C( + RFC_PRINTABLES. 
+ replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)'), + stringified=RFC_PRINTABLES, + ), + + up_to_close_paren_only = C( + 'foo)', + remainder=')', + ), + + close_paren_mid_word = C( + 'foo)bar', + remainder=')bar', + ), + + up_to_open_paren_only = C( + 'foo(', + remainder='(', + ), + + open_paren_mid_word = C( + 'foo(bar', + remainder='(bar', + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printables = C( + 'foo{char}bar)', + defects=[(nonprintable_defect, '{char}')], + remainder=')', + ), + ), + + close_paren_only = C( + ')', + remainder=')', + ), + + open_paren_only = C( + '(', + remainder='(', + ), + + no_content = C( + '', + ), - def test_get_qp_ctext_non_printables(self): - ptext = self._test_get_x(parser.get_qp_ctext, - 'foo\x00bar)', 'foo\x00bar', ' ', - [errors.NonPrintableDefect], ')') - self.assertEqual(ptext.defects[0].non_printables[0], '\x00') + parens_are_content_if_quoted = C( + r'\(bar\)\)bird\(', + stringified='(bar))bird(', + ), - def test_get_qp_ctext_close_paren_only(self): - self._test_get_x(parser.get_qp_ctext, - ')', '', ' ', [], ')') + escapes_are_removed_in_str = C( + r'fairly\&\boring\W\@\!ks', + stringified='fairly&boringW@!ks', + ), - def test_get_qp_ctext_open_paren_only(self): - self._test_get_x(parser.get_qp_ctext, - '(', '', ' ', [], '(') + any_printable_may_be_escaped = C( + ''.join(rf'\{c}' for c in RFC_PRINTABLES), + RFC_PRINTABLES, + ), - def test_get_qp_ctext_no_end_char(self): - self._test_get_x(parser.get_qp_ctext, - '', '', ' ', [], '') + unicode_content = C( + '⛔❌❗', + ), + mixed_unicode_and_ascii = C( + 'ministry✌of⛔silly❌walks❗', + ), - # get_qcontent + unicode_can_be_quoted = C( + r'sillier\❌walks\❗', + stringified='sillier❌walks❗', + ), - def test_get_qcontent_only(self): - ptext = self._test_get_x(parser.get_qcontent, - 'foobar', 'foobar', 'foobar', [], '') - self.assertEqual(ptext.token_type, 'ptext') + ) - def test_get_qcontent_all_printables(self): - with_qp = 
self.rfc_printable_ascii.replace('\\', '\\\\') - with_qp = with_qp. replace('"', r'\"') - ptext = self._test_get_x(parser.get_qcontent, with_qp, - self.rfc_printable_ascii, - self.rfc_printable_ascii, [], '') + # XXX POSTDEP: delete from here... + # get_ccontent_sequence is handling a superset of what get_qp_ctext used to + # handle. It should pass this subset of get_qp_ctext tests that don't + # involve whitespace. + params_test_get_ccontent_sequence.update( + add_label('from_test_get_qp_ctext')(params_test_get_qp_ctext) + ) + # XXX POSTDEP: ...to here. - def test_get_qcontent_two_words_gets_first(self): - self._test_get_x(parser.get_qcontent, - 'foo de', 'foo', 'foo', [], ' de') - def test_get_qcontent_following_wsp_preserved(self): - self._test_get_x(parser.get_qcontent, - 'foo \t\tde', 'foo', 'foo', [], ' \t\tde') + # XXX POSTDEP: delete from here... + # + # get_qcontent - def test_get_qcontent_up_to_dquote_only(self): - self._test_get_x(parser.get_qcontent, - 'foo"', 'foo', 'foo', [], '"') + @params + def test_get_qcontent(self, s, *args, **kw): + ptext = self._test_parse( + parser.get_qcontent, + C(s), + *args, + test_start=False, + warnings=[ + (DeprecationWarning, r".*deprecated.*get_bare_quoted_string"), + (DeprecationWarning, r".*ptext.*deprecated"), + (DeprecationWarning, r".*validate.*deprecated"), + ], + **kw, + ) + self.assertIsInstance(ptext, parser.Terminal) + self.assertEqual(ptext.token_type, 'ptext') - def test_get_qcontent_wsp_before_close_paren_preserved(self): - self._test_get_x(parser.get_qcontent, - 'foo "', 'foo', 'foo', [], ' "') + params_test_get_qcontent = Params( - def test_get_qcontent_close_paren_mid_word(self): - self._test_get_x(parser.get_qcontent, - 'foo"bar', 'foo', 'foo', [], '"bar') + no_qp_no_end_char = C( + 'foobar', + ), + + all_printables = C( + RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"'), + stringified=RFC_PRINTABLES, + ), + + **for_each_character(RFC_WSP)( + two_words_gets_first = C( + 'foo{char}de', + 
remainder='{char}de', + ), + ), + + following_wsp_preserved = C( + 'foo \t\tde', + remainder=' \t\tde', + ), + + up_to_dquote_only = C( + 'foo"', + remainder='"', + ), + + wsp_before_dquote_preserved = C( + 'foo "', + remainder=' "', + ), + + dquote_mid_word = C( + 'foo"bar', + remainder='"bar', + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable = C( + 'foo{char}bar"', + defects=[(nonprintable_defect, '{char}')], + remainder='"', + ), + ), + + no_content_before_dquote = C( + '"', + remainder='"', + ), + + empty_value = C( + '', + ), - def test_get_qcontent_non_printables(self): - ptext = self._test_get_x(parser.get_qcontent, - 'foo\x00fg"', 'foo\x00fg', 'foo\x00fg', - [errors.NonPrintableDefect], '"') - self.assertEqual(ptext.defects[0].non_printables[0], '\x00') + ) - def test_get_qcontent_empty(self): - self._test_get_x(parser.get_qcontent, - '"', '', '', [], '"') + # XXX POSTDEP: ...to here. + + + # get_atext_sequence + + @params + def test_get_atext_sequence(self, s, *args, **kw): + tl = self._test_parse(parser.get_atext_sequence, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(tl, parser.TokenList) + # There can be fws inside the encoded words. + self.verify_terminal_types(tl, 'atext', 'fws') + + params_test_get_atext_sequence = Params( + + ew_only = C( + '=?utf-8?q?=20bob?=', + stringified=' bob', + ew_indexes=[0], + ), + + # get_atext_sequence doesn't add a missing whitespace error here even + # though the RFC requires one before the special, because adding that + # defect is handled at the next level up in the parser. + # XXX Ideally this should have a defect for the specials. 
+ **for_each_character(RFC_SPECIALS)( + ew_with_unencoded_special = C( + '=?UTF-8?q?bob{char}?=@foo', + stringified='bob{char}', + remainder='@foo', + ew_indexes=[0], + ), + ), + + ew_after_atom_no_ws = C( + 'foo@=?UTF-8?q?bob?=', + value='foo', + remainder='@=?UTF-8?q?bob?=', + ), + + multiple_ew_no_ws = C( + '=?UTF-8?q?foo?==?UTF-8?q?bar?=', + stringified='foobar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], + ), + + ew_in_middle_of_atext = C( + 'foo{=?UTF-8?q?foo?=}{=?UTF-8?q?bar?=}bar', + stringified='foo{foo}{bar}bar', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[4, 21], + ), + + all_non_special_printables_are_allowed = C( + f'{"".join(set(RFC_PRINTABLES) - set(RFC_SPECIALS))}@', + remainder='@', + ), + + # XXX POSTDEP: delete from here... + + ) - def test_get_qcontent_no_end_char(self): - self._test_get_x(parser.get_qcontent, - '', '', '', [], '') # get_atext - def test_get_atext_only(self): - atext = self._test_get_x(parser.get_atext, - 'foobar', 'foobar', 'foobar', [], '') + @params + def test_get_atext(self, s, *args, **kw): + atext = self._test_parse( + parser._deprecated_get_atext, + C(s), + *args, + warnings=..., + test_start=False, + **kw, + ) + if 'exception' in kw: + return + self.assertIsInstance(atext, parser.Terminal) self.assertEqual(atext.token_type, 'atext') - def test_get_atext_all_atext(self): - atext = self._test_get_x(parser.get_atext, self.rfc_atext_chars, - self.rfc_atext_chars, - self.rfc_atext_chars, [], '') + params_test_get_atext = Params( - def test_get_atext_two_words_gets_first(self): - self._test_get_x(parser.get_atext, - 'foo bar', 'foo', 'foo', [], ' bar') + # XXX POSTDEP: ....to here - def test_get_atext_following_wsp_preserved(self): - self._test_get_x(parser.get_atext, - 'foo \t\tbar', 'foo', 'foo', [], ' \t\tbar') + 
only = C( + 'foobar', + ), + + all_atext = C( + RFC_ATEXT, + ), + + two_words_gets_first = C( + 'foo bar', + remainder=' bar', + ), + + following_wsp_preserved = C( + 'foo \t\tbar', + remainder=' \t\tbar', + ), + + **for_each_character(RFC_SPECIALS)( + up_to_special = C( + RFC_ATEXT. + replace('{', '{{').replace('}', '}}') + '{char}' + 'bar', + remainder='{char}bar', + ), + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printables = C( + 'foo{char}bar(', + defects=[(nonprintable_defect, '{char}')], + remainder='(', + ), + ), + + **for_each_character(RFC_SPECIALS + RFC_WSP)( + no_atext_before_special_or_wsp = C( + '{char}foo', + # XXX POSTDEP: replace 'echar' with 'erchar': + exception=(errors.HeaderParseError, '{echar}foo'), + ), + ), + + undecodable_characters = C( + 'foo🎁bar'.encode().decode('us-ascii', errors='surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + ) - def test_get_atext_up_to_special(self): - self._test_get_x(parser.get_atext, - 'foo@bar', 'foo', 'foo', [], '@bar') + # XXX POSTDEP: Delete from here... + # + # This params_map deals with the fact that get_atext doesn't call repr + # on value in the exception message, but get_atext_sequence does. + @params_map(with_namelist=True) + def atext_repr_fixup(nl, *args, **kw): + if nl.has_all('no_atext_before_special_or_wsp', 'HT'): + kw['exception'] = (kw['exception'][0], re.escape('\\tfoo')) + yield '', C(*args, **kw) + + # get_atext_sequence needs to pass all the get_atext tests. + params_test_get_atext_sequence.update( + atext_repr_fixup(params_test_get_atext) + ) + # XXX POSTDEP: ...to here. 
- def test_get_atext_non_printables(self): - atext = self._test_get_x(parser.get_atext, - 'foo\x00bar(', 'foo\x00bar', 'foo\x00bar', - [errors.NonPrintableDefect], '(') - self.assertEqual(atext.defects[0].non_printables[0], '\x00') # get_bare_quoted_string - def test_get_bare_quoted_string_only(self): - bqs = self._test_get_x(parser.get_bare_quoted_string, - '"foo"', '"foo"', 'foo', [], '') + @params + def test_get_bare_quoted_string(self, s, *args, **kw): + bqs = self._test_parse( + parser.get_bare_quoted_string, + C(s), + *args, + **kw, + ) + if 'exception' in kw: + return + self.assertIsInstance(bqs, parser.BareQuotedString) self.assertEqual(bqs.token_type, 'bare-quoted-string') - - def test_get_bare_quoted_string_must_start_with_dquote(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_bare_quoted_string('foo"') - with self.assertRaises(errors.HeaderParseError): - parser.get_bare_quoted_string(' "foo"') - - def test_get_bare_quoted_string_only_quotes(self): - self._test_get_x(parser.get_bare_quoted_string, - '""', '""', '', [], '') - - def test_get_bare_quoted_string_missing_endquotes(self): - self._test_get_x(parser.get_bare_quoted_string, - '"', '""', '', [errors.InvalidHeaderDefect], '') - - def test_get_bare_quoted_string_following_wsp_preserved(self): - self._test_get_x(parser.get_bare_quoted_string, - '"foo"\t bar', '"foo"', 'foo', [], '\t bar') - - def test_get_bare_quoted_string_multiple_words(self): - self._test_get_x(parser.get_bare_quoted_string, - '"foo bar moo"', '"foo bar moo"', 'foo bar moo', [], '') - - def test_get_bare_quoted_string_multiple_words_wsp_preserved(self): - self._test_get_x(parser.get_bare_quoted_string, - '" foo moo\t"', '" foo moo\t"', ' foo moo\t', [], '') - - def test_get_bare_quoted_string_end_dquote_mid_word(self): - self._test_get_x(parser.get_bare_quoted_string, - '"foo"bar', '"foo"', 'foo', [], 'bar') - - def test_get_bare_quoted_string_quoted_dquote(self): - 
self._test_get_x(parser.get_bare_quoted_string, - r'"foo\"in"a', r'"foo\"in"', 'foo"in', [], 'a') - - def test_get_bare_quoted_string_non_printables(self): - self._test_get_x(parser.get_bare_quoted_string, - '"a\x01a"', '"a\x01a"', 'a\x01a', - [errors.NonPrintableDefect], '') - - def test_get_bare_quoted_string_no_end_dquote(self): - self._test_get_x(parser.get_bare_quoted_string, - '"foo', '"foo"', 'foo', - [errors.InvalidHeaderDefect], '') - self._test_get_x(parser.get_bare_quoted_string, - '"foo ', '"foo "', 'foo ', - [errors.InvalidHeaderDefect], '') - - def test_get_bare_quoted_string_empty_quotes(self): - self._test_get_x(parser.get_bare_quoted_string, - '""', '""', '', [], '') - - # Issue 16983: apply postel's law to some bad encoding. - def test_encoded_word_inside_quotes(self): - self._test_get_x(parser.get_bare_quoted_string, + self.verify_terminal_types(bqs, 'ptext', 'fws') + + params_test_get_bare_quoted_string = for_each_api( + + non_ws = C( + '"foo"', + value='foo', + ), + + no_leading_dquote_before_non_ws = C( + 'foo"', + exception=(errors.HeaderParseError, 'expected.*foo'), + ), + + no_leading_dquote_before_ws = C( + ' "foo"', + exception=(errors.HeaderParseError, 'expected.*"foo"'), + ), + + only_quotes = C( + '""', + value='', + ), + + missing_endquote = C( + '"', + stringified='""', + value='', + defects=[end_inside_quoted_string_defect], + ), + + following_wsp_preserved = C( + '"foo"\t bar', + value='foo', + remainder='\t bar', + ), + + multiple_words = C( + '"foo bar moo"', + value='foo bar moo', + ), + + multiple_words_wsp_preserved = C( + '" foo moo\t"', + value=' foo moo\t', + ), + + end_dquote_mid_word = C( + '"foo"bar', + value='foo', + remainder='bar', + ), + + quoted_dquote = C( + r'"foo\"in"@', + value='foo"in', + remainder='@', + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printables = C( + '"a{char}a"', + value='a{char}a', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + all_printables_allowed = C( + 
f'"{RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"')}"', + value=RFC_PRINTABLES, + ), + + any_printable_may_be_escaped = C( + f'"{''.join(rf'\{c}' for c in RFC_PRINTABLES)}"', + stringified= + f'"{RFC_PRINTABLES.replace('\\', r'\\').replace('"', r'\"')}"', + value=RFC_PRINTABLES, + ), + + no_end_dquote_after_non_ws = C( + '"foo', + stringified='"foo"', + value='foo', + defects=[end_inside_quoted_string_defect], + ), + + no_end_dquote_after_ws = C( + '"foo ', + stringified='"foo "', + value='foo ', + defects=[end_inside_quoted_string_defect], + ), + + # Issue 16983: apply postel's law to some bad encoding. + encoded_word_inside_quotes = C( '"=?utf-8?Q?not_really_valid?="', - '"not really valid"', - 'not really valid', - [errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect], - '') + stringified='"not really valid"', + value='not really valid', + defects=[ew_inside_quoted_string_defect], + ew_indexes=[1], + ), + + mixed_encoded_words_and_regular_text = C( + '"This has=?utf-8?Q?multiple?= =?utf-8?q?errors?=in it', + stringified='"This hasmultipleerrorsin it"', + value='This hasmultipleerrorsin it', + defects=[ + *[ew_inside_quoted_string_defect]*2, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + end_inside_quoted_string_defect, + ], + ew_indexes=[9, 30], + ), + + encoded_word_after_dquote_with_no_ws = C( + '"test"of=?UTF-8?q?bad?=data', + value='test', + remainder='of=?UTF-8?q?bad?=data', + ), + + invalid_charset = C( + '"=?foo?Q?not_really_valid?= at all"', + stringified='"not really valid at all"', + value='not really valid at all', + defects=[ + ew_inside_quoted_string_defect, + charset_defect('foo'), + ], + ew_indexes=[1], + ), + + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + ) + # get_comment - def test_get_comment_only(self): - comment = self._test_get_x(parser.get_comment, - '(comment)', '(comment)', ' ', [], '', ['comment']) - self.assertEqual(comment.token_type, 'comment') + @params 
+ def test_get_comment(self, + s, + *args, + value=' ', + comments=None, + content=None, + commenttree=None, + **kw): + if content is None: + content = comments[0] if comments else None + if commenttree is None: + commenttree = [content] + cmt = self._test_parse( + parser.get_comment, + C(s), + *args, + value=value, + comments=comments, + commenttree=commenttree, + **kw, + ) + if 'exception' in kw: + return + self.assertEqual(cmt.content, content) + self.assertIsInstance(cmt, parser.Comment) + self.assertEqual(cmt.token_type, 'comment') + self.verify_terminal_types(cmt, 'ptext', 'fws') + + @params_map + def adapt_get_ccontent_sequence_tests_for_get_comment( + s, + *args, + stringified=None, + remainder='', + ew_indexes=[], + **kw, + ): + # get_comment parses parens, and quotes them differently in str, so + # tests involving parens in the test string won't pass here. + if '(' in s or ')' in s: + return + if stringified: + kw['comments'] = [stringified] + kw['stringified'] = f"({stringified})" + else: + kw['comments'] = [s] + kw.pop('value', None) + kw['ew_indexes'] = [x + 1 for x in ew_indexes] + yield 'from_test_get_ccontent_sequence', C(f'({s})', *args, **kw) + + params_test_get_comment = for_each_api( + + adapt_get_ccontent_sequence_tests_for_get_comment( + params_test_get_ccontent_sequence, + ), + + simple_comment_only = C( + '(comment)', + comments=['comment'], + ), + + non_wsp_before_left_paren_is_error = C( + 'foo"', + exception=(errors.HeaderParseError, r'(?=.*expected)(?=.*foo)'), + ), + + wsp_before_left_paren_is_error = C( + ' (foo"', + exception=(errors.HeaderParseError, r'(?=.*expected)(?=.* \(foo)'), + ), + + wsp_after_right_paren = C( + '(comment) \t', + remainder=' \t', + comments=['comment'], + ), + + multiple_words = C( + '(foo bar)', + comments=['foo bar'], + ), + + wsp_runs_inside_comment = C( + '( foo bar\t )', + comments=[' foo bar\t '], + ), + + non_wsp_after_right_paren = C( + '(foo)bar', + remainder='bar', + comments=['foo'], + ), + + 
quoted_parens = C( + r'(foo\) \(\)bar)', + comments=['foo) ()bar'], + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable = C( + '(foo{char}bar)', + defects=[(nonprintable_defect, '{char}')], + comments=['foo{char}bar'], + ), + ), + + no_right_paren_after_non_ws = C( + '(foo bar', + stringified='(foo bar)', + defects=[end_inside_comment_defect], + comments=['foo bar'], + ), + + no_right_paren_after_ws = C( + '(foo bar ', + stringified='(foo bar )', + defects=[end_inside_comment_defect], + comments=['foo bar '], + ), + + nested_comment = C( + '(foo(bar))', + comments=['foo(bar)'], + commenttree=['foo', ['bar']], + ), + + nested_comment_wsp = C( + '(foo ( bar ) )', + comments=['foo ( bar ) '], + commenttree=['foo ', [' bar '], ' '], + ), + + empty_comment = C( + '()', + comments=[''], + commenttree=[''], + ), + + multiple_nesting = C( + '(((((foo)))))', + comments=['((((foo))))'], + commenttree=[[[[['foo']]]]], + ), + + multiple_mesting_missing_two_right_parens = C( + '(((((foo)))', + stringified='(((((foo)))))', + defects=[*[end_inside_comment_defect]*2], + comments=['((((foo))))'], + commenttree=[[[[['foo']]]]], + ), + + quoted_paren_in_nested_comment = C( + r'(foo (b\)))', + comments=[r'foo (b\))'], + commenttree=['foo ', ['b)']], + ), + + any_printable_may_be_escaped = C( + f"({''.join(fr'\{c}' for c in RFC_PRINTABLES)})", + stringified= + f"({RFC_PRINTABLES + .replace('\\', r'\\') + .replace('(', r'\(') + .replace(')', r'\)') + })", + comments=[RFC_PRINTABLES], + ), + + all_printables = C( + f"({RFC_PRINTABLES. 
+ replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)')})", + comments=[RFC_PRINTABLES], + ), + + multiple_nested_comments = C( + '(foo (nest 1) (nest 2 (nest 3)))', + comments=['foo (nest 1) (nest 2 (nest 3))'], + commenttree=['foo ', ['nest 1'], ' ', ['nest 2 ', ['nest 3']]], + ), + + nested_empty_comments = C( + '( () ( ( ) ) )', + comments=[' () ( ( ) ) '], + commenttree=[' ', [''], ' ', [' ', [' '], ' '], ' '], + ), + + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + ew_after_comment_no_ws = C( + '(foo)=?UTF-8?q?foo?=', + stringified='(foo)', + comments=['foo'], + remainder='=?UTF-8?q?foo?=', + ), + + ws_around_ew = C( + '( =?utf-8?q?test?= )', + stringified='( test )', + comments=[' test '], + ew_indexes=[2], + ), + + ew_in_nested_comment = C( + '(foo (=?UTF-8?q?bar?=))', + stringified='(foo (bar))', + comments=['foo (bar)'], + commenttree=['foo ', ['bar']], + ew_indexes=[6], + ), + + ew_missing_whitespace = C( + '(=?UTF-8?q?foo?==?UTF-8?q?bar?=)', + stringified='(foobar)', + comments=['foobar'], + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[1, 16], + ), + + no_ws_around_ew = C( + '(=?UTF-8?q?test?=)', + stringified='(test)', + comments=['test'], + ew_indexes=[1], + ), + + ws_inside_ew = C( + '(=?UTF-8?q? 
Test ?=)', + stringified='( Test )', + comments=[' Test '], + defects=[whitespace_inside_ew_defect], + ew_indexes=[1], + ), + + non_ws_around_ew = C( + '(foo=?UTF-8?q?bar_?=bird)', + stringified='(foobar bird)', + comments=['foobar bird'], + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[4], + ), + + multiple_ew = C( + '(foo =?UTF-8?q?a?= =?UTF-8?q?t?=)', + stringified='(foo at)', + comments=['foo at'], + ew_indexes=[5, 19], + ), + + **for_each_character(RFC_WSP)( + inter_ew_whitespace_handled_correctly = C( + '({char}=?UTF-8?q?_foo_?={char}{char}=?UTF-8?q?bar_?= )', + stringified='({char} foo bar )', + comments=['{char} foo bar '], + ew_indexes=[2, 21], + ), + ), + + ew_nested_first_comment_valid_no_ws = C( + '((=?UTF-8?q?foo?=)=?UTF-8?q?bar?=)', + stringified='((foo)bar)', + comments=['(foo)bar'], + commenttree=[['foo'], 'bar'], + ew_indexes=[2, 18], + ), + + ew_in_nested_second_comment_valid_no_ws = C( + '(=?UTF-8?q?foo?=(=?UTF-8?q?bar?=))', + stringified='(foo(bar))', + comments=['foo(bar)'], + commenttree=['foo', ['bar']], + ew_indexes=[1, 17], + ), + + # parenthesis inside encoded words in comments is RFC illegal, but + # we handle it anyway. XXX we aren't registering defects for this, but + # ideally we should be. 
+ + qp_inside_ew = C( + r'(=?UTF-8?q?\test\)_?= =?UTF-8?q?\(test?=)', + stringified=r'(test\) \(test)', + comments=['test) (test'], + ew_indexes=[1, 22], + ), + + unquoted_parens_inside_ew = C( + '(=?UTF-8?q?test)_?= =?UTF-8?q?(test?=) foo', + stringified=r'(test\) \(test)', + comments=[r'test) (test'], + remainder=' foo', + ew_indexes=[1, 20], + ), + + ) - def test_get_comment_must_start_with_paren(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_comment('foo"') - with self.assertRaises(errors.HeaderParseError): - parser.get_comment(' (foo"') - - def test_get_comment_following_wsp_preserved(self): - self._test_get_x(parser.get_comment, - '(comment) \t', '(comment)', ' ', [], ' \t', ['comment']) - - def test_get_comment_multiple_words(self): - self._test_get_x(parser.get_comment, - '(foo bar) \t', '(foo bar)', ' ', [], ' \t', ['foo bar']) - - def test_get_comment_multiple_words_wsp_preserved(self): - self._test_get_x(parser.get_comment, - '( foo bar\t ) \t', '( foo bar\t )', ' ', [], ' \t', - [' foo bar\t ']) - - def test_get_comment_end_paren_mid_word(self): - self._test_get_x(parser.get_comment, - '(foo)bar', '(foo)', ' ', [], 'bar', ['foo']) - - def test_get_comment_quoted_parens(self): - self._test_get_x(parser.get_comment, - r'(foo\) \(\)bar)', r'(foo\) \(\)bar)', ' ', [], '', ['foo) ()bar']) - - def test_get_comment_non_printable(self): - self._test_get_x(parser.get_comment, - '(foo\x7Fbar)', '(foo\x7Fbar)', ' ', - [errors.NonPrintableDefect], '', ['foo\x7Fbar']) - - def test_get_comment_no_end_paren(self): - self._test_get_x(parser.get_comment, - '(foo bar', '(foo bar)', ' ', - [errors.InvalidHeaderDefect], '', ['foo bar']) - self._test_get_x(parser.get_comment, - '(foo bar ', '(foo bar )', ' ', - [errors.InvalidHeaderDefect], '', ['foo bar ']) - - def test_get_comment_nested_comment(self): - comment = self._test_get_x(parser.get_comment, - '(foo(bar))', '(foo(bar))', ' ', [], '', ['foo(bar)']) - self.assertEqual(comment[1].content, 
'bar') - - def test_get_comment_nested_comment_wsp(self): - comment = self._test_get_x(parser.get_comment, - '(foo ( bar ) )', '(foo ( bar ) )', ' ', [], '', ['foo ( bar ) ']) - self.assertEqual(comment[2].content, ' bar ') - - def test_get_comment_empty_comment(self): - self._test_get_x(parser.get_comment, - '()', '()', ' ', [], '', ['']) - - def test_get_comment_multiple_nesting(self): - comment = self._test_get_x(parser.get_comment, - '(((((foo)))))', '(((((foo)))))', ' ', [], '', ['((((foo))))']) - for i in range(4, 0, -1): - self.assertEqual(comment[0].content, '('*(i-1)+'foo'+')'*(i-1)) - comment = comment[0] - self.assertEqual(comment.content, 'foo') - - def test_get_comment_missing_end_of_nesting(self): - self._test_get_x(parser.get_comment, - '(((((foo)))', '(((((foo)))))', ' ', - [errors.InvalidHeaderDefect]*2, '', ['((((foo))))']) - - def test_get_comment_qs_in_nested_comment(self): - comment = self._test_get_x(parser.get_comment, - r'(foo (b\)))', r'(foo (b\)))', ' ', [], '', [r'foo (b\))']) - self.assertEqual(comment[2].content, 'b)') # get_cfws - def test_get_cfws_only_ws(self): - cfws = self._test_get_x(parser.get_cfws, - ' \t \t', ' \t \t', ' ', [], '', []) + @params + def test_get_cfws(self, s, *args, **kw): + kw.setdefault('value', ' ') + cfws = self._test_parse(parser.get_cfws, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(cfws, parser.CFWSList) self.assertEqual(cfws.token_type, 'cfws') + self.verify_terminal_types(cfws, 'ptext', 'fws') + + # get_cfws should behave exactly the same as get_comment when parsing + # values containing just a comment. + @params_map(with_namelist=True) + def adapt_comment_tests_for_cfws(nl, s, *args, **kw): + # Our 'ctree' nested comment check returns a list of comments instead + # of just the single nested comment it does for Comment. + if 'commenttree' in kw: + kw['commenttree'] = [kw['commenttree']] + # XXX POSTDEP: delete from here... 
+ # get_cfws had the same bug that get_fws had: it did *not* + # raise an error if there is no cfws, and it should. For backward + # compatibility we continue to not raise under the old api. + if ('oldapi' in nl + and nl.has_any('empty', 'non_wsp_before_left_paren_is_error') + ): + kw.pop('exception') + kw['remainder'] = s + kw['warnings'] = kw.get('warnings', []) + [ + ( + DeprecationWarning, + r'(?i)(?=.*no whitespace)(?=.*comment)(?=.*raise)', + ) + ] + # XXX POSTDEP: ...to here + yield 'from_test_get_comment', C(s, *args, **kw) + + params_test_get_cfws = for_each_api( + + # get_cfws should behave exactly the same as get_fws when parsing + # whitespace only strings, except for the case of ending at a '(' + # because cfws *doesn't* end there. + include_unless( + lambda n, *a, **k: 'left_parenthesis' in n, + label="from_test_get_fws", + )(params_test_get_fws), + + # get_cfws should behave exactly the same as get_comment when parsing + # values containing just a comment. Even the tests with remainders + # should pass if the remainder doesn't start with whitespace. + include_unless( + lambda n, *a, remainder=..., **k: + remainder is not ... 
+ and remainder.startswith(tuple(RFC_WSP)) + or 'wsp_before_left_paren_is_error' in n + )(adapt_comment_tests_for_cfws(params_test_get_comment)), + + mixed_comments_and_wsp = C( + ' (foo ) ( bar) ', + comments=['foo ', ' bar'], + commenttree=[['foo '], [' bar']], + ), + + **for_each_character(ALL_ASCII, skip=CFWS_LEADER)( + ends_at_non_comment_non_ws = C( + '(foo) {char}', + remainder='{char}', + comments=['foo'], + commenttree=[['foo']], + ), + ), + + header_ends_in_comment = C( + ' (foo ', + stringified=' (foo )', + defects=[end_inside_comment_defect], + comments=['foo '], + commenttree=[['foo ']], + ), + + multiple_nested_comments = C( + '(foo (bar)) ((a)(a))', + comments=['foo (bar)', '(a)(a)'], + commenttree=[['foo ', ['bar']], [['a'], ['a']]], + ), + + ew_after_comment_no_ws = C( + ' (bar) (foo)=?UTF-8?q?foo?=', + comments=['bar', 'foo'], + remainder='=?UTF-8?q?foo?=', + ), + + ew_in_nested_comment = C( + ' (a) (foo (=?UTF-8?q?bar?=))', + stringified=' (a) (foo (bar))', + comments=['a', 'foo (bar)'], + commenttree=[['a'], ['foo ', ['bar']]], + ew_indexes=[11], + ), + + ew_missing_whitespace = C( + '(=?UTF-8?q?foo?==?UTF-8?q?bar?=) (b)', + stringified='(foobar) (b)', + comments=['foobar', 'b'], + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[1, 16], + ), + + nested_and_unnested_empty_comments = C( + '() (()) ( () ) ( ( ) )', + comments=['', '()', ' () ', ' ( ) '], + commenttree=[[''], [['']], [' ', [''], ' '], [' ', [' '], ' ']], + ), - def test_get_cfws_only_comment(self): - cfws = self._test_get_x(parser.get_cfws, - '(foo)', '(foo)', ' ', [], '', ['foo']) - self.assertEqual(cfws[0].content, 'foo') - - def test_get_cfws_only_mixed(self): - cfws = self._test_get_x(parser.get_cfws, - ' (foo ) ( bar) ', ' (foo ) ( bar) ', ' ', [], '', - ['foo ', ' bar']) - self.assertEqual(cfws[1].content, 'foo ') - self.assertEqual(cfws[3].content, ' bar') - - def test_get_cfws_ends_at_non_leader(self): - cfws = 
self._test_get_x(parser.get_cfws, - '(foo) bar', '(foo) ', ' ', [], 'bar', ['foo']) - self.assertEqual(cfws[0].content, 'foo') - - def test_get_cfws_ends_at_non_printable(self): - cfws = self._test_get_x(parser.get_cfws, - '(foo) \x07', '(foo) ', ' ', [], '\x07', ['foo']) - self.assertEqual(cfws[0].content, 'foo') - - def test_get_cfws_non_printable_in_comment(self): - cfws = self._test_get_x(parser.get_cfws, - '(foo \x07) "test"', '(foo \x07) ', ' ', - [errors.NonPrintableDefect], '"test"', ['foo \x07']) - self.assertEqual(cfws[0].content, 'foo \x07') - - def test_get_cfws_header_ends_in_comment(self): - cfws = self._test_get_x(parser.get_cfws, - ' (foo ', ' (foo )', ' ', - [errors.InvalidHeaderDefect], '', ['foo ']) - self.assertEqual(cfws[1].content, 'foo ') - - def test_get_cfws_multiple_nested_comments(self): - cfws = self._test_get_x(parser.get_cfws, - '(foo (bar)) ((a)(a))', '(foo (bar)) ((a)(a))', ' ', [], - '', ['foo (bar)', '(a)(a)']) - self.assertEqual(cfws[0].comments, ['foo (bar)']) - self.assertEqual(cfws[2].comments, ['(a)(a)']) + ) # get_quoted_string - def test_get_quoted_string_only(self): - qs = self._test_get_x(parser.get_quoted_string, - '"bob"', '"bob"', 'bob', [], '') + @params + def test_get_quoted_string( + self, + s, + *args, + content=None, + quoted_value=None, + **kw, + ): + qs = self._test_parse(parser.get_quoted_string, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertEqual(qs.content, content) + self.assertEqual(qs.quoted_value, quoted_value) + self.assertIsInstance(qs, parser.QuotedString) self.assertEqual(qs.token_type, 'quoted-string') - self.assertEqual(qs.quoted_value, '"bob"') - self.assertEqual(qs.content, 'bob') - - def test_get_quoted_string_with_wsp(self): - qs = self._test_get_x(parser.get_quoted_string, - '\t "bob" ', '\t "bob" ', ' bob ', [], '') - self.assertEqual(qs.quoted_value, ' "bob" ') - self.assertEqual(qs.content, 'bob') - - def test_get_quoted_string_with_comments_and_wsp(self): - qs = 
self._test_get_x(parser.get_quoted_string, - ' (foo) "bob"(bar)', ' (foo) "bob"(bar)', ' bob ', [], '') - self.assertEqual(qs[0][1].content, 'foo') - self.assertEqual(qs[2][0].content, 'bar') - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') - - def test_get_quoted_string_with_multiple_comments(self): - qs = self._test_get_x(parser.get_quoted_string, - ' (foo) (bar) "bob"(bird)', ' (foo) (bar) "bob"(bird)', ' bob ', - [], '') - self.assertEqual(qs[0].comments, ['foo', 'bar']) - self.assertEqual(qs[2].comments, ['bird']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') - - def test_get_quoted_string_non_printable_in_comment(self): - qs = self._test_get_x(parser.get_quoted_string, - ' (\x0A) "bob"', ' (\x0A) "bob"', ' bob', - [errors.NonPrintableDefect], '') - self.assertEqual(qs[0].comments, ['\x0A']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob"') - - def test_get_quoted_string_non_printable_in_qcontent(self): - qs = self._test_get_x(parser.get_quoted_string, - ' (a) "a\x0B"', ' (a) "a\x0B"', ' a\x0B', - [errors.NonPrintableDefect], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs.content, 'a\x0B') - self.assertEqual(qs.quoted_value, ' "a\x0B"') - - def test_get_quoted_string_internal_ws(self): - qs = self._test_get_x(parser.get_quoted_string, - ' (a) "foo bar "', ' (a) "foo bar "', ' foo bar ', - [], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs.content, 'foo bar ') - self.assertEqual(qs.quoted_value, ' "foo bar "') - - def test_get_quoted_string_header_ends_in_comment(self): - qs = self._test_get_x(parser.get_quoted_string, - ' (a) "bob" (a', ' (a) "bob" (a)', ' bob ', - [errors.InvalidHeaderDefect], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs[2].comments, ['a']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') - - def 
test_get_quoted_string_header_ends_in_qcontent(self): - qs = self._test_get_x(parser.get_quoted_string, - ' (a) "bob', ' (a) "bob"', ' bob', - [errors.InvalidHeaderDefect], '') - self.assertEqual(qs[0].comments, ['a']) - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob"') - - def test_get_quoted_string_cfws_only_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_quoted_string(' (foo) ') + self.verify_terminal_types(qs, 'ptext', 'fws') + + # get_quoted_string should pass any get_bare_quoted_string test that + # doesn't involve leading or trailing whitespace. + @params_map + def adapt_bare_quoted_string_tests_for_get_quoted_string(s, *args, **kw): + r = kw.get('remainder', '') + if s.startswith(tuple(RFC_WSP)) or r.startswith(tuple(RFC_WSP)): + return + if not 'exception' in kw: + kw['quoted_value'] = kw.get('stringified', s[:-len(r)] if r else s) + kw['content'] = kw['value'] + kw['quoted_value'] = kw.get('stringified', s[:-len(r)] if r else s) + yield 'from_test_bare_quoted_string', C(s, *args, **kw) + + # If there is no remainder or exception expectation, a cfws test string + # should be valid as a quoted string prefix or suffix, with a few + # exceptions that test for what happens if closing parens are missing. 
+ @params_map(with_namelist=True) + def adapt_get_cfws_tests_for_get_quoted_string( + nl, + s, + *args, + stringified=None, + remainder=None, + exception=None, + **kw, + ): + if remainder or exception or nl.has_any( + 'multiple_mesting_missing_two_right_parens', + 'no_right_paren_after_non_ws', + 'no_right_paren_after_ws', + 'header_ends_in_comment', + 'empty', # XXX POSTDEP remove this line, it's from a deprecation + ): + return + new_s = f'{s} "foo" {s}' + if stringified: + kw['stringified'] = f'{stringified} "foo" {stringified}' + kw['value'] = ' foo ' + kw['quoted_value'] = ' "foo" ' + kw['content'] = 'foo' + for k in ('comments', 'commenttree', 'defects'): + if (v := kw.get(k)): + kw[k] = v * 2 + if (idxs := kw.get('ew_indexes')): + kw['ew_indexes'] = idxs + [x + len(s) + 7 for x in idxs] + yield 'adapted_from_get_cfws', C(new_s, **kw) + + params_test_get_quoted_string = for_each_api( + + adapt_bare_quoted_string_tests_for_get_quoted_string( + params_test_get_bare_quoted_string, + ), + + adapt_get_cfws_tests_for_get_quoted_string(params_test_get_cfws), + + with_wsp = C( + '\t "bob" ', + value=' bob ', + quoted_value=' "bob" ', + content='bob', + ), + + with_comments_and_wsp = C( + ' (foo) "bob"(bar)', + value=' bob ', + quoted_value=' "bob" ', + content='bob', + comments=['foo', 'bar'], + commenttree=[['foo'], ['bar']], + ), + + with_multiple_comments = C( + ' (foo) (bar) "bob"(bird)', + value=' bob ', + quoted_value=' "bob" ', + content='bob', + comments=['foo', 'bar', 'bird'], + commenttree=[['foo'], ['bar'], ['bird']], + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_comment = C( + ' ({char}) "bob"', + value=' bob', + quoted_value=' "bob"', + content='bob', + defects=[(nonprintable_defect, '{char}')], + comments=['{char}'], + ), + ), + + # all the non printables in qcontent are checked by the included + # bare_quoted_string tests, this one proves that the defect is + # correctly copied up even if there is also comment text 
involved. + non_printable_in_qcontent = C( + ' (a) "a\x0B"', + value=' a\x0B', + quoted_value=' "a\x0B"', + content='a\x0B', + defects=[nonprintable_defect('\x0b')], + comments=['a'], + ), + + internal_ws = C( + ' (a) "foo bar "', + value=' foo bar ', + quoted_value=' "foo bar "', + content='foo bar ', + comments=['a'], + ), + + header_ends_in_comment = C( + ' (a) "bob" (a', + stringified=' (a) "bob" (a)', + value=' bob ', + quoted_value=' "bob" ', + content='bob', + defects=[end_inside_comment_defect], + comments=['a', 'a'], + commenttree=[['a'], ['a']], + ), + + header_ends_in_qcontent = C( + ' (a) "bob', + stringified=' (a) "bob"', + value=' bob', + quoted_value=' "bob"', + content='bob', + defects=[end_inside_quoted_string_defect], + comments=['a'], + ), + + cfws_only_raises = C( + '(foo) ', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + no_quoted_string = C( + '(ab) xyz', + exception=(errors.HeaderParseError, '(?=.*expected.*")(?=.*xyz)'), + ), + + **for_each_character(RFC_PRINTABLES, skip='(')( + qs_ends_at_noncfws = C( + '\t "bob" {char}', + value=' bob ', + quoted_value=' "bob" ', + content='bob', + remainder='{char}', + ), + ), + + ew_after_dquote = C( + '"bob"=?UTF-8?q?foo?=', + value='bob', + quoted_value='"bob"', + content='bob', + remainder='=?UTF-8?q?foo?=', + ), + + empty_quotes_between_comments = C( + ' (a) "" (foo)', + value=' ', + quoted_value=' "" ', + content='', + comments=['a', 'foo'], + ), + + empty_input = C( + '', + exception=(errors.HeaderParseError, r'(?i)expected'), + ), - def test_get_quoted_string_no_quoted_string(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_quoted_string(' (ab) xyz') + ) - def test_get_quoted_string_qs_ends_at_noncfws(self): - qs = self._test_get_x(parser.get_quoted_string, - '\t "bob" fee', '\t "bob" ', ' bob ', [], 'fee') - self.assertEqual(qs.content, 'bob') - self.assertEqual(qs.quoted_value, ' "bob" ') # get_atom - def test_get_atom_only(self): - atom = 
self._test_get_x(parser.get_atom, - 'bob', 'bob', 'bob', [], '') + @params + def test_get_atom(self, s, *args, **kw): + atom = self._test_parse(parser.get_atom, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atom, parser.Atom) self.assertEqual(atom.token_type, 'atom') + self.verify_terminal_types(atom, 'atext', 'ptext', 'fws') + + # If there is no remainder or exception expectation, a cfws test string + # should be valid as a atom prefix or suffix, with a few exceptions that + # test for what happens if closing parens are missing. + @params_map(with_namelist=True) + def adapt_get_cfws_tests_for_get_atom( + nl, + s, + *args, + stringified=None, + remainder=None, + exception=None, + **kw, + ): + if remainder or exception or nl.has_any( + 'multiple_mesting_missing_two_right_parens', + 'no_right_paren_after_non_ws', + 'no_right_paren_after_ws', + 'header_ends_in_comment', + 'empty', # XXX POSTDEP remove this line, it's from a deprecation + ): + return + new_s = f'{s} foo {s}' + if stringified: + kw['stringified'] = f'{stringified} foo {stringified}' + kw['value'] = ' foo ' + for k in ('comments', 'commenttree', 'defects'): + if (v := kw.get(k)): + kw[k] = v * 2 + if (idxs := kw.get('ew_indexes')): + kw['ew_indexes'] = idxs + [x + len(s) + 5 for x in idxs] + yield 'adapted_from_get_cfws', C(new_s, **kw) + + params_test_get_atom = for_each_api( + + adapt_get_cfws_tests_for_get_atom(params_test_get_cfws), + + # get_atom should pass all the get_atext_sequence tests except for those + # involving leading or trailing whitespace. 
+ include_unless( + lambda n, s, *a, remainder='', **k: + s.startswith(tuple(CFWS_LEADER)) + or remainder.startswith(tuple(CFWS_LEADER)), + label='from_test_get_atext_sequence', + )(params_test_get_atext_sequence), + + with_wsp = C( + '\t bob ', + value=' bob ', + ), + + with_comments_and_wsp = C( + ' (foo) bob(bar)', + value=' bob ', + comments=['foo', 'bar'], + ), + + with_multiple_comments = C( + ' (foo) (bar) bob(bird)', + value=' bob ', + comments=['foo', 'bar', 'bird'], + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_comment = C( + ' ({char}) bob', + value=' bob', + defects=[(nonprintable_defect, '{char}')], + comments=['{char}'], + ), + + non_printable_in_atext = C( + ' (a) a{char}', + value=' a{char}', + defects=[(nonprintable_defect, '{char}')], + comments=['a'], + ), + + ), + + header_ends_in_comment = C( + ' (a) bob (a', + stringified=' (a) bob (a)', + value=' bob ', + defects=[end_inside_comment_defect], + comments=['a', 'a'], + ), + + no_atom = C( + ' (ab) ', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + **for_each_character(RFC_SPECIALS, skip='(')( + + no_atom_before_special = C( + ' (ab) {char}', + exception=( + errors.HeaderParseError, + '(?i)(?=.*expected)(?=.*{echar})', + ), + ), + + atom_ends_at_special = C( + ' (foo) bob(bar) {char}bang', + value=' bob ', + remainder='{char}bang', + comments=['foo', 'bar'], + ), + + ), + + **for_each_character(RFC_PRINTABLES, skip='(')( + atom_ends_at_noncfws = C( + 'bob {char}', + value='bob ', + remainder='{char}', + ), + ), + + ew_only = C( + '=?utf-8?q?=20bob?=', + stringified=' bob', + ew_indexes=[0], + ), + + ew_and_comments = C( + '(a) =?UTF-8?q?bob?= (b)', + stringified='(a) bob (b)', + value=' bob ', + comments=['a', 'b'], + ew_indexes=[4], + ), + + ew_and_comments_no_ws = C( + '(a)=?UTF-8?q?bob?=(b)', + stringified='(a)bob(b)', + value=' bob ', + comments=['a', 'b'], + defects=[ + missing_whitespace_before_ew_defect, + 
missing_whitespace_after_ew_defect, + ], + ew_indexes=[3], + ), + + ew_and_empty_comments_no_ws = C( + '()=?UTF-8?q?bob?=()', + stringified='()bob()', + value=' bob ', + comments=['', ''], + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[2], + ), + + # XXX Ideally this should have a defect for the specials. + **for_each_character(RFC_SPECIALS)( + ew_with_unencoded_special = C( + '=?UTF-8?q?bob{char}?= @foo', + stringified='bob{char} ', + remainder='@foo', + ew_indexes=[0], + ), + ), + + ew_after_atom_no_ws = C( + 'foo@=?UTF-8?q?bob?=', + value='foo', + remainder='@=?UTF-8?q?bob?=', + ), + + multiple_ew_no_ws = C( + '=?UTF-8?q?foo?==?UTF-8?q?bar?=', + stringified='foobar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], + ), + + ew_in_middle_of_atom_text = C( + 'foo{=?UTF-8?q?foo?=}{=?UTF-8?q?bar?=}bar', + stringified='foo{foo}{bar}bar', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[4, 21], + ), + + empty_comments_no_ws = C( + ' ()bob() ', + value=' bob ', + comments=['', ''], + ), + + all_non_special_printables_are_allowed = C( + f'{"".join(set(RFC_PRINTABLES) - set(RFC_SPECIALS))}@', + remainder='@', + ), - def test_get_atom_with_wsp(self): - self._test_get_x(parser.get_atom, - '\t bob ', '\t bob ', ' bob ', [], '') - - def test_get_atom_with_comments_and_wsp(self): - atom = self._test_get_x(parser.get_atom, - ' (foo) bob(bar)', ' (foo) bob(bar)', ' bob ', [], '') - self.assertEqual(atom[0][1].content, 'foo') - self.assertEqual(atom[2][0].content, 'bar') - - def test_get_atom_with_multiple_comments(self): - atom = self._test_get_x(parser.get_atom, - ' (foo) (bar) bob(bird)', ' (foo) (bar) bob(bird)', ' bob ', - [], '') - self.assertEqual(atom[0].comments, ['foo', 'bar']) - 
self.assertEqual(atom[2].comments, ['bird']) - - def test_get_atom_non_printable_in_comment(self): - atom = self._test_get_x(parser.get_atom, - ' (\x0A) bob', ' (\x0A) bob', ' bob', - [errors.NonPrintableDefect], '') - self.assertEqual(atom[0].comments, ['\x0A']) - - def test_get_atom_non_printable_in_atext(self): - atom = self._test_get_x(parser.get_atom, - ' (a) a\x0B', ' (a) a\x0B', ' a\x0B', - [errors.NonPrintableDefect], '') - self.assertEqual(atom[0].comments, ['a']) - - def test_get_atom_header_ends_in_comment(self): - atom = self._test_get_x(parser.get_atom, - ' (a) bob (a', ' (a) bob (a)', ' bob ', - [errors.InvalidHeaderDefect], '') - self.assertEqual(atom[0].comments, ['a']) - self.assertEqual(atom[2].comments, ['a']) - - def test_get_atom_no_atom(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_atom(' (ab) ') - - def test_get_atom_no_atom_before_special(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_atom(' (ab) @') - - def test_get_atom_atom_ends_at_special(self): - atom = self._test_get_x(parser.get_atom, - ' (foo) bob(bar) @bang', ' (foo) bob(bar) ', ' bob ', [], '@bang') - self.assertEqual(atom[0].comments, ['foo']) - self.assertEqual(atom[2].comments, ['bar']) - - def test_get_atom_atom_ends_at_noncfws(self): - self._test_get_x(parser.get_atom, - 'bob fred', 'bob ', 'bob ', [], 'fred') - - def test_get_atom_rfc2047_atom(self): - self._test_get_x(parser.get_atom, - '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') + ) # get_dot_atom_text - def test_get_dot_atom_text(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, - 'foo.bar.bang', 'foo.bar.bang', 'foo.bar.bang', [], '') - self.assertEqual(dot_atom_text.token_type, 'dot-atom-text') - self.assertEqual(len(dot_atom_text), 5) - - def test_get_dot_atom_text_lone_atom_is_valid(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, - 'foo', 'foo', 'foo', [], '') - - def test_get_dot_atom_text_raises_on_leading_dot(self): - with 
self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('.foo.bar') - - def test_get_dot_atom_text_raises_on_trailing_dot(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('foo.bar.') + @params + def test_get_dot_atom_text(self, s, *args, **kw): + atom = self._test_parse(parser.get_dot_atom_text, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atom, parser.DotAtomText) + self.assertEqual(atom.token_type, 'dot-atom-text') + # There can be fws inside encoded words. + self.verify_terminal_types(atom, 'dot', 'atext', 'fws') + + params_test_get_dot_atom_text = for_each_api( + + # a bare atext is valid in a dot-atom, so we should pass all the + # get_atext_sequence tests except the ones involving the dot. + include_unless( + lambda n, *a, **k: 'full_stop' in n, + label='from_test_get_atext', + )(params_test_get_atext_sequence), + + only = C( + 'foo.bar.bang', + ), + + raises_on_leading_dot = C( + '.foo.bar', + exception=(errors.HeaderParseError, '.*'), + ), + + raises_on_trailing_dot = C( + 'foo.bar.', + exception=(errors.HeaderParseError, '.*'), + ), + + **for_each_character(RFC_SPECIALS + RFC_WSP)( + raises_on_leading_special_or_wsp = C( + '{char}foo.bar', + exception=(errors.HeaderParseError, r'expected.*{erchar}foo\.'), + ), + ), + + **for_each_character(RFC_SPECIALS + RFC_WSP, skip='.')( + ends_at_special_or_wsp = C( + 'foo.bird{char}bar', + remainder='{char}bar', + ), + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_atext = C( + 'foo.{char}.bar', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + undecodable_characters = C( + 'foo.🎁.bar'.encode().decode('us-ascii', errors='surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + all_atext_characters_allowed = C( + RFC_ATEXT + '.' 
+ RFC_ATEXT + '@foo', + remainder = '@foo', + ), + + raises_on_paired_dots = C( + 'foo..bar', + exception=( + errors.HeaderParseError, + r'(?=.*expected)(?=.*atom)(?=.*\.\.bar)', + ), + ), + + ew = C( + '=?UTF-8?q?foo?=', + stringified='foo', + ew_indexes=[0], + ), + + two_ew_two_atoms = C( + '=?UTF-8?q?foo?= =?UTF-8?q?bar?=', + stringified='foo', + remainder=' =?UTF-8?q?bar?=', + ew_indexes=[0], + ), + + # The tests above are the only RFC valid way for an encoded word to be + # in a dot-atom-text, but we're going to be generous. + + two_ew_with_dot = C( + '=?UTF-8?q?foo?=.=?UTF-8?q?bar?=', + stringified='foo.bar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 16], + ), + + two_ew_no_dot = C( + '=?UTF-8?q?foo?==?UTF-8?q?bar?=', + stringified='foobar', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], + ), + + mixed_ews_and_atext = C( + 'foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar', + stringified='foo.bar foobar.foobar', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + ew_indexes=[7, 27], + ), - def test_get_dot_atom_text_raises_on_leading_non_atext(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text(' foo.bar') - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('@foo.bar') - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom_text('"foo.bar"') - - def test_get_dot_atom_text_trailing_text_preserved(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, - 'foo@bar', 'foo', 'foo', [], '@bar') + ) - def test_get_dot_atom_text_trailing_ws_preserved(self): - dot_atom_text = self._test_get_x(parser.get_dot_atom_text, - 'foo .bar', 'foo', 'foo', [], ' .bar') # get_dot_atom - def test_get_dot_atom_only(self): - dot_atom = 
self._test_get_x(parser.get_dot_atom, - 'foo.bar.bing', 'foo.bar.bing', 'foo.bar.bing', [], '') - self.assertEqual(dot_atom.token_type, 'dot-atom') - self.assertEqual(len(dot_atom), 1) - - def test_get_dot_atom_with_wsp(self): - self._test_get_x(parser.get_dot_atom, - '\t foo.bar.bing ', '\t foo.bar.bing ', ' foo.bar.bing ', [], '') - - def test_get_dot_atom_with_comments_and_wsp(self): - self._test_get_x(parser.get_dot_atom, - ' (sing) foo.bar.bing (here) ', ' (sing) foo.bar.bing (here) ', - ' foo.bar.bing ', [], '') - - def test_get_dot_atom_space_ends_dot_atom(self): - self._test_get_x(parser.get_dot_atom, - ' (sing) foo.bar .bing (here) ', ' (sing) foo.bar ', - ' foo.bar ', [], '.bing (here) ') - - def test_get_dot_atom_no_atom_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom(' (foo) ') + @params + def test_get_dot_atom(self, s, *args, **kw): + atom = self._test_parse(parser.get_dot_atom, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(atom, parser.DotAtom) + self.assertEqual(atom.token_type, 'dot-atom') + self.verify_terminal_types(atom, 'dot', 'atext', 'ptext', 'fws') + + params_test_get_dot_atom = for_each_api( + + # Atom is a subset of dot atom, so get_dot_atom should pass any + # get_atom test except those involving the dot (full_stop). 
+ include_unless( + lambda n, *a, **k: 'full_stop' in n, + label='from_test_get_atom', + )(params_test_get_atom), + + with_wsp = C( + '\t foo.bar.bing ', + value=' foo.bar.bing ', + ), + + with_comments_and_wsp = C( + ' (sing) foo.bar.bing (here) ', + value=' foo.bar.bing ', + comments=['sing', 'here'], + ), + + space_ends_dot_atom = C( + ' (sing) foo.bar .bing (here) ', + value=' foo.bar ', + remainder='.bing (here) ', + comments=['sing'], + ), + + no_atom_raises = C( + ' (foo) ', + exception=(errors.HeaderParseError, r'expected') + ), + + **for_each_character(RFC_SPECIALS, skip='(')( + leading_special_raises = C( + ' (foo) {char}bar', + exception=(errors.HeaderParseError, r'(?i)expected.*{echar}bar') + ), + ), + + two_dots_raises = C( + 'bar..bang', + exception=(errors.HeaderParseError, r'expected.*\.\.bang') + ), + + trailing_dot_raises = C( + ' (foo) bar.bang. foo', + exception=(errors.HeaderParseError, r'expected.*\. foo') + ), + + rfc2047_atom = C( + '=?utf-8?q?=20bob?=', + stringified=' bob', + ew_indexes=[0], + ), + + **for_each_character(RFC_NONPRINTABLES, skip=RFC_WSP)( + non_printable_in_atext = C( + 'foo.{char}.bar', + defects=[(nonprintable_defect, '{char}')], + ), + ), + + undecodable_characters = C( + 'foo.🎁.bar'.encode().decode('us-ascii', errors='surrogateescape'), + defects=[undecodable_bytes_defect], + ), + + **for_each_character(RFC_SPECIALS, skip='.(')( + ends_at_special = C( + '(hey)foo.bar{char}.bird', + value=' foo.bar', + remainder='{char}.bird', + comments=['hey'], + ), + ), + + **for_each_character(RFC_SPECIALS, skip='(')( + ends_at_special_after_comment = C( + '(hey)foo.bar(hey){char} bird', + value=' foo.bar ', + remainder='{char} bird', + comments=['hey', 'hey'], + ), + ), + + two_ew_two_atoms = C( + '(hey) =?UTF-8?q?foo?= =?UTF-8?q?bar?=', + stringified='(hey) foo ', + value=' foo ', + remainder='=?UTF-8?q?bar?=', + comments=['hey'], + ew_indexes=[6], + ), + + mixed_ews_and_atext = C( + 
'(hey)foo.bar=?UTF-8?q?_foo?=bar.=?UTF-8?q?foo?=bar (hey)', + stringified='(hey)foo.bar foobar.foobar (hey)', + value=' foo.bar foobar.foobar ', + defects=[ + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + comments=['hey', 'hey'], + ew_indexes=[12, 32], + ), + + two_ew_with_dot = C( + '=?UTF-8?q?foo?=.=?UTF-8?q?bar?=(hey)', + stringified='foo.bar(hey)', + value='foo.bar ', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + ], + comments=['hey'], + ew_indexes=[0, 16], + ), - def test_get_dot_atom_leading_dot_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom(' (foo) .bar') - - def test_get_dot_atom_two_dots_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom('bar..bang') - - def test_get_dot_atom_trailing_dot_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_dot_atom(' (foo) bar.bang. foo') - - def test_get_dot_atom_rfc2047_atom(self): - self._test_get_x(parser.get_dot_atom, - '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '') - - # get_word (if this were black box we'd repeat all the qs/atom tests) + ) - def test_get_word_atom_yields_atom(self): - word = self._test_get_x(parser.get_word, - ' (foo) bar (bang) :ah', ' (foo) bar (bang) ', ' bar ', [], ':ah') - self.assertEqual(word.token_type, 'atom') - self.assertEqual(word[0].token_type, 'cfws') - def test_get_word_all_CFWS(self): - # bpo-29412: Test that we don't raise IndexError when parsing CFWS only - # token. 
- with self.assertRaises(errors.HeaderParseError): - parser.get_word('(Recipients list suppressed') + # get_word + + @params + def test_get_word( + self, + s, + *args, + quoted_value=None, + content=None, + tokenlisttype, + **kw, + ): + word = self._test_parse(parser.get_word, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(word, tokenlisttype) + if quoted_value is not None: + self.assertEqual(word.quoted_value, quoted_value) + if content is not None: + self.assertEqual(word.content, content) + self.verify_terminal_types(word, 'dot', 'atext', 'ptext', 'fws') + + @params_map + def adapt_get_atom_tests_for_get_word(*args, **kw): + kw['tokenlisttype'] = parser.TokenList + yield '', C(*args, **kw) + + @params_map + def adapt_get_quoted_string_tests_for_get_word(*args, **kw): + kw['tokenlisttype'] = parser.QuotedString + yield '', C(*args, **kw) + + params_test_get_word = for_each_api( + + # A word can be an atom, so get_word should pass many of the atom tests. + adapt_get_atom_tests_for_get_word( + include_unless( + lambda n, *a, **k: + # For get_atom a leading quotation mark means there is no + # atom and is therefore an error, but get_word will treat it + # as a quoted_string. Quoted strings are tested below. + n.has_any( + 'no_atom_before_special', + 'no_atext_before_special_or_wsp', + ) + and 'quotation_mark' in n, + label='from_test_get_atom', + )(params_test_get_atom), + ), + + # Or it can be a quoted string, so should pass most quoted_string tests. + adapt_get_quoted_string_tests_for_get_word( + include_unless( + lambda n, *a, **k: + # These tests have an atom first; get_quoted_string raises + # for that, but get_word parses it. Atoms are tested above. 
+ n.has_any( + 'no_quoted_string', + 'no_leading_dquote_before_non_ws', + ), + label='from_test_get_quoted_string', + )(params_test_get_quoted_string), + ), - def test_get_word_qs_yields_qs(self): - word = self._test_get_x(parser.get_word, - '"bar " (bang) ah', '"bar " (bang) ', 'bar ', [], 'ah') - self.assertEqual(word.token_type, 'quoted-string') - self.assertEqual(word[0].token_type, 'bare-quoted-string') - self.assertEqual(word[0].value, 'bar ') - self.assertEqual(word.content, 'bar ') + ) - def test_get_word_ends_at_dot(self): - self._test_get_x(parser.get_word, - 'foo.', 'foo', 'foo', [], '.') # get_phrase - def test_get_phrase_simple(self): - phrase = self._test_get_x(parser.get_phrase, + @params + def test_get_phrase(self, s, *args, obs_dots=0, **kw): + phrase = self._test_parse(parser.get_phrase, C(s), *args, **kw) + if 'exception' in kw: + return + self.assertIsInstance(phrase, parser.Phrase) + self.assertEqual( + len([x for x in phrase if x.token_type == 'dot']), + obs_dots, + phrase.ppstr(), + ) + self.verify_terminal_types(phrase, 'dot', 'atext', 'ptext', 'fws') + + @params_map(with_namelist=True) + def adapt_get_word_tests_for_get_phrase(nl, s, *args, **kw): + kw.pop('tokenlisttype') + kw.pop('quoted_value', None) + kw.pop('content', None) + # XXX POSTDEP: delete from here... + if 'oldapi' in nl: + # A phrase has to have at least one word, but the old code did not + # enforce this. For backward compatibility we preserve that + # behavior in the old api, so for the parameters that expect a + # raise on no content we'll either skip them or adapt them. + if nl.has_any( + 'no_atom_before_special', + 'no_atom', + 'cfws_only_raises', + 'empty_input', + ): + return + # These two tests will serve to test the lack-of-raise deprecation. 
+ if nl.has_any( + 'empty', + 'no_atext_before_special_or_wsp', + ): + kw.pop('exception') + kw['remainder'] = s + kw['warnings'] = kw.get('warnings', []) + [ + ( + DeprecationWarning, + r'(?i)(?=.*word)(?=.*whitespace)(?=.*raise)', + ) + ] + kw['defects'] = kw.get('defects', []) + [ + non_word_phrase_start_defect, + ] + # XXX POSTDEP: ...to here + yield '', C(s, *args, **kw) + + params_test_get_phrase = for_each_api( + + # A phrase is a series of words, and single words are valid, + # so get_phrase should pass many of the get_word tests. + adapt_get_word_tests_for_get_phrase( + # A phrase only ends at specials other than " and ., so skip + # get_word tests that expect parsing to stop on those characters. + include_unless( + lambda n, *a, remainder=False, **k: + n.has_any( + 'atom_ends_at_noncfws', + 'no_atext_before_special_or_wsp', + 'qs_ends_at_noncfws', + 'ew_after_dquote', + 'encoded_word_after_dquote_with_no_ws', + 'end_dquote_mid_word', + ) + or n.has_any('atom_ends_at_special', 'up_to_special') + and n.has_any('full_stop', 'quotation_mark') + or n.has_all('no_atom_before_special', 'full_stop'), + label='from_test_get_word', + )(params_test_get_word), + ), + + simple_phrase = C( '"Fred A. Johnson" is his name, oh.', - '"Fred A. Johnson" is his name', - 'Fred A. Johnson is his name', - [], - ', oh.') - self.assertEqual(phrase.token_type, 'phrase') + value='Fred A. 
Johnson is his name', + remainder=', oh.', + ), - def test_get_phrase_complex(self): - phrase = self._test_get_x(parser.get_phrase, + complex_phrase = C( ' (A) bird (in (my|your)) "hand " is messy\t<>\t', - ' (A) bird (in (my|your)) "hand " is messy\t', - ' bird hand is messy ', - [], - '<>\t') - self.assertEqual(phrase[0][0].comments, ['A']) - self.assertEqual(phrase[0][2].comments, ['in (my|your)']) + value=' bird hand is messy ', + remainder='<>\t', + comments=['A', 'in (my|your)'], + ), - def test_get_phrase_obsolete(self): - phrase = self._test_get_x(parser.get_phrase, - 'Fred A.(weird).O Johnson', + obsolete_phrase = C( 'Fred A.(weird).O Johnson', - 'Fred A. .O Johnson', - [errors.ObsoleteHeaderDefect]*3, - '') - self.assertEqual(len(phrase), 7) - self.assertEqual(phrase[3].comments, ['weird']) - - def test_get_phrase_pharse_must_start_with_word(self): - phrase = self._test_get_x(parser.get_phrase, + value='Fred A. .O Johnson', + defects=[ + *[period_in_phrase_obs_defect]*2, + cfws_without_atom_in_phrase_obs_defect, + ], + comments=['weird'], + obs_dots=2, + ), + + should_start_with_word = C( '(even weirder).name', - '(even weirder).name', - ' .name', - [errors.InvalidHeaderDefect] + [errors.ObsoleteHeaderDefect]*2, - '') - self.assertEqual(len(phrase), 3) - self.assertEqual(phrase[0].comments, ['even weirder']) - - def test_get_phrase_ending_with_obsolete(self): - phrase = self._test_get_x(parser.get_phrase, + value=' .name', + defects=[ + non_word_phrase_start_defect, + cfws_without_atom_in_phrase_obs_defect, + period_in_phrase_obs_defect, + ], + comments=['even weirder'], + obs_dots=1, + ), + + obsolete_ending = C( 'simple phrase.(with trailing comment):boo', - 'simple phrase.(with trailing comment)', - 'simple phrase. 
', - [errors.ObsoleteHeaderDefect]*2, - ':boo') - self.assertEqual(len(phrase), 4) - self.assertEqual(phrase[3].comments, ['with trailing comment']) - - def get_phrase_cfws_only_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_phrase(' (foo) ') + value='simple phrase. ', + defects=[ + period_in_phrase_obs_defect, + cfws_without_atom_in_phrase_obs_defect, + ], + remainder=':boo', + comments=['with trailing comment'], + obs_dots=1, + ), - def test_get_phrase_adjacent_ew(self): # "'linear-white-space' that separates a pair of adjacent # 'encoded-word's is ignored" (rfc2047 section 6.2) - self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '') - def test_get_phrase_adjacent_ew_different_encodings(self): - self._test_get_x( - parser.get_phrase, - '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], '' - ) + adjacent_ew = C( + '=?ascii?q?Joi?= \t =?ascii?q?ned?=', + stringified='Joined', + ew_indexes=[0, 18], + ), + + adjacent_ew_different_encodings = C( + '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', + stringified='Bérénice', + ew_indexes=[0, 21], + ), - def test_get_phrase_adjacent_ew_encoded_spaces(self): - self._test_get_x( - parser.get_phrase, + adjacent_ew_encoded_spaces = C( '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', - 'Encoded spaces preserved', - 'Encoded spaces preserved', - [], - '' - ) + stringified='Encoded spaces preserved', + ew_indexes=[0, 20, 41], + ), - def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self): - self._test_get_x( - parser.get_phrase, + adjacent_ew_comment_is_not_linear_white_space = C( '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=', - 'Comment (is not) linear-white-space', - 'Comment linear-white-space', - [], - '', + stringified='Comment (is not) linear-white-space', + value='Comment linear-white-space', comments=['is not'], - ) + ew_indexes=[0, 29], + ), - def 
test_get_phrase_adjacent_ew_no_error_on_defects(self): - self._test_get_x( - parser.get_phrase, + adjacent_ew_no_error_on_defects = C( '=?ascii?q?Def?= =?ascii?q?ect still joins?=', - 'Defect still joins', - 'Defect still joins', - [errors.InvalidHeaderDefect], # whitespace inside encoded word - '' - ) + stringified='Defect still joins', + defects=[whitespace_inside_ew_defect], + ew_indexes=[0, 16], + ), - def test_get_phrase_adjacent_ew_ignore_non_ew(self): - self._test_get_x( - parser.get_phrase, + adjacent_ew_ignore_non_ew = C( '=?ascii?q?No?= =?join?= for non-ew', - 'No =?join?= for non-ew', - 'No =?join?= for non-ew', - [], - '' - ) + stringified='No =?join?= for non-ew', + ew_indexes=[0], + ), - def test_get_phrase_adjacent_ew_ignore_invalid_ew(self): - self._test_get_x( - parser.get_phrase, + adjacent_ew_ignore_invalid_ew = C( '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew', - 'No =?ascii?rot13?wbva= for invalid ew', - 'No =?ascii?rot13?wbva= for invalid ew', - [], - '' - ) + stringified='No =?ascii?rot13?wbva= for invalid ew', + ew_indexes=[0], + ), - def test_get_phrase_adjacent_ew_missing_space(self): - self._test_get_x( - parser.get_phrase, + adjacent_ew_missing_space = C( '=?ascii?q?Joi?==?ascii?q?ned?=', - 'Joined', - 'Joined', - [errors.InvalidHeaderDefect], # missing trailing whitespace - '' - ) - - # get_local_part + stringified='Joined', + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + ], + ew_indexes=[0, 15], + ), + + ew_before_quoted_string_missing_space = C( + '=?ascii?q?disjoin?="=?ascii?q?ted?="', + stringified='disjoin"ted"', + value='disjointed', + defects=[ + missing_whitespace_after_ew_defect, + ew_inside_quoted_string_defect, + ], + ew_indexes=[0, 20], + ), + + ew_after_quoted_string_missing_space = C( + '"=?ascii?q?disjoin?="=?ascii?q?ted?=', + stringified='"disjoin"ted', + value='disjointed', + defects=[ + missing_whitespace_before_ew_defect, + ew_inside_quoted_string_defect, + ], + 
ew_indexes=[1, 21], + ), + + **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + '."')( + ends_at_special = C( + 'complex (obsolete). "phrase" {char}foo', + value='complex . phrase ', + defects=[period_in_phrase_obs_defect], + remainder='{char}foo', + comments=['obsolete'], + obs_dots=1, + ), + ), + + # While these violate the RFC in several ways, allowing the '.' + # as the value of the phrase is the only sensible recovery. + + obsolete_dot_only = C( + '.', + defects=[ + non_word_phrase_start_defect, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + obsolete_dot_with_wsp = C( + '\t . ', + value=' . ', + defects=[ + non_word_phrase_start_defect, + *[cfws_without_atom_in_phrase_obs_defect]*2, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + obsolete_dot_and_comments_only = C( + '(foo).(bar)', + value=' . ', + comments=['foo', 'bar'], + defects=[ + non_word_phrase_start_defect, + *[cfws_without_atom_in_phrase_obs_defect]*2, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + obsolete_dot_and_comments_with_fws = C( + ' (foo). (bar) ', + value=' . 
', + comments=['foo', 'bar'], + defects=[ + non_word_phrase_start_defect, + *[cfws_without_atom_in_phrase_obs_defect]*2, + period_in_phrase_obs_defect, + ], + obs_dots=1, + ), + + ) + + + # get_obs_local_part + + @params + def test_get_obs_local_part(self, s, *args, local_part=None, **kw): + lp = self._test_parse(parser.get_obs_local_part, C(s), *args, **kw) + if 'exception' in kw: + return + self.verify_terminal_types( + lp, + 'dot', + 'atext', + 'ptext', + 'fws', + 'misplaced-special', + ) - def test_get_local_part_simple(self): - local_part = self._test_get_x(parser.get_local_part, - 'dinsdale@python.org', 'dinsdale', 'dinsdale', [], '@python.org') - self.assertEqual(local_part.token_type, 'local-part') - self.assertEqual(local_part.local_part, 'dinsdale') + # This function should only get called when the non-obs expressions have + # already been checked for, so we are only testing the obs syntax handling, + # not what it does with non-obs syntax. Anything else is "don't care". + # The 'local_part' specs are checked by the get_local_part tests, since the + # token list returned by get_obs_local_part doesn't have that attribute. + params_test_get_obs_local_part = for_each_api( - def test_get_local_part_with_dot(self): - local_part = self._test_get_x(parser.get_local_part, - 'Fred.A.Johnson@python.org', - 'Fred.A.Johnson', - 'Fred.A.Johnson', - [], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - - def test_get_local_part_with_whitespace(self): - local_part = self._test_get_x(parser.get_local_part, - ' Fred.A.Johnson @python.org', - ' Fred.A.Johnson ', - ' Fred.A.Johnson ', - [], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') + simple_obsolete = C( + 'Fred. 
A.Johnson@python.org', + remainder='@python.org', + local_part='Fred.A.Johnson', + ), - def test_get_local_part_with_cfws(self): - local_part = self._test_get_x(parser.get_local_part, - ' (foo) Fred.A.Johnson (bar (bird)) @python.org', - ' (foo) Fred.A.Johnson (bar (bird)) ', - ' Fred.A.Johnson ', - [], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') - self.assertEqual(local_part[0][0].comments, ['foo']) - self.assertEqual(local_part[0][2].comments, ['bar (bird)']) - - def test_get_local_part_simple_quoted(self): - local_part = self._test_get_x(parser.get_local_part, - '"dinsdale"@python.org', '"dinsdale"', '"dinsdale"', [], '@python.org') - self.assertEqual(local_part.token_type, 'local-part') - self.assertEqual(local_part.local_part, 'dinsdale') - - def test_get_local_part_with_quoted_dot(self): - local_part = self._test_get_x(parser.get_local_part, - '"Fred.A.Johnson"@python.org', - '"Fred.A.Johnson"', - '"Fred.A.Johnson"', - [], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') + complex_obsolete_1 = C( + ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', + value=' Fred . A. Johnson.and dogs ', + remainder='@python.org', + comments=['foo ', 'bar', 'bird', 'sheep'], + local_part='Fred.A.Johnson.and dogs ', + ), - def test_get_local_part_quoted_with_whitespace(self): - local_part = self._test_get_x(parser.get_local_part, - ' "Fred A. Johnson" @python.org', - ' "Fred A. Johnson" ', - ' "Fred A. Johnson" ', - [], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred A. Johnson') + complex_obsolete_invalid = C( + ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', + value=' Fred . A. 
Johnson and dogs', + defects=[missing_dot_in_local_part_defect], + remainder='@python.org', + comments=['foo ', 'bar', 'bird', 'sheep'], + local_part='Fred.A.Johnson and dogs', + ), + + trailing_dot = C( + ' borris.@python.org', + defects=[trailing_dot_in_local_part_defect], + remainder='@python.org', + local_part='borris.', + ), - def test_get_local_part_quoted_with_cfws(self): - local_part = self._test_get_x(parser.get_local_part, - ' (foo) " Fred A. Johnson " (bar (bird)) @python.org', - ' (foo) " Fred A. Johnson " (bar (bird)) ', - ' " Fred A. Johnson " ', - [], - '@python.org') - self.assertEqual(local_part.local_part, ' Fred A. Johnson ') - self.assertEqual(local_part[0][0].comments, ['foo']) - self.assertEqual(local_part[0][2].comments, ['bar (bird)']) + trailing_dot_with_ws = C( + ' borris. @python.org', + defects=[trailing_dot_in_local_part_defect], + remainder='@python.org', + local_part='borris.', + ), + leading_dot = C( + '.borris@python.org', + defects=[leading_dot_in_local_part_defect], + remainder='@python.org', + local_part='.borris', + ), - def test_get_local_part_simple_obsolete(self): - local_part = self._test_get_x(parser.get_local_part, - 'Fred. A.Johnson@python.org', - 'Fred. A.Johnson', - 'Fred. A.Johnson', - [errors.ObsoleteHeaderDefect], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson') + leading_dot_after_ws = C( + ' .borris@python.org', + defects=[leading_dot_in_local_part_defect], + remainder='@python.org', + local_part='.borris', + ), - def test_get_local_part_complex_obsolete_1(self): - local_part = self._test_get_x(parser.get_local_part, - ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "@python.org', - ' (foo )Fred (bar).(bird) A.(sheep)Johnson."and dogs "', - ' Fred . A. Johnson.and dogs ', - [errors.ObsoleteHeaderDefect], - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson.and dogs ') + dots_around_comment = C( + ' borris.(foo).natasha@python.org', + value=' borris. 
.natasha', + defects=[repeated_dot_in_local_part_defect], + remainder='@python.org', + comments=['foo'], + local_part='borris..natasha', + ), + + quoted_strings_in_atom_list = C( + '""example" example"@example.com', + value='example example', + defects=[*[missing_dot_in_local_part_defect]*2], + remainder='@example.com', + local_part="example example", + ), + + # This is intentionally a weird one: first there is a quoted string + # consisting of a single quoted pair resolving to a single backslash. + # Then there is unquoted atext and an invalid quoted pair that + # therefore gets interpreted as two backslashes. Then there is a + # quoted string containing 'example' with a leading space. + valid_and_invalid_qp_in_atom_list = C( + r'"\\"example\\" example"@example.com', + value=r'\example\\ example', + defects=[ + *[missing_dot_in_local_part_defect]*2, + *[misplaced_backslash_defect]*2, + ], + remainder='@example.com', + local_part=r'\example\\ example', + ), + + # We do want to check that it raises on an empty input, even + # though it should never be called with one. + empty = C( + '', + exception=(errors.HeaderParseError, '(?i)expected'), + ), + + quoted_words_but_no_ws = C( + '"words"."separated".by.dots', + value='words.separated.by.dots', + local_part='words.separated.by.dots', + ), + + backlashes_in_various_places = C( + r"\invali\d\.\really" + '\\', + local_part=r'\invali\d\.\really' + '\\', + defects=[ + *[misplaced_backslash_defect]*5, + *[missing_dot_in_local_part_defect]*3, + ], + ), + + double_dot_no_ws = C( + ' borris..natasha@python.org', + value=' borris..natasha', + defects=[repeated_dot_in_local_part_defect], + remainder='@python.org', + local_part='borris..natasha', + ), + + # The end of this is treated as a quoted string, so the stringified + # version has a trailing quote added, but the local_part attribute + # does not include the quotes. 
+ looks_like_qp_quote_but_quote_is_respected = C( + r'invalid.\"for.sure', + stringified=r'invalid.\"for.sure"', + value=r'invalid.\for.sure', + local_part=r'invalid.\for.sure', + defects=[ + end_inside_quoted_string_defect, + misplaced_backslash_defect, + missing_dot_in_local_part_defect, + ], + ), + + # obs_local_part parses anything that can be in a phrase (cfws + # atoms and quoted strings), plus \ and dots. + **for_each_character(RFC_SPECIALS, skip=CFWS_LEADER + r'\."')( + ends_at_phrase_ends = C( + 'doted.words. and . space{char}', + local_part='doted.words.and.space', + remainder='{char}', + ), + ), + + # Encoded words are not legitimate in local-part, but we decode + # them anyway. + + invalid_ew_atoms = C( + '=?utf-8?q?foo_?="=?utf-8?q?_bar?=".bird', + # It's not clear this str is the best choice. It's + # a consequence of the underlying parsed structures. + stringified='foo " bar".bird', + value="foo bar.bird", + local_part="foo bar.bird", + defects=[ + # XXX XXX There should be exactly one ew whitespace defect + # here, but the number generated will change during refactor, + # until it is fixed when get_obs_local_part is refactored. + #missing_whitespace_after_ew_defect, + missing_dot_in_local_part_defect, + ew_inside_quoted_string_defect, + ], + ew_indexes=[0, 17], + ), + + less_invalid_ew_atoms = C( + '=?utf-8?q?foo_?= . (=?utf-8?q?test?=) =?utf-8?q?_bar?= .bird', + stringified='foo . (test) bar .bird', + value="foo . bar .bird", + local_part="foo . bar.bird", + comments=['test'], + ew_indexes=[0, 20, 38], + ), - def test_get_local_part_complex_obsolete_invalid(self): - local_part = self._test_get_x(parser.get_local_part, - ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"@python.org', - ' (foo )Fred (bar).(bird) A.(sheep)Johnson "and dogs"', - ' Fred . A. 
Johnson and dogs', - [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, 'Fred.A.Johnson and dogs') + ) - def test_get_local_part_empty_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_local_part('') - def test_get_local_part_no_part_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_local_part(' (foo) ') + # get_local_part - def test_get_local_part_special_instead_raises(self): - with self.assertRaises(errors.HeaderParseError): - parser.get_local_part(' (foo) @python.org') + @params + def test_get_local_part(self, s, *args, local_part=None, **kw): + lp = self._test_parse(parser.get_local_part, C(s), *args, **kw) + if 'exception' in kw: + return + self.verify_terminal_types( + lp, + 'dot', + 'atext', + 'ptext', + 'fws', + 'misplaced-special', + ) + self.assertEqual(lp.local_part, local_part) + + @params_map + def add_ew_defects(*args, ew_indexes=[], defects=[], **kw): + if ew_indexes: + defects = defects + [ew_in_local_part_defect] * len(ew_indexes) + yield '', C(*args, ew_indexes=ew_indexes, defects=defects, **kw) + + @params_map(with_namelist=True) + def adapt_get_dot_atom_tests_for_get_local_part(nl, s, *args, **kw): + r = kw.get('remainder') + if 'value' in kw: + local_part = kw['value'] + else: + local_part = kw.get('stringified', s[:-len(r)] if r else s) + if not nl.has_any('ew_only', 'rfc2047_atom'): + # Except for the above two tests, the leading and trailing + # whitespace in the 'value' is the 'semantic blank' it produces + # for leading and trailing cfws, which local_part doesn't include. + # For those two ew tests the blank comes from inside the ew. 
+ local_part = local_part.removeprefix(' ').removesuffix(' ') + kw['local_part'] = local_part + yield '', C(s, *args, **kw) + + @params_map + def adapt_get_quoted_string_tests_for_get_local_part(*args, **kw): + if 'quoted_value' in kw: + kw['value'] = kw.pop('quoted_value') + if 'exception' not in kw: + kw['local_part'] = kw.pop('content') + yield '', C(*args, **kw) + + @params_map + def adapt_get_obs_local_part_tests_for_get_local_part( + *args, + defects=[], + **kw, + ): + defects = list(defects) + if any( + x in ( + repeated_dot_in_local_part_defect, + misplaced_backslash_defect, + missing_dot_in_local_part_defect, + leading_dot_in_local_part_defect, + trailing_dot_in_local_part_defect, + ) for x in defects + ): + defects.append(not_even_obs_local_part_defect) + else: + defects.append(non_dot_atom_local_part_obs_defect) + yield '', C(*args, defects=defects, **kw) + + params_test_get_local_part = for_each_api( + + # An RFC compliant local part can be a dot atom or a quoted string, so + # it should pass some of the tests for those. + + add_ew_defects( + adapt_get_dot_atom_tests_for_get_local_part( + include_unless( + lambda n, *a, **k: + n.has_any( + # Get local part handles multiple atoms. + 'two_ew_two_atoms', + 'atom_ends_at_noncfws', + # There are some things get_dot_atom raises for + # that get_local_part treats as obs-local-part. + 'two_dots_raises', + 'trailing_dot_raises', + 'space_ends_dot_atom', + # XXX XXX These need a logic fix to whitespace + # handling in get_local_part itself. + 'ew_and_comments_no_ws', + 'ew_and_empty_comments_no_ws', + ) + or + # get_local_part handles quoted strings (tested + # above), and leading dots or \ are handled as + # obs-local-part. 
+ n.has_any( + 'up_to_special', + 'leading_special_raises', + 'no_atom_before_special', + 'no_atext_before_special_or_wsp', + 'atom_ends_at_special', + 'ends_at_special_after_comment', + 'ends_at_special', + ) + and n.has_any( + 'reverse_solidus', + 'quotation_mark', + 'full_stop', + ), + label='from_test_get_dot_atom', + )(params_test_get_dot_atom), + ), + ), + + add_ew_defects( + adapt_get_quoted_string_tests_for_get_local_part( + include_unless( + lambda n, *a, **k: n.has_any( + # These tests have an atom first; get_quoted_string + # raises, but get_local_part parses the atom. Atoms + # are tested above. + 'no_quoted_string', + 'no_leading_dquote_before_non_ws', + # A local part only ends at specials other than " and . + 'qs_ends_at_noncfws', + 'ew_after_dquote', + 'encoded_word_after_dquote_with_no_ws', + 'end_dquote_mid_word', + ), + label='from_test_get_quoted_string', + )(params_test_get_quoted_string), + ), + ), + + add_ew_defects( + add_label('from_test_get_obs_local_part')( + adapt_get_obs_local_part_tests_for_get_local_part( + params_test_get_obs_local_part, + ), + ), + ), + + simple = C( + 'dinsdale@python.org', + remainder='@python.org', + local_part='dinsdale', + ), + + with_dot = C( + 'Fred.A.Johnson@python.org', + remainder='@python.org', + local_part='Fred.A.Johnson', + ), - def test_get_local_part_trailing_dot(self): - local_part = self._test_get_x(parser.get_local_part, - ' borris.@python.org', - ' borris.', - ' borris.', - [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, 'borris.') + with_whitespace = C( + ' Fred.A.Johnson @python.org', + value=' Fred.A.Johnson ', + remainder='@python.org', + local_part='Fred.A.Johnson', + ), - def test_get_local_part_trailing_dot_with_ws(self): - local_part = self._test_get_x(parser.get_local_part, - ' borris. @python.org', - ' borris. ', - ' borris. 
', - [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, 'borris.') + with_cfws = C( + ' (foo) Fred.A.Johnson (bar (bird)) @python.org', + value=' Fred.A.Johnson ', + remainder='@python.org', + comments=['foo', 'bar (bird)'], + local_part='Fred.A.Johnson', + ), + + simple_quoted = C( + '"dinsdale"@python.org', + remainder='@python.org', + local_part='dinsdale', + ), + + with_quoted_dot = C( + '"Fred.A.Johnson"@python.org', + remainder='@python.org', + local_part='Fred.A.Johnson', + ), - def test_get_local_part_leading_dot(self): - local_part = self._test_get_x(parser.get_local_part, - '.borris@python.org', - '.borris', - '.borris', - [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, '.borris') + quoted_with_whitespace = C( + ' "Fred A. Johnson" @python.org', + value=' "Fred A. Johnson" ', + remainder='@python.org', + local_part='Fred A. Johnson', + ), - def test_get_local_part_leading_dot_after_ws(self): - local_part = self._test_get_x(parser.get_local_part, - ' .borris@python.org', - ' .borris', - ' .borris', - [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, '.borris') + quoted_with_cfws = C( + ' (foo) " Fred A. Johnson " (bar (bird)) @python.org', + value=' " Fred A. Johnson " ', + remainder='@python.org', + comments=['foo', 'bar (bird)'], + local_part=' Fred A. Johnson ', + ), - def test_get_local_part_double_dot_raises(self): - local_part = self._test_get_x(parser.get_local_part, - ' borris.(foo).natasha@python.org', - ' borris.(foo).natasha', - ' borris. 
.natasha', - [errors.InvalidHeaderDefect]*2, - '@python.org') - self.assertEqual(local_part.local_part, 'borris..natasha') + empty_raises = C( + '', + exception=(errors.HeaderParseError, '.*'), + ), + + no_part_raises = C( + ' (foo) ', + exception=(errors.HeaderParseError, '.*'), + ), + + special_instead_raises = C( + ' (foo) @python.org', + exception=(errors.HeaderParseError, '.*'), + ), + + unicode = C( + 'exámple@example.com', + remainder='@example.com', + local_part='exámple', + ), + + ew_non_ascii = C( + '=?utf-8?q?ex=c3=a1mple?=@example.com', + stringified='exámple', + remainder='@example.com', + defects=[ + # XXX XXX there should be exactly one missing whitespace here, + # but it will change until we refactor get_local_part. + #missing_whitespace_after_ew_defect, + ew_in_local_part_defect, + ], + local_part='exámple', + ew_indexes=[0], + ), + + # Since we've decided to decode encoded words, this is a "valid" + # dot-atom. But if you clear up the whitespace defects, it + # turns into an obs_local_part because of the whitespace. + sort_of_valid_ew_dot_atom = C( + '=?utf-8?q?foo_?=.=?utf-8?q?_bar?=.bird', + stringified='foo . bar.bird', + value="foo . bar.bird", + local_part="foo . bar.bird", + defects=[ + missing_whitespace_after_ew_defect, + missing_whitespace_before_ew_defect, + missing_whitespace_after_ew_defect, + # XXX XXX There should also be an ew in local part defect. 
+ *[ew_in_local_part_defect]*2, + ], + ew_indexes=[0, 17], + ), - def test_get_local_part_quoted_strings_in_atom_list(self): - local_part = self._test_get_x(parser.get_local_part, - '""example" example"@example.com', - '""example" example"', - 'example example', - [errors.InvalidHeaderDefect]*3, - '@example.com') - self.assertEqual(local_part.local_part, 'example example') + ) - def test_get_local_part_valid_and_invalid_qp_in_atom_list(self): - local_part = self._test_get_x(parser.get_local_part, - r'"\\"example\\" example"@example.com', - r'"\\"example\\" example"', - r'\example\\ example', - [errors.InvalidHeaderDefect]*5, - '@example.com') - self.assertEqual(local_part.local_part, r'\example\\ example') # get_dtext @@ -2269,7 +5353,7 @@ def test_get_group_list_obs_group_list(self): ', (foo),,(bar)', ', (foo),,(bar)', ', ,, ', - [errors.ObsoleteHeaderDefect], + [errors.ObsoleteHeaderDefect] * 5, '') self.assertEqual(group_list.token_type, 'group-list') self.assertEqual(len(group_list.mailboxes), 0) diff --git a/Lib/test/test_email/test_defect_handling.py b/Lib/test/test_email/test_defect_handling.py index acc4accccac7566..64cd4d3d750af67 100644 --- a/Lib/test/test_email/test_defect_handling.py +++ b/Lib/test/test_email/test_defect_handling.py @@ -375,5 +375,15 @@ def get_defects(self, obj): return obj.defects +class TestDefectDeprecation(TestEmailBase): + + def test_non_ascii_defect_deprecated(self): + with self.assertWarnsRegex( + DeprecationWarning, + rf'(?i)(?=.*NonASCIILocalPartDefect)(?=.*is deprecated)', + ): + errors.NonASCIILocalPartDefect + + if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index aa918255d15c37e..ef4d8f1d319fc6f 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -920,7 +920,7 @@ def content_disp_as_value(self, ' =?UTF-8?Q?pdf?="', 'attachment', {'filename': 
'Schulbesuchsbestättigung.pdf'}, - [errors.InvalidHeaderDefect]*3, + [errors.InvalidHeaderDefect]*2, ('attachment; filename="Schulbesuchsbestättigung.pdf"'), ('Content-Disposition: attachment;\n' ' filename*=utf-8\'\'Schulbesuchsbest%C3%A4ttigung.pdf\n'), @@ -1242,8 +1242,7 @@ class TestAddressHeader(TestHeaderBase): 'rfc2047_atom_in_quoted_string_is_decoded': ('"=?utf-8?q?=C3=89ric?=" ', - [errors.InvalidHeaderDefect, - errors.InvalidHeaderDefect], + [errors.InvalidHeaderDefect], 'Éric ', 'Éric', 'foo@example.com', diff --git a/Lib/test/test_email/test_params.py b/Lib/test/test_email/test_params.py new file mode 100644 index 000000000000000..f48c8f312083642 --- /dev/null +++ b/Lib/test/test_email/test_params.py @@ -0,0 +1,1237 @@ +import re +import unittest +from contextlib import contextmanager +from test.support import captured_stdout +from test.test_email.params import ( + add_label, + as_value, + C, + for_each_function, + for_each_name, + fmt, + fmtall, + include_if, + NameList, + include_unless, + only, + params, + Params, + params_map, + ParamsMixin, + with_names, + ) +from textwrap import dedent + +TYPED_VALUES = with_names(int=1, dict=dict(a=1), C=C(C(1)), tuple=(1, 2)) + +class AssertMixin: + + @contextmanager + def assertRaisesRegexEx(self, ex, re, cause_ex=None, cause_re=None): + with super().assertRaisesRegex(ex, re) as cm: + yield + if cause_ex: + self.assertIsNotNone(cm.exception.__cause__) + self.assertIsInstance(cm.exception.__cause__, cause_ex) + self.assertRegex(str(cm.exception.__cause__), cause_re) + + +# We eat our own dogfood here, which could make bugs a bit confusing to sort +# out. But it exercises the machinery pretty well, and demonstrates the power +# of the framework. And we get much better test coverage out of it. 
+ + +class TestFmt(ParamsMixin, unittest.TestCase): + + test_strings = ( + ('', '', '' ), + ('no sub point', 'no sub point', 'no sub point' ), + ('sub an {a}', 'sub an a', 'sub an a' ), + ('{a} and {foo!r}', "a and {foo!r}", "a and 'foo'" ), + ('{x} {y} {z:02}', '{x} 2 {z:02}', '1 2 03' ), + ) + sub = (dict(), dict(a='a', y=2), dict(foo='foo', x=1, z=3)) + expected = list(zip(*test_strings)) + make_list = lambda v: [C(1), *v, [1, 2, 3]] + make_dict = lambda v: dict( + {f'v{i}': v for i, v in enumerate(v)}, foo=C(1), bar=[1, 2, 3], + ) + + substitution_cases = Params( + no_subs = C(expected[0], dict(), expected[0]), + non_subs = C(expected[0], dict(zzz='foo', yyy='bar'), expected[0]), + one_sub = C(expected[0], sub[1], expected[1]), + all_sub = C(expected[0], sub[1] | sub[2], expected[2]), + ) + + @params( + for_each_function(fmt, fmtall)( + params_map(lambda obj, subs, expected, ml=make_list, md=make_dict: + [ + ('list', C(ml(obj), subs, ml(expected))), + ('dict', C(md(obj), subs, md(expected))), + ] + )(substitution_cases), + ) + ) + def test(self, fmter, obj, subs, expected): + self.assertEqual(fmter(obj, subs), expected) + + @params( + fmt_list = C(fmt, make_list), + fmtall_list = C(fmtall, make_list), + fmt_dict = C(fmt, make_dict), + fmtall_dict = C(fmtall, make_dict), + ) + def test_multiple_passes(self, fmter, maker): + unmodified = maker(self.expected[0]) + v0 = fmter(unmodified, dict()) + self.assertEqual(v0, unmodified) + v1 = fmter(v0, dict(foobar=99) | self.sub[1]) + self.assertEqual(v1, maker(self.expected[1])) + v2 = fmter(v1, dict(foobar=99, y=9) | self.sub[2]) + self.assertEqual(v2, maker(self.expected[2])) + + nested = [('{a}', '{b}'), dict(a='{a}', b=['{b}']), [[1, ['{b}']]]] + nested_subs = dict(a=1, b=2) + nested_subbed = [('1', '2'), dict(a='1', b=['2']), [[1, ['2']]]] + + def test_fmt_does_not_recurse(self): + self.assertEqual(fmt(self.nested, self.nested_subs), self.nested) + + def test_fmtall_recurses(self): + self.assertEqual( + 
fmtall(self.nested, self.nested_subs), + self.nested_subbed, + ) + + +class TestC(ParamsMixin, unittest.TestCase): + + def test_empty_C(self): + p = C() + self.assertEqual(p.args, tuple()) + self.assertEqual(p.kw, {}) + + def test_args_only(self): + p = C('a', 2) + self.assertEqual(p.args, ('a', 2)) + self.assertEqual(p.kw, {}) + + def test_kw_only(self): + p = C(b='a', n=2) + self.assertEqual(p.args, tuple(), p.args) + self.assertEqual(p.kw, dict(b='a', n=2)) + + def test_args_and_kw(self): + p = C(1, 2, b='a', n=2) + self.assertEqual(p.args, (1, 2)) + self.assertEqual(p.kw, dict(b='a', n=2)) + + def test_callable(self): + p = C(1, 2, b='a', n=2) + res = [] + def tester(arg1, arg2, b=None, n=None): + res.extend([arg1, arg2, b, n]) + p(tester) + self.assertEqual(res, [1, 2, 'a', 2]) + + @params( + missing_arg = C(lambda a: 1, C(), "(?i)(?=.*'a')(?=.*missing)"), + extra_arg = C(lambda: 1, C(1), "(?i)(?=.*0)(?=.*positional)"), + missing_kw = C(lambda *, x: 1, C(), "(?i)(?=.*'x')(?=.*missing)"), + extra_kw = C(lambda: 1, C(x=1), "(?i)(?=.*'x')(?=.*unexpected)"), + ) + def test_arguments_mismatch(self, f, cs, msg): + with self.assertRaisesRegex(TypeError, msg): + cs(f) + + expected_reprs = Params( + no_arg = C(C(), "{fn}()"), + one_arg = C(C(1), "{fn}(1)"), + two_args = C(C(1, 2), "{fn}(1, 2)"), + two_str_args = C(C('1', '2'), "{fn}('1', '2')"), + one_kw = C(C(a=1), "{fn}(a=1)"), + two_kw = C(C(a=1, b=2), "{fn}(a=1, b=2)"), + two_str_kw = C(C(a='1', b='2'), "{fn}(a='1', b='2')"), + one_each = C(C(1, a='1'), "{fn}(1, a='1')"), + two_each = C(C(1, 2, a='1', b=3), "{fn}(1, 2, a='1', b=3)"), + ) + + @params(expected_reprs) + def test_repr(self, callspec, expected_repr): + self.assertEqual(repr(callspec), expected_repr.format(fn='C')) + + @params(expected_reprs) + def test_repr_call(self, callspec, expected_repr): + f = lambda: 1 + self.assertEqual( + callspec.repr_call(f), + expected_repr.format(fn=''), + ) + + @params( + params_map(lambda cs, _: only(C(cs, 
cs)))(expected_reprs), + dict_reversed = C(C(dict(a=1, b=2)), C(dict(b=2, a=1))), + kws_reversed = C(C(a=1, b=2, c=3), C(c=3, b=2, a=1)), + ) + def test_eq(self, callspec1, callspec2): + self.assertEqual(callspec1, callspec2) + + @params( + one_arg_value_mismatch = C(C(1), C(2)), + two_arg_value_mismatch = C(C(1, a='2'), C(2, a='1')), + two_arg_value_mismatch2 = C(C(1, a='2'), C(1, a='1')), + arg_count_mismatch = C(C(1, 2), C(1)), + arg_type_mismatch = C(C('1', '2'), C(1, 2)), + kw_name_mismatch = C(C(a=1), C(b=1)), + kw_count_mismatch = C(C(a=1, b=2), C(a=1)), + kw_type_mismatch = C(C(a='1', b='2'), C(a=1, b=2)), + non_callspec = C(C(1), 1), + ) + def test_neq(self, callspec1, callspec2): + self.assertNotEqual(callspec1, callspec2) + + def test_args_is_settable(self): + cs = C('a', 'b') + cs.args = ('c', 'd') + self.assertEqual(cs(lambda *args: args), ('c', 'd')) + + def test_kw_is_settable(self): + cs = C(b=1, c=2) + cs.kw = dict(c=3, d=3) + self.assertEqual(cs(lambda *_, **kw: kw), dict(c=3, d=3)) + + def test_kw_is_mutable(self): + cs = C(b=1, c=2) + cs.kw['b'] = 2 + cs.kw.update(z=7) + self.assertEqual(cs(lambda *_, **kw: kw), dict(b=2, c=2, z=7)) + + @params(for_each_name('fmt', 'fmtall')(TestFmt.substitution_cases)) + def test_fmt(self, fmtname, unmodified, subs, expected): + cs = C(*TestFmt.make_list(unmodified), **TestFmt.make_dict(unmodified)) + exp = C(*TestFmt.make_list(expected), **TestFmt.make_dict(expected)) + self.assertEqual(getattr(cs, fmtname)(**subs), exp) + + def test_fmt_does_not_recurse(self): + unmodified = C(*TestFmt.nested) + self.assertEqual(unmodified.fmt(**TestFmt.nested_subs), unmodified) + + def test_fmtall_recurses(self): + self.assertEqual( + C(*TestFmt.nested).fmtall(**TestFmt.nested_subs), + C(*TestFmt.nested_subbed), + ) + + +class TestParams(ParamsMixin, unittest.TestCase): + + @params( + ints=C( + C(a=1, b=2, c=3), + expected=dict(a=C(1), b=C(2), c=C(3)), + ), + cs=C( + C(a=C(1), b=C(2), c=C(3)), + expected=dict(a=C(1), b=C(2), 
c=C(3)), + ), + dict_and_kw=C( + C(z=dict(a=C(1), b=2), c=C(3)), + expected=dict(z=C(dict(a=C(1), b=2)), c=C(3)), + ), + params_and_kw=C( + C(Params(z=dict(a=C(1), b=2)), c=C(3)), + expected=dict(z=C(dict(a=C(1), b=2)), c=C(3)), + ), + params_as_c_arg=C( + C(z=C(Params(y=dict(a=C(1), b=2)))), + expected=dict(z=C(Params(y=C(dict(a=C(1), b=2))))), + ), + params_as_kw=C( + C(z=Params(y=dict(a=C(1), b=2))), + expected=dict(z=C(Params(y=C(dict(a=C(1), b=2))))), + ), + ) + def test_valid_data(self, callspec, expected): + result = callspec(Params) + self.assertEqual(dict(result), expected) + self.assertIsInstance(result, Params) + + def test_repr(self): + self.assertEqual( + repr(Params(a=1, b=C(3), z=dict(a=1, b=2))), + "Params(a=C(1), b=C(3), z=C({'a': 1, 'b': 2}))", + ) + + @params(int=1, dict=dict(a=1), params=Params(a=1), c=C(1), d=C(Params(z=1))) + def test_setitem_results_in_c(self, value): + p = Params() + p['foo'] = value + self.assertEqual(p['foo'], value if isinstance(value, C) else C(value)) + + @params_map + def for_init_and_update(cs, msg): + yield 'init', C(Params, cs, msg) + yield 'update', C(Params().update, cs, msg) + + @params( + for_init_and_update( + params_kw = C(C(Params(a=1), a=7), msg=r'a=7'), + two_params = C(C(Params(a=1), Params(a=7)), msg=r'a=C\(7\)'), + ), + setitem_duplicate = C(Params(a=1).__setitem__, C('a', 7), msg='a=7'), + setitem_non_identifier = C(Params().__setitem__, C('0', 7), msg='0'), + ) + def test_duplicate_keys_disallowed(self, meth, callspec, msg): + with self.assertRaisesRegex(ValueError, msg): + callspec(meth) + + @params(for_init_and_update(TYPED_VALUES)) + def test_invalid_data_types(self, meth, typ, val): + msg = f'(?=.*1)(?=.*Params)(?=.*{typ})' + with self.assertRaisesRegex(TypeError, msg): + meth(val) + + +class TestParameterizingTests(AssertMixin, ParamsMixin, unittest.TestCase): + + def _test_success(self, testcase, testname): + res = unittest.TestResult() + testcase(methodName=testname).run(res) + 
self.assertEqual(res.testsRun, 1) + self.assertEqual(res.failures, []) + self.assertEqual(res.errors, []) + self.assertTrue(res.wasSuccessful()) + + def _test_error(self, testcase, testname, expected_error_regex): + res = unittest.TestResult() + testcase(methodName=testname).run(res) + self.assertEqual(res.testsRun, 1) + self.assertEqual(res.failures, []) + self.assertEqual(len(res.errors), 1, "wrong number of errors raised") + self.assertRegex(res.errors[0][1], expected_error_regex) + self.assertFalse(res.wasSuccessful()) + + def test_normal_tests_run(self): + check = [] + class Test(ParamsMixin, unittest.TestCase): + def test_normal_tests_run(self): + check.append(1) + self._test_success(Test, 'test_normal_tests_run') + self.assertEqual(check, [1]) + + class ParameterizeFixture(ParamsMixin, unittest.TestCase): + + @params(a=C(1), b=C(2), c=C(3)) + def test_kw(self, value): + self.check.append(value) + + @params(Params(a=C(1), b=C(2), c=C(3))) + def test_arg(self, value): + self.check.append(value) + + @params + def test_params(self, value): + self.check.append(value) + params_test_params = Params(a=C(1), b=C(2), c=C(3)) + + @params(Params(a=C(1)), b=C(2)) + def test_multiple_sources(self, value): + self.check.append(value) + params_test_multiple_sources = Params(c=C(3)) + + @params(Params(a=C(1), b=C(2)), Params(c=C(3), d=C(4)), e=C(5), f=C(6)) + def test_multiple_multiple(self, value): + self.check.append(value) + params_test_multiple_multiple = Params(g=C(7), h=C(8)) + params_test_multiple_multiple__more = Params(i=C(9), h=C(10)) + + expected_names = [ + *[ + f'test_{n}__{k}' + for k in 'abc' + for n in ('kw', 'arg', 'params', 'multiple_sources') + ], + *[f'test_multiple_multiple__{k}' for k in 'abcdefgh'], + *[f'test_multiple_multiple__more__{k}' for k in 'ih'], + ] + + @params( + as_value('kw', 'arg', 'params', 'multiple_sources'), + multiple_multiple=C( + 'multiple_multiple', + expected={c: n for n, c in enumerate('abcdefgh', 1)}, + ), +
multiple_multiple_more=C( + 'multiple_multiple__more', + expected={c: n for n, c in enumerate('ih', 9)}, + ) + ) + def test_parameterization(self, name, expected=dict(a=1, b=2, c=3)): + self.ParameterizeFixture.check = [] + values = [] + for k, v in expected.items(): + self._test_success(self.ParameterizeFixture, f'test_{name}__{k}') + values.append(v) + self.assertEqual(self.ParameterizeFixture.check, values) + with self.assertRaisesRegex( + ValueError, + rf'(?i)(?=.*no.*test.*method)(?=.*test_{name}__bad)', + ): + self.ParameterizeFixture(f'test_{name}__bad') + + class RawValueFixture(ParamsMixin, unittest.TestCase): + + expected = dict(a=1, b=(2, 3), c=dict(z=4)) + + @params(**expected) + def test_kw(self, value): + self.check.append(value) + + @params(Params(**expected)) + def test_arg(self, value): + self.check.append(value) + + @params + def test_params(self, value): + self.check.append(value) + params_test_params = Params(**expected) + + @params_map + def identity(v): + yield '', v + @params + def test_params_map(self, value): + self.check.append(value) + params_test_params_map = identity(**expected) + + expected_names = [ + f'test_{n}__{k}' + for k in expected + for n in ('kw', 'arg', 'params', 'params_map') + ] + + @params(as_value('kw', 'arg', 'params', 'params_map')) + def test_raw_values_are_handled(self, name): + self.RawValueFixture.check = [] + values = [] + for k, v in self.RawValueFixture.expected.items(): + self._test_success(self.RawValueFixture, f'test_{name}__{k}') + values.append(v) + self.assertEqual(self.RawValueFixture.check, values) + + class ParamsAttributeFixture(ParamsMixin, unittest.TestCase): + + @params + def test(self, n): + self.check.append(('test', n)) + params_test = Params(a='test') + params_test__more = Params(a='test') + params_test__more__still = Params(a='test') + + @params + def test_(self, n): + self.check.append(('test', n)) + params_test_ = Params(a='test') + params_test___more = Params(a='test') +
params_test___more__still = Params(a='test') + + @params + def test__(self, n): + self.check.append(('test__', n)) + params_test__ = Params(a='test__') + params_test____more = Params(a='test__') + params_test____more__still = Params(a='test__') + + @params + def test___(self, n): + self.check.append(('test___', n)) + params_test___ = Params(a='test___') + params_test_____more = Params(a='test___') + params_test_____more__still = Params(a='test___') + + @params + def test____(self, n): + self.check.append(('test____', n)) + params_test____ = Params(a='test____') + params_test______more = Params(a='test____') + params_test______more__still = Params(a='test____') + + @params + def test__foo(self, n): + self.check.append(('test__foo', n)) + params_test__foo = Params(a='test__foo') + params_test__foo__more = Params(a='test__foo') + params_test__foo__more__still = Params(a='test__foo') + + @params + def test___foo(self, n): + self.check.append(('test___foo', n)) + params_test___foo = Params(a='test___foo') + params_test___foo__more = Params(a='test___foo') + params_test___foo__more__still = Params(a='test___foo') + + @params + def test____foo(self, n): + self.check.append(('test____foo', n)) + params_test____foo = Params(a='test____foo') + params_test____foo__more = Params(a='test____foo') + params_test____foo__more__still = Params(a='test____foo') + + @params + def test_foo__bar(self, n): + self.check.append(('test_foo__bar', n)) + params_test_foo__bar = Params(a='test_foo__bar') + params_test_foo__bar__more = Params(a='test_foo__bar') + params_test_foo__bar__more__still = Params(a='test_foo__bar') + + @params + def test_foo__bar__baz(self, n): + self.check.append(('test_foo__bar__baz', n)) + params_test_foo__bar__baz = Params(a='test_foo__bar__baz') + params_test_foo__bar__baz__more = Params(a='test_foo__bar__baz') + params_test_foo__bar__baz__more__still = Params(a='test_foo__bar__baz') + + expected_names = [ + 'test__a', + 'test__more__a', + 'test__more__still__a', + 
'test___a', + 'test___more__a', + 'test___more__still__a', + 'test____a', + 'test____more__a', + 'test____more__still__a', + 'test_____a', + 'test_____more__a', + 'test_____more__still__a', + 'test______a', + 'test______more__a', + 'test______more__still__a', + 'test__foo__a', + 'test__foo__more__a', + 'test__foo__more__still__a', + 'test___foo__a', + 'test___foo__more__a', + 'test___foo__more__still__a', + 'test____foo__a', + 'test____foo__more__a', + 'test____foo__more__still__a', + 'test_foo__bar__a', + 'test_foo__bar__more__a', + 'test_foo__bar__more__still__a', + 'test_foo__bar__baz__a', + 'test_foo__bar__baz__more__a', + 'test_foo__bar__baz__more__still__a', + ] + + @params(as_value(*ParamsAttributeFixture.expected_names)) + def test_params_attach_to_correct_tests(self, name): + self.ParamsAttributeFixture.check = [] + self._test_success(self.ParamsAttributeFixture, name) + self.assertEqual(*self.ParamsAttributeFixture.check[0]) + + @params( + for_each_function( + ParameterizeFixture, + RawValueFixture, + ParamsAttributeFixture + )(C()), + ) + def test_names_are_as_expected(self, fixture): + test_names = [x for x in dir(fixture) if x.startswith('test_')] + self.assertEqual(sorted(test_names), sorted(fixture.expected_names)) + + def test_empty_parameters_is_an_error_by_default(self): + msg = r"(?i)(?=.*'test_foo')(?=.*no.*param)" + with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + @params() + def test_foo(self): + pass + params_test_foo = Params() + + def test_empty_decorator_is_ok_when_check_disabled(self): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params() + def test_foo(self): + pass + + def test_empty_parameters_is_ok_when_check_disabled(self): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params + def test_foo(self): + pass + params_test_foo = Params() + + def test_no_parameter_sets_is_an_error(self): + msg = r"(?i)(?=.*no.*param)(?=.*'test_foo')" + 
with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params + def test_foo(self): + pass + + def test_no_parameter_sets_is_an_error_even_when_check_disabled(self): + msg = r"(?i)(?=.*no.*param)(?=.*'test_foo')" + with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + paramsRequired = False + @params + def test_foo(self): + pass + + def test_params_and_no_decorator_is_an_error(self): + with self.assertRaisesRegex( + ValueError, r'(?i)(?=.*params_test_foo)(?=.*no.*test)', + ): + class Test(ParamsMixin, unittest.TestCase): + def test_foo(self): + pass + params_test_foo = Params() + + def test_params_with_no_exactly_matching_test_is_an_error(self): + with self.assertRaisesRegex( + ValueError, r'(?i)(?=.*params_test_foo_bar)(?=.*no.*test)', + ): + class Test(ParamsMixin, unittest.TestCase): + @params + def test_foo(self): + pass + params_test_foo_bar = Params() + + def test_params_args_keys_must_differ(self): + with self.assertRaisesRegex(ValueError, r'ggg=.*6'): + class Test(ParamsMixin, unittest.TestCase): + @params(Params(xzy=1, b=2, ggg=3), Params(ggg=6, xzy=7)) + def test_foo(self): + pass + + def test_params_args_keys_must_differ_from_kws(self): + with self.assertRaisesRegex(ValueError, r'ggg=.*6'): + class Test(ParamsMixin, unittest.TestCase): + @params(Params(xzy=1, b=2, ggg=3), ggg=6, xzy=7) + def test_foo(self): + pass + + def test_params_args_keys_must_differ_from_params_attr_keys(self): + with self.assertRaisesRegexEx( + ValueError, r'params_test_foo', + ValueError, r'ggg=.*6', + ): + class Test(ParamsMixin, unittest.TestCase): + @params(Params(xzy=1, b=2, ggg=3)) + def test_foo(self): + pass + params_test_foo = Params(ggg=6, xzy=7) + + def test_kws_must_differ_from_params_attr_keys(self): + with self.assertRaisesRegexEx( + ValueError, r'params_test_foo', + ValueError, r'ggg=.*6', + ): + class Test(ParamsMixin, unittest.TestCase): + @params(xzy=1, b=2, 
ggg=3) + def test_foo(self): + pass + params_test_foo = Params(ggg=6, xzy=7) + + def test_params_attr_keys_must_differ(self): + with self.assertRaisesRegexEx( + ValueError, r"'params_test_bar__foo'", + ValueError, r'ggg=.*6', + ): + class Test(ParamsMixin, unittest.TestCase): + @params + def test_bar(self): + pass + params_test_bar = Params(foo__ggg=6, xzy=7) + params_test_bar__foo = Params(ggg=6, xzy=7) + + @params(TYPED_VALUES) + def test_non_params_arg_to_decorator_is_invalid(self, typ, val): + msg = fr'(?=.*1)(?=.*Params)(?=.*{typ})' + with self.assertRaisesRegex(TypeError, msg): + class Test(ParamsMixin, unittest.TestCase): + # we have to have a dummy argument here because unlike any + # normal call we'd otherwise only be passing one argument, and + # when we pass params exactly one callable it will think it is + # supposed to wrap it. Which is what it should do, but not + # what we are testing here. + @params(Params(dummy=1), val) + def test_bad_arg(self): + pass + + @params(TYPED_VALUES) + def test_non_params_value_for_params_attr_is_invalid(self, typ, val): + msg = fr'(?i)(?=.*params_test_bad_value)(?=.*not.*{typ})' + with self.assertRaisesRegex(ValueError, msg): + class Test(ParamsMixin, unittest.TestCase): + params_test_bad_value = val + + def test_debug(self): + with captured_stdout() as stdout: + class Test(ParamsMixin, unittest.TestCase): + paramsDebug = True + paramsRequired = False + def test_dummy(): pass + @params + def test_foo(self, a): pass + params_test_foo = Params(x=7, y=3) + @params(a=1, b=2) + def test_bar(self, z): pass + params_test_bar = Params(c=4, d=6) + self.assertEqual( + stdout.getvalue(), + # Making this an exact match means any change to the debug + # output requires a change here. On the other hand, that also + # means that temporary changes to the debug output during bug + # fixing in params_map itself will be caught by this test so + # they don't sneak in to production code unintentionally. 
+ dedent("""\ + @params method 'test_foo' + params_ attribute 'params_test_foo' + @params method 'test_bar' + params_ attribute 'params_test_bar' + 'test_foo' has no decorator params and 1 params_ attribute + generated test_foo__x(7) + generated test_foo__y(3) + 'test_bar' has decorator params and 1 params_ attribute + generated test_bar__a(1) + generated test_bar__b(2) + generated test_bar__c(4) + generated test_bar__d(6) + """) + ) + + +class Test_params_map(AssertMixin, ParamsMixin, unittest.TestCase): + + @params + def test(self, callspec, expected): + i = 0 + @params_map + def numbered_params(*args, **kw): + nonlocal i + # With this 'if' we test params_map handling being handed raw data + yield f't{i}', args[0] if len(args) == 1 else C(*args, **kw) + i += 1 + result = callspec(numbered_params) + self.assertEqual(dict(result), expected) + self.assertIsInstance(result, Params) + + params_test__value_wrapping = Params( + string = C( C('abc'), dict(t0=C('abc')) ), + char = C( C(C('a')), dict(t0=C('a')) ), + tuple = C( C(('a', 2)), dict(t0=C(('a', 2))) ), + list = C( C(['b', 7]), dict(t0=C(['b', 7])) ), + dict = C( C(dict(a=1, b=2)), dict(t0=C(dict(a=1, b=2))) ), + multiple = C( C(4, 7, 9), dict(t0=C(4), t1=C(7), t2=C(9)) ), + kw_only = C( C(x=1, y=C(7)), dict(x__t0=C(1), y__t1=C(7)) ), + mixed = C( + C(1, (3, 5), z=[0, 1]), + dict(t0=C(1), t1=C((3, 5)), z__t2=C([0, 1])), + ), + mixed2 = C( + C(4, z=7, b=9), + dict(t0=C(4), z__t1=C(7), b__t2=C(9)), + ), + ) + + params_test__flattening = Params( + one_pset = C( + C(Params(a=1, b=2)), + dict(a__t0=C(1), b__t1=C(2)), + ), + pset_and_duplicator = C( + C( + Params(x=1, y=2, z=3), + params_map(lambda v: [('z', v), ('x', v)])( + Params(a='a', b='b'), + ), + ), + dict( + x__t0=C(1), + y__t1=C(2), + z__t2=C(3), + a__z__t3=C('a'), + a__x__t4=C('a'), + b__z__t5=C('b'), + b__x__t6=C('b'), + ), + ), + two_psets_and_kewords = C( + C(Params(a=0, b=1), Params(c=2, d=3), e=4, f=5), + {f"{chr(ord('a')+i)}__t{i}": C(i) for i in 
range(6)}, + ), + ) + + params_test__only = Params( + no_extra_name = C( + C(params_map(lambda v: only(C(v+1)))(a=1, b=2)), + dict(a__t0=C(2), b__t1=C(3)), + ), + adds_name = C( + C(params_map(lambda v: only('z', C(v+1)))(a=1, b=2)), + dict(a__z__t0=C(2), b__z__t1=C(3)), + ), + generates_name = C( + C(params_map(lambda v: only(chr(ord('b') + v), C(v+1)))(1, 2)), + dict(c__t0=C(2), d__t1=C(3)), + ), + ) + + def test_output_can_be_zero_or_many(self): + @params_map(with_name=True) + def zero_or_many(name, *args, **kw): + if name == 'skip': + return + if name == 'dup': + yield '1', C(*args, **kw) + yield '2', C(*args, **kw) + yield '3', C(*args, **kw) + else: + yield '', C(*args, **kw) + self.assertEqual( + zero_or_many(dup=C(1), skip=C(2), other=C(3)), + dict(dup__1=C(1), dup__2=C(1), dup__3=C(1), other=C(3)), + ) + + def test_composing_maps(self): + @params_map + def add_args(foo, bar): + yield foo, C(foo + bar) + @params_map + def no_zed(v): + yield '', C(v.removesuffix('zed')) + round1 = add_args(a=C('abc', 'de'), b=C('x', 'zed')) + self.assertEqual(round1, dict(a__abc=C('abcde'), b__x=C('xzed'))) + self.assertEqual(no_zed(round1), dict(a__abc=C('abcde'), b__x=C('x'))) + + @params( + repeated_name = C( C('a', ('a', 'a')), err="'a'" ), + colliding_names = C( C('a__a', a=C('a')), err=r"a=C\('a'\)" ), + null_name = C( C(''), err="''" ), + good_before_dup = C( C('a', 'b', 'c', 'c'), err="'c'" ), + empty_in_middle = C( C('a', 'b', '', 'c'), err="''" ), + ) + def test_names_must_be_unique(self, callspec, err): + @params_map + def yield_name(*args, **kw): + yield args[0], "doesn't matter" + with self.assertRaisesRegex(ValueError, err): + callspec(yield_name) + + @params + def test_with_name(self, callspec, expected): + @params_map(with_name=True) + def use_first_arg_if_no_name(n, *args, **kw): + label = '' if n else args[0] + yield label, C(*args, **kw) + self.assertEqual(callspec(use_first_arg_if_no_name), expected) + + params_test_with_name = Params( + named = C( C(a=1, 
b=2), expected=Params(a=C(1), b=C(2))), + noname = C( C('a', 'b'), expected=Params(a=C('a'), b=C('b'))), + mixed = C( C('a', b=2), expected=Params(a=C('a'), b=C(2))), + ) + + @params( + params_test_with_name, + cskip = C( C('z', c=1), expected=Params(z=C('z'))), + xyskip = C( C(b=1, x__y='a'), expected=Params(b=C(1))), + noxskip = C(C('m', x__b='a'), expected=Params(m=C('m'), x__b='a')), + ) + def test_with_namelist(self, callspec, expected): + @params_map(with_namelist=True) + def use_first_arg_or_skip_on_c_d_xy(nl, *args, **kw): + self.assertIsInstance(nl, NameList) + if nl.has_any('c', 'd') or nl.has_all('x', 'y'): + return + label = '' if nl else args[0] + yield label, C(*args, **kw) + self.assertEqual(callspec(use_first_arg_or_skip_on_c_d_xy), expected) + + def test_with_name_and_with_namelist_cannot_both_be_true(self): + with self.assertRaisesRegex(ValueError, "(?i)(?=.*both)(?=.*True)"): + @params_map(with_name=True, with_namelist=True) + def foo(): + pass + + @params( + on_call = C(lambda *_: 1/0), + on_bad_value = C(lambda *a: [('x', 0/a[-1])], on_0=True), + in_generator = C( + lambda *a: [(str(x), 0/x) for x in (a[-1]+1, a[-1])], + on_0=True, + ), + too_many_values = C(lambda _: [('', 2, 3)]), + too_few_values = C(lambda _: [('',)]), + non_iterable = C(lambda _: [1]), + non_string_name = C(lambda _: [(1, 1)]), + non_identifier = C(lambda _: [('.', 1)]), + ) + def test_errors_in_wrapped_function(self, func, on_0=False): + test_params_map = params_map(func) + expected = 'zero=C(0)' if on_0 else 'one=C(1)' + with self.assertRaisesRegex(ValueError, re.escape(expected)): + test_params_map(one=1, zero=0) + + @params( + no_argument = C(C(), TypeError, 'missing'), + extra_args = C(C(lambda: 1, 'bar'), TypeError, '2 were given'), + non_func_arg = C(C(1), TypeError, 'int'), + bad_keyword = C(C(bad_key=True), TypeError, 'bad_key'), + ) + def test_bad_arguments(self, callspec, ex, msg): + with self.assertRaisesRegex(ex, msg): + # For the keyword case we get back a 
wrapper and need to call + # it to see the error. Otherwise the error happens before the call. + callspec(params_map)('foo') + + def test_debug(self): + @params_map(debug=True) + def test_map(anarg, k=None): + yield 'arg', C(anarg) + yield 'kw', C(k=k) + with captured_stdout() as stdout: + test_map(x=C('a', k='b'), y=C('d', k='e')) + self.assertEqual( + stdout.getvalue(), + # Making this an exact match means any change to the debug + # output requires a change here. On the other hand, that also + # means that temporary changes to the debug output during bug + # fixing in params_map itself will be caught by this test so + # they don't sneak in to production code unintentionally. + dedent("""\ + flattening using test_map + an='x' av=C('a', k='b') + n='arg' v=C('a') name='x__arg' + n='kw' v=C(k='b') name='x__kw' + an='y' av=C('d', k='e') + n='arg' v=C('d') name='y__arg' + n='kw' v=C(k='e') name='y__kw' + """) + ) + + # The exact causes here are not part of the fixed expectations (just + # that there *is* a cause), but if they change it is worth noticing.
+ + utility_maps = params_map(lambda f, **k: only(f.__name__, C(f, **k)))( + C( + with_names, + as_map=with_names, + ), + C( + as_value, + as_map=as_value, + unnamed_cause_msg=r'(?i)(?=.*invalid)(?=.*1)' + ), + C( + add_label, + as_map=add_label('xxx'), + unnamed_cause_msg=r'x=C\(1\)', + badarg_cause_ex=ValueError, + badarg_cause_msg=r'(?i)(?=.*invalid)(?=.*1)' + ), + C( + include_if, + as_map=include_if(lambda *_: True), + badarg_cause_ex=TypeError, + badarg_cause_msg=r"'int' object is not callable", + ), + C( + include_unless, + as_map=include_unless(lambda *_: False), + badarg_cause_ex=TypeError, + badarg_cause_msg=r"'int' object is not callable", + ), + C( + for_each_name, + as_map=for_each_name('aname'), + unnamed_cause_msg=r"aname=(?=.*exists)(?=.*C\('aname', 1\))", + badarg_cause_ex=ValueError, + badarg_cause_msg=r"(?i)(?=.*invalid label)(?=.*1)", + ), + C( + for_each_function, + as_map=for_each_function(int), + unnamed_cause_msg=r'int=C\(.*int.*1\)', + badarg_cause_ex=AttributeError, + badarg_cause_msg=r"'int'.*__name__", + ), + ) + + @params + def test_utility_maps(self, utility, callspec, expected): + self.assertEqual(callspec(utility), expected) + + params_test_utility_maps = Params( + + params_map( + lambda *a, as_map, **k: only('no_args', C(as_map, C(), dict())) + )(utility_maps), + + with_names = C( + with_names, + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C('a', 1), z=C('z', 'a'), foo=C('foo', ['bar'])), + ), + + as_value = C( + as_value, + C('a', 'foo', 'bar'), + dict(a=C('a'), foo=C('foo'), bar=C('bar')), + ), + + add_label = C( + add_label('xxx'), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a__xxx=C(1), z__xxx=C('a'), foo__xxx=C(['bar'])), + ), + + include_if__include_all = C( + include_if(lambda *_: True), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1), z=C('a'), foo=C(['bar'])), + ), + + include_if__include_none = C( + include_if(lambda *_: False), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(), + ), + + 
include_unless__omit_all = C( + include_unless(lambda *_: True), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(), + ), + + include_unless__omit_none = C( + include_unless(lambda *_: False), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1), z=C('a'), foo=C(['bar'])), + ), + + include_if__include_one_letters = C( + include_if(lambda n, v: n.has_any('a', 'z')), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1), z=C('a')), + ), + + include_unless__omit_one_letters = C( + include_unless(lambda n, v: n.has_any('a', 'z')), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(foo=C(['bar'])), + ), + + include_if__include_int_values = C( + include_if(lambda n, v: type(v) == int), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a=C(1)), + ), + + include_unless__omit_int_values = C( + include_unless(lambda n, v: type(v) == int), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(z=C('a'), foo=C(['bar'])), + ), + + include_if__include_int_values_with_label = C( + include_if(lambda n, v: type(v) == int, label='int'), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(a__int=C(1)), + ), + + include_unless__omit_int_values_with_label = C( + include_unless(lambda n, v: type(v) == int, label='non_int'), + C(Params(a=1), Params(z='a', foo=['bar'])), + dict(z__non_int=C('a'), foo__non_int=C(['bar'])), + ), + + for_each_name = C( + for_each_name('some', 'names'), + C(42, a=1, b=C(2, z=7)), + dict( + some=C('some', 42), + names=C('names', 42), + a__some=C('some', 1), + a__names=C('names', 1), + b__some=C('some', 2, z=7), + b__names=C('names', 2, z=7), + ), + ), + + for_each_function = C( + for_each_function(as_value, add_label), + C(42, a=1, b=C(2, z=7)), + dict( + as_value=C(as_value, 42), + add_label=C(add_label, 42), + a__as_value=C(as_value, 1), + a__add_label=C(add_label, 1), + b__as_value=C(as_value, 2, z=7), + b__add_label=C(add_label, 2, z=7), + ), + ), + + ) + + @params + def test_utility_map_failures( + self, + utility, + callspec, + ex, + 
msg, + cause_ex=None, + cause_msg=None, + ): + with self.assertRaisesRegexEx(ex, msg, cause_ex, cause_msg): + # Some errors only show up when a generated utility is called. + callspec(utility)(t1=1, t2=2) + + params_test_utility_map_failures = Params( + + params_map( + lambda f, as_map=None, unnamed_cause_msg=r'missing.*label', **k: + only('unnamed_input', + C( + as_map, + C(1, 1), + ValueError, fr'(?i)(?=.*{as_map.__name__})(?=.*1)', + ValueError, unnamed_cause_msg, + ), + ) + )(utility_maps), + + params_map( + lambda f, *, badarg_cs=C(1), badarg_cause_ex, badarg_cause_msg, **k: + only( + 'bad_map_maker_arg', + C( + f, + badarg_cs, + ValueError, fr'(?i)(?=.*{f.__name__})(?=.*t1=C\(1\))', + badarg_cause_ex, badarg_cause_msg, + ) + ) + )( + include_if(lambda n, *a, **k: 'badarg_cause_ex' in k)(utility_maps), + ), + + ) + + +class TestNameList(ParamsMixin, unittest.TestCase): + + names_to_list_map = dict( + XonenameX = ['XonenameX'], + Xtwo__namesX = ['Xtwo', 'namesX'], + Xmany__many__names__hereX = ['Xmany', 'many', 'names', 'hereX'], + Xnames_with__underscores_tooX = ['Xnames_with', 'underscores_tooX'], + Xtoo_many___underscores____are_confusingX = [ + 'Xtoo_many', '_underscores', '', 'are_confusingX', + ] + ) + + name_nl_and_list = params_map(with_name=True)( + lambda n, v: only(C(n, NameList(n), v)) + )(**names_to_list_map) + + @params(name_nl_and_list) + def test_str_equals_name(self, name, nl, aslist): + self.assertEqual(name, str(nl)) + + @params(name_nl_and_list) + def test_str_supports_startswith(self, name, nl, aslist): + self.assertTrue(str(nl).startswith(name[:2])) + self.assertFalse(str(nl).startswith('notthestart')) + + @params(name_nl_and_list) + def test_str_supports_endswith(self, name, nl, aslist): + self.assertTrue(str(nl).endswith(name[-2:])) + self.assertFalse(str(nl).endswith('nottheend')) + + @params(name_nl_and_list) + def test_str_supports_in(self, name, nl, aslist): + self.assertTrue(name[4:6] in str(nl)) + self.assertFalse('notthemiddle' in 
str(nl)) + + def test_empty_string_produces_empty_list(self): + self.assertEqual(list(NameList('')), []) + + nl_and_list = params_map(with_name=True)( + lambda n, v: only(C(NameList(n), v)) + )(**names_to_list_map) + + @params(nl_and_list) + def test_list(self, nl, aslist): + self.assertIsInstance(nl, list) + self.assertListEqual(nl, aslist) + + @params(nl_and_list) + def test_indexing(self, nl, aslist): + for i in range(len(aslist)): + self.assertEqual(nl[i], aslist[i]) + + @params(nl_and_list) + def test_contains(self, nl, aslist): + for name in aslist: + self.assertTrue(name in nl) + + # Running all of these for all the examples appears to be a bit of + # overkill, but not only does it exercise the machinery and provide a + # non-trivial example, it found a bug that failed for only one of the + # example names. + + @params_map + def has_all_tests(nl, l): + yield 'one_name', C( nl, C(l[0]), True) + yield 'all_names', C( nl, C(*l), True) + yield 'name_notname', C( nl, C(l[0], 'notname'), False) + yield 'no_name', C( nl, C(''), False) + + @params(has_all_tests(nl_and_list)) + def test_has_all(self, nl, callspec, expected_value): + self.assertEqual(callspec(nl.has_all), expected_value) + + @params_map + def has_any_tests(nl, l): + yield 'one_name', C( nl, C(l[0]), True) + yield 'one_name_tuple', C( nl, C((l[0],)), True) + yield 'one_name_list', C( nl, C([l[0]]), True) + yield 'all_names', C( nl, C(*l), True) + yield 'all_names_tuple', C( nl, C(tuple(l)), True) + yield 'all_names_list', C( nl, C(l), True) + yield 'all_names_dict', C( nl, C({n: n for n in l}), True) + yield 'name_notname', C( nl, C(l[0], 'notname'), True) + yield 'no_names', C( nl, C(), False) + yield 'no_names_list', C( nl, C([]), False) + yield 'one_notname', C( nl, C('notname'), False) + yield 'two_notnames', C( nl, C('notname', 'alsonot'), False) + yield 'null_str_arg', C( nl, C(''), False) + yield 'null_str_tuple', C( nl, C(('',)), False) + yield 'null_str_list', C( nl, C(['',]), False) + + 
@params(has_any_tests(nl_and_list)) + def test_has_any(self, nl, callspec, expected_value): + self.assertEqual(callspec(nl.has_any), expected_value) + + def test_has_any_false_if_empty_name(self): + nl = NameList('') + self.assertFalse(nl.has_any('')) + + def test_has_any_false_partial_names(self): + nl = NameList('foo__bar__bird') + self.assertTrue(nl.has_any('foo', 'bar', 'bird')) + self.assertFalse(nl.has_any('fo', '_bar', 'ird', 'ir', 'or', '')) + + +if __name__ == '__main__': + unittest.main() diff --git a/Lib/test/test_email/test_testing_infrastructure.py b/Lib/test/test_email/test_testing_infrastructure.py new file mode 100644 index 000000000000000..e3b6214998794b1 --- /dev/null +++ b/Lib/test/test_email/test_testing_infrastructure.py @@ -0,0 +1,377 @@ +from email import errors +from test.test_email import TestEmailBase, for_each_character +from test.test_email.params import C, params_map, params, Params + +class TestAssertDefectsMatch(TestEmailBase): + + # The code should behave the same whether the pattern comes direct or + # out of a callable. 
+ @params_map + def direct_and_callable(actual, expected, *args): + yield 'direct', C(actual, expected, *args) + expected = [(lambda x: x, x) for x in expected] + yield 'callable', C(actual, expected, *args) + + @params + def test_success(self, actual, expected): + self.assertDefectsMatch(actual, expected) + + np_checker = lambda s: (errors.NonPrintableDefect, f'.*non-printable.*{s}') + + params_test_success = direct_and_callable( + + no_defects = C([], []), + + one_defect_by_class = C( + [errors.InvalidHeaderDefect('foo')], + [errors.InvalidHeaderDefect], + ), + + one_defect_by_regex = C( + [errors.InvalidHeaderDefect('This is a message')], + [(errors.InvalidHeaderDefect, '.*is a')], + ), + + multiple_defects_by_class = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.InvalidHeaderDefect('This is a different message'), + errors.InvalidHeaderDefect('This is the same message'), + errors.InvalidHeaderDefect('This is the same message'), + ], + [*[errors.InvalidHeaderDefect] * 4], + ), + + multiple_defects_by_regex = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.InvalidHeaderDefect('This is a different message'), + errors.InvalidHeaderDefect('This is the same message'), + errors.InvalidHeaderDefect('This is the same message'), + ], + [ + (errors.InvalidHeaderDefect, '.*the same'), + (errors.InvalidHeaderDefect, '.*is a'), + (errors.InvalidHeaderDefect, '.*different'), + (errors.InvalidHeaderDefect, '.*the same'), + ], + ), + + multiple_different_defects_by_class = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.ObsoleteHeaderDefect('This is a different message'), + errors.NonPrintableDefect('abc'), + ], + [ + errors.InvalidHeaderDefect, + errors.ObsoleteHeaderDefect, + errors.NonPrintableDefect, + ], + ), + + multiple_different_defects_by_regex = C( + [ + errors.InvalidHeaderDefect('This is a message'), + errors.ObsoleteHeaderDefect('This is a different message'), + errors.NonPrintableDefect('abc'), + ], + [ + 
(errors.ObsoleteHeaderDefect, '.*different'), + (errors.NonPrintableDefect, '.*non-printable.*abc'), + (errors.InvalidHeaderDefect, '.*is a'), + ], + ), + + ) + + @params + def test_failure(self, actual, expected, msg): + with self.assertRaisesRegex(AssertionError, msg): + self.assertDefectsMatch(actual, expected) + + params_test_failure = direct_and_callable( + + one_extra_defect_expecting_none = C( + [errors.InvalidHeaderDefect('foo')], + [], + r'(?i)0.*matched.*1.*extra', + ), + + two_extra_defects_expecting_none = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + ], + [], + r'(?i)0.*matched.*2.*extra', + ), + + two_extra_defects_expecting_one = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + ], + [(errors.InvalidHeaderDefect, 'bar')], + r'(?i)1.*matched.*1.*extra', + ), + + three_extra_defects_expecting_one = C( + [ + errors.InvalidHeaderDefect('foo'), + *[errors.InvalidHeaderDefect('bar')]*3, + ], + [(errors.InvalidHeaderDefect, 'bar')], + r'(?is)1.*matched.*3.*extra(?=.*foo)(?=.*bar.*bar)', + ), + + one_missing_defect_expecting_one = C( + [], + [(errors.InvalidHeaderDefect, 'bar')], + r'(?is)0.*matched.*1.*missing.*bar', + ), + + two_missing_defects_expecting_two = C( + [ + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + ], + [ + (errors.InvalidHeaderDefect, 'foo'), + (errors.InvalidHeaderDefect, 'bird'), + ], + r'(?is)0.*matched.*2.*did not match' + r'(?=.*foo)(?=.*bird)(?=.*bar)(?=.*bing)', + ), + + two_missing_defects_expecting_four = C( + [ + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + ], + [ + (errors.InvalidHeaderDefect, 'bar'), + (errors.InvalidHeaderDefect, 'foo'), + (errors.InvalidHeaderDefect, 'bing'), + (errors.InvalidHeaderDefect, 'bird'), + ], + r'(?is)2.*matched.*2.*missing(?=.*foo)(?=.*bird)', + ), + + two_extra_defects_expecting_two = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + 
errors.InvalidHeaderDefect('bing'), + errors.InvalidHeaderDefect('bar'), + ], + [ + (errors.InvalidHeaderDefect, 'bar'), + (errors.InvalidHeaderDefect, 'bing'), + ], + r'(?is)2.*matched.*2.*extra(?=.*foo)(?=.*bar)', + ), + + two_extra_defects_one_missing_expecting_three = C( + [ + errors.InvalidHeaderDefect('foo'), + errors.InvalidHeaderDefect('bar'), + errors.InvalidHeaderDefect('bing'), + errors.InvalidHeaderDefect('bar'), + ], + [ + (errors.InvalidHeaderDefect, 'bar'), + (errors.InvalidHeaderDefect, 'bing'), + (errors.InvalidHeaderDefect, 'bing'), + ], + r'(?is)2.*matched(?=.*2.*extra)(?=.*1.*missing)' + r'(?=.*foo)(?=.*bar)(?=.*bing)', + ), + + actual_is_string = C( + ['foo'], + [], + r'(?is)0.*matched.*1.*extra(?=.*str.*foo)', + ), + + actual_is_tuple = C( + [(errors.InvalidHeaderDefect, 'foo', 'bar')], + [], + r'(?is)0.*matched.*1.*extra(?=.*tuple.*InvalidHeaderDefect.*foo)', + ), + + ) + + @params + def test_bad_expected_patterns(self, actual, expected, msg): + with self.assertRaisesRegex((ValueError, TypeError), msg): + self.assertDefectsMatch(actual, expected) + + params_test_bad_expected_patterns = direct_and_callable( + + not_subscriptable = C( + [], + [1], + r'(?i)(?=.*invalid).*1', + ), + + string = C( + [], + ['foo'], + r'(?i)(?=.*invalid).*foo', + ), + + triple = C( + [], + [(errors.InvalidHeaderDefect, 'foo', 'bar')], + r'(?i)too many values', + ), + + singleton = C( + [], + [(errors.InvalidHeaderDefect,)], + '(?i)not enough values', + ), + + # This only happens if a comparison is made. Which will happen. 
+ regex_is_not_string = C( + [errors.InvalidHeaderDefect('foo')], + [(errors.InvalidHeaderDefect, 200)], + r'(?i)must be string', + ), + + backwards_expected_entry = C( + [], + [('foo', errors.InvalidHeaderDefect)], + r'(?i)(?=.*invalid).*foo.*InvalidHeaderDefect', + ), + + ) + + @params( + multiple_args = C( + [(lambda x, y, z: (errors.InvalidHeaderDefect, z), 'x', 1, 'foo')], + ), + no_args = C([(lambda: errors.InvalidHeaderDefect,)]), + ) + def test_callable_success(self, expected): + self.assertDefectsMatch([errors.InvalidHeaderDefect('foo')], expected) + + @params( + no_args_bad_result = C([(lambda: 'bad value',)], r'(?i)bad value'), + wrong_number_of_args = C([(lambda: 'x', 1)], r'(?i)arguments'), + ) + def test_callable_failure(self, expected, msg): + with self.assertRaisesRegex((ValueError, TypeError), msg): + self.assertDefectsMatch([], expected) + + +class TestForEachCharacter(TestEmailBase): + + @params + def test_for_each_character(self, callspec, chars, expected): + callspecs = Params(test=callspec) + expected = Params(**{f'test__{c}': v for c, v in expected.items()}) + self.assertEqual(for_each_character(chars)(callspecs), expected) + + @params_map + def for_each_value_type(callspec, chars, expected): + yield '', C(callspec, chars, expected) + yield 'in_list', C( + C(foo=['bar', callspec.args[0], 'z']), + chars=chars, + expected={ + n: C(foo=['bar', v.args[0], 'z']) for n, v in expected.items()}, + ) + yield 'in_tuple', C( + C(foo=('bar', callspec.args[0], 'z')), + chars=chars, + expected={ + n: C(foo=('bar', v.args[0], 'z')) for n, v in expected.items()}, + ) + yield 'in_dict', C( + C(foo=dict(a=callspec.args[0], z=1)), + chars=chars, + expected={ + n: C(foo=dict(a=v.args[0], z=1)) for n, v in expected.items()}, + ) + + params_test_for_each_character = for_each_value_type( + + no_subs = C( + C('no subs'), + chars='./', + expected=dict(full_stop=C('no subs'), solidus=C('no subs')), + ), + + one_sub = C( + C('one{char}sub'), + chars='./', + 
expected=dict(full_stop=C('one.sub'), solidus=C('one/sub')), + ), + + all_three_sub_types = C( + C('plain {char} escaped {echar} escaped repr {erchar}.'), + chars='\t', + expected=dict(HT=C('plain \t escaped \\\t escaped repr \\\\t.')), + ), + + a_list = C( + C(['a', '{char}', '{echar}']), + chars='/', + expected=dict(solidus=C(['a', '/', '/'])), + ), + + a_tuple = C( + C(('{char}{echar}', '{erchar}')), + chars='.', + expected=dict(full_stop=C((r'.\.', r'\.'))), + ), + + a_dict = C( + C(dict(a='{char}', b='{erchar}')), + chars='a', + expected=dict(latin_small_letter_a=C(dict(a='a', b='a'))), + ), + + ) + + def test_for_each_character_complex_input(self): + callspecs = Params( + two_positional=C('pos {char} 1', 'pos {echar} 2'), + all_value_types=C( + '{char}', + d=dict(a='a {char}', r='{erchar}'), + l=['a', '{char}', 'b'], + t=('{char}', ('{char}', 1)), + ), + dummy=C('no subs'), + ) + chars = 'X\t.' + expected = Params( + two_positional__latin_capital_letter_x=C('pos X 1', 'pos X 2'), + two_positional__full_stop=C('pos . 1', r'pos \. 2'), + two_positional__HT=C('pos \t 1', 'pos \\\t 2'), + all_value_types__latin_capital_letter_x=C( + 'X', + d=dict(a='a X', r='X'), + l=['a', 'X', 'b'], + t=('X', ('X', 1)), + ), + all_value_types__HT=C( + '\t', + d=dict(a='a \t', r='\\\\t'), + l=['a', '\t', 'b'], + t=('\t', ('\t', 1)), + ), + all_value_types__full_stop=C( + '.', + d=dict(a='a .', r='\\.'), + l=['a', '.', 'b'], + t=('.', ('.', 1)), + ), + dummy__latin_capital_letter_x=C('no subs'), + dummy__full_stop=C('no subs'), + dummy__HT=C('no subs'), + ) + self.assertEqual(for_each_character(chars)(callspecs), expected)