diff --git a/docs/customize.rst b/docs/customize.rst index 2a14314..9df179b 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -61,6 +61,34 @@ Other editable attributes * :py:obj:`~nameparser.config.Constants.force_mixed_case_capitalization` - If set, forces the capitalization of mixed case strings when :py:meth:`~nameparser.parser.HumanName.capitalize` is called. * :py:obj:`~nameparser.config.Constants.suffix_delimiter` - additional delimiter used to split suffix groups after comma-splitting, e.g. ``" - "`` for names like ``"Jane Smith, RN - CRNA"``. Defaults to ``None`` (disabled). * :py:obj:`~nameparser.config.Constants.initials_separator` - string placed between consecutive initials within the same name group (after the delimiter). Defaults to ``" "``, so ``"A. K."``; set to ``""`` for compact ``"A.K."``. +* :py:obj:`~nameparser.config.Constants.patronymic_name_order` - If set, detects Russian formal-order names (``Surname GivenName Patronymic``) via a trailing East-Slavic patronymic suffix and rotates the parts to Western order (``first=GivenName``, ``middle=Patronymic``, ``last=Surname``). Opt-in; see subsection below. + + +Russian Formal Name Order +~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default the parser treats all three-part names as ``First Middle Last``. For +Russian data in formal order (``Surname GivenName Patronymic``), enable +``patronymic_name_order``:: + + >>> from nameparser import HumanName + >>> from nameparser.config import Constants + >>> C = Constants(patronymic_name_order=True) + >>> hn = HumanName("Ivanov Ivan Ivanovich", constants=C) + >>> hn.first, hn.middle, hn.last + ('Ivan', 'Ivanovich', 'Ivanov') + +Detection is anchored on a recognised East-Slavic patronymic suffix +(``-ovich``, ``-ovna``, ``-evich``, ``-evna``, ``-ichna``, and the irregular +forms ``-ilyich``, ``-kuzmich``, ``-lukich``, ``-fomich``, ``-fokich``; same +patterns in Cyrillic). 
Reordering is skipped whenever the name contains +a comma: the parser's standard ``Last, First Middle`` path already handles +comma-separated formal order, so rotating again would double-transform it. + +**Opt-in tradeoff:** when the flag is on, any three-part name whose last token ends in a +patronymic suffix (and whose middle token does not) is reordered — including Western names with +patronymic-form surnames such as ``"David Michael Abramovich"``. Enable this +flag only when your data is predominantly Russian formal-order names. Splitting last-name prefix particles @@ -74,12 +102,14 @@ automatically:: >>> from nameparser import HumanName >>> from nameparser.config import CONSTANTS - >>> CONSTANTS.prefixes.add('op') + >>> CONSTANTS.prefixes.add('op') # doctest: +ELLIPSIS + SetManager({...}) >>> HumanName("Jan op den Berg").last_base 'Berg' >>> HumanName("Jan op den Berg").last_prefixes 'op den' - >>> CONSTANTS.prefixes.remove('op') + >>> CONSTANTS.prefixes.remove('op') # doctest: +ELLIPSIS + SetManager({...}) Note the ``remove`` call at the end — ``customize.rst`` examples share global ``CONSTANTS``, so mutations must be reversed to avoid affecting later examples. 
diff --git a/docs/release_log.rst b/docs/release_log.rst index ed5a5f4..ca9104d 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -19,6 +19,7 @@ Release Log - Fix extra whitespace before punctuation in ``str()`` output when a ``string_format`` field is empty (closes #139) - Fix ``'apn aprn'`` split into separate ``suffix_acronyms`` entries so each is recognized independently (closes #155) - Add ``last_base``, ``last_prefixes`` (and ``_list`` variants) plus ``family`` / ``family_prefixes`` aliases for splitting last-name prefix particles (tussenvoegsels) from the core surname (#130, #132) + - Add ``patronymic_name_order`` flag to ``Constants`` (passed to ``HumanName`` via ``constants=``) for opt-in detection and reordering of Russian formal-order names (Surname GivenName Patronymic) (#85) * 1.2.1 - June 19, 2026 - Fix ``initials()`` interpolating the literal ``None`` for empty name parts when ``empty_attribute_default = None`` (e.g. ``"J. None D."``); empty parts now render as an empty string and a fully-empty result returns ``empty_attribute_default`` - Add ``python -m nameparser "Name String"`` command-line helper that prints a parsed name diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 1d5c034..4d7bae4 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -342,6 +342,34 @@ class Constants: """ + patronymic_name_order = False + """ + If set, detects names in Russian formal order (``Surname GivenName Patronymic``) + by recognizing a trailing East-Slavic patronymic suffix on the last token, and + rotates the three name parts so that ``first``/``middle``/``last`` map to + given name / patronymic / surname respectively. Detection requires exactly one + token in each of first, middle, and last; names with multi-part given names or + multiple middle names are left unchanged. + + Opt-in because a Western person whose surname happens to end in a patronymic + suffix (e.g. 
``"David Michael Abramovich"``) will be reordered incorrectly + when the flag is on. Enable only when your data is predominantly Russian + formal-order names. + + For per-instance control without a shared ``Constants``, pass a dedicated + instance: ``HumanName("...", constants=Constants(patronymic_name_order=True))``. + + .. doctest:: + + >>> from nameparser import HumanName + >>> from nameparser.config import Constants + >>> C = Constants(patronymic_name_order=True) + >>> hn = HumanName("Ivanov Ivan Ivanovich", constants=C) + >>> hn.first, hn.middle, hn.last + ('Ivan', 'Ivanovich', 'Ivanov') + + """ + def __init__(self, prefixes: Iterable[str] = PREFIXES, suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, @@ -350,7 +378,8 @@ def __init__(self, first_name_titles: Iterable[str] = FIRST_NAME_TITLES, conjunctions: Iterable[str] = CONJUNCTIONS, capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS, - regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES + regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES, + patronymic_name_order: bool = False, ) -> None: # These four descriptor assignments call _CachedUnionMember.__set__, which # calls _invalidate_pst() and establishes self._pst. 
They must come before @@ -363,6 +392,7 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) + self.patronymic_name_order = patronymic_name_order def _invalidate_pst(self) -> None: self._pst = None diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index a97944f..3e62710 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -23,6 +23,14 @@ ("emoji",re_emoji), ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), ("space_before_comma", re.compile(r'\s+,', re.U)), + ("patronymic", re.compile( + r'(ovich|ovna|evich|evna|ichna|ilyich|kuzmich|lukich|fomich|fokich)$', + re.I | re.U, + )), + ("patronymic_cyrillic", re.compile( + r'(ович|овна|евич|евна|ична|ильич|кузьмич|лукич|фомич|фокич)$', + re.I | re.U, + )), ]) """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/parser.py b/nameparser/parser.py index b064e15..1ffcb53 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -89,6 +89,7 @@ class HumanName: last_list: list[str] suffix_list: list[str] nickname_list: list[str] + _had_comma: bool def __init__( self, @@ -117,6 +118,7 @@ def __init__( self.initials_delimiter = initials_delimiter if initials_delimiter is not None else self.C.initials_delimiter self.initials_separator = initials_separator if initials_separator is not None else self.C.initials_separator self.suffix_delimiter = suffix_delimiter if suffix_delimiter is not None else self.C.suffix_delimiter + self._had_comma = False if (first or middle or last or title or suffix or nickname): self.first = first self.middle = middle @@ -645,6 +647,20 @@ def is_an_initial(self, value: str) -> bool: """ return bool(self.C.regexes.initial.match(value)) + def is_patronymic(self, piece: str) -> bool: + """ + Return True if ``piece`` ends with a recognised East-Slavic patronymic + suffix, 
checked against both Latin-script and Cyrillic patterns in + ``self.C.regexes``. Latin suffixes: ``-ovich``, ``-ovna``, ``-evich``, + ``-evna``, ``-ichna``, and the irregular forms ``-ilyich``, ``-kuzmich``, + ``-lukich``, ``-fomich``, ``-fokich``. Cyrillic equivalents are matched + by a separate pattern. + """ + return bool( + self.C.regexes.patronymic.search(piece) + or self.C.regexes.patronymic_cyrillic.search(piece) + ) + # full_name parser @property @@ -683,6 +699,29 @@ def pre_process(self) -> None: self.parse_nicknames() self.squash_emoji() + def handle_patronymic_name_order(self) -> None: + """ + When patronymic_name_order is enabled, detect Russian formal order + (Surname GivenName Patronymic) and rotate to Western order. + Fires only for no-comma, single-token first/middle/last where the last + token is a patronymic and the middle token is not. Title, suffix, and + nickname parts do not affect this guard — reordering proceeds regardless + of whether they are present. + """ + if ( + not self._had_comma + and len(self.first_list) == 1 + and len(self.middle_list) == 1 + and len(self.last_list) == 1 + and self.is_patronymic(self.last_list[0]) + and not self.is_patronymic(self.middle_list[0]) + ): + self.first_list, self.middle_list, self.last_list = ( + self.middle_list, + self.last_list, + self.first_list, + ) + def post_process(self) -> None: """ This happens at the end of the :py:func:`parse_full_name` after @@ -690,6 +729,8 @@ def post_process(self) -> None: and :py:func:`handle_capitalization`. 
""" self.handle_firstnames() + if self.C.patronymic_name_order: + self.handle_patronymic_name_order() self.handle_capitalization() def fix_phd(self) -> None: @@ -769,6 +810,7 @@ def parse_full_name(self) -> None: # break up full_name by commas parts = [x.strip() for x in self._full_name.split(",")] + self._had_comma = len(parts) > 1 if self.suffix_delimiter and len(parts) > 1: expanded = [parts[0]] diff --git a/tests/test_patronymic_order.py b/tests/test_patronymic_order.py new file mode 100644 index 0000000..8b2b036 --- /dev/null +++ b/tests/test_patronymic_order.py @@ -0,0 +1,181 @@ +from nameparser import HumanName +from nameparser.config import Constants +from tests.base import HumanNameTestBase + + +def test_latin_patronymic_matches() -> None: + # One common suffix and one irregular — the integration tests cover the rest. + C = Constants() + assert C.regexes.patronymic.search("Ivanovich") + assert C.regexes.patronymic.search("Ilyich") + + +def test_latin_patronymic_rejects_non_patronymic() -> None: + # EMPTY_REGEX (the default for missing keys) matches everything, + # so this test is red until the real pattern is in place. + C = Constants() + assert not C.regexes.patronymic.search("Smith") + + +def test_latin_patronymic_end_anchored() -> None: + # A surname ending in a patronymic suffix matches; the end-anchor does not + # prevent this. The parser guard tests verify reordering is suppressed. + C = Constants() + assert C.regexes.patronymic.search("Abramovich") + + +def test_cyrillic_patronymic_matches() -> None: + # One common suffix and one irregular. 
+ C = Constants() + assert C.regexes.patronymic_cyrillic.search("Иванович") + assert C.regexes.patronymic_cyrillic.search("ильич") + + +def test_cyrillic_patronymic_rejects_non_patronymic() -> None: + C = Constants() + assert not C.regexes.patronymic_cyrillic.search("Иванов") + + +class PatronymicNameOrderReorderTests(HumanNameTestBase): + """Names that SHOULD be rotated when the flag is on.""" + + def setup_method(self) -> None: + self.C = Constants(patronymic_name_order=True) + + def hn(self, name: str) -> HumanName: + return HumanName(name, constants=self.C) + + def test_canonical_latin(self) -> None: + n = self.hn("Ivanov Ivan Ivanovich") + assert n.first == "Ivan" + assert n.middle == "Ivanovich" + assert n.last == "Ivanov" + + def test_sergeevich(self) -> None: + n = self.hn("Zarubkin Alexander Sergeevich") + assert n.first == "Alexander" + assert n.middle == "Sergeevich" + assert n.last == "Zarubkin" + + def test_hyphenated_surname(self) -> None: + # A hyphenated surname counts as one token. + n = self.hn("Blokin-Mechtalin Konstantin Yurievich") + assert n.first == "Konstantin" + assert n.middle == "Yurievich" + assert n.last == "Blokin-Mechtalin" + + def test_surname_looks_like_patronymic(self) -> None: + # "Petsevich" ends in -evich but is in the FIRST position. + n = self.hn("Petsevich Sergey Vitalyevich") + assert n.first == "Sergey" + assert n.middle == "Vitalyevich" + assert n.last == "Petsevich" + + def test_cyrillic(self) -> None: + n = self.hn("Иванов Иван Иванович") + assert n.first == "Иван" + assert n.middle == "Иванович" + assert n.last == "Иванов" + + def test_title_preserved(self) -> None: + n = self.hn("Dr. Ivanov Ivan Ivanovich") + assert n.title == "Dr." + assert n.first == "Ivan" + assert n.middle == "Ivanovich" + assert n.last == "Ivanov" + + def test_suffix_preserved(self) -> None: + n = self.hn("Ivanov Ivan Ivanovich Jr.") + assert n.first == "Ivan" + assert n.middle == "Ivanovich" + assert n.last == "Ivanov" + assert n.suffix == "Jr." 
+ + def test_western_patronymic_surname_reordered_when_flag_on(self) -> None: + # Documented opt-in tradeoff: a Western name whose last token ends in a + # patronymic suffix is reordered incorrectly. Not a bug to fix. + n = self.hn("David Michael Abramovich") + assert n.first == "Michael" + assert n.middle == "Abramovich" + assert n.last == "David" + + +class PatronymicNameOrderGuardsTests(HumanNameTestBase): + """Names that must NOT be reordered even when the flag is on.""" + + def setup_method(self) -> None: + self.C = Constants(patronymic_name_order=True) + + def hn(self, name: str) -> HumanName: + return HumanName(name, constants=self.C) + + def test_already_correct_order(self) -> None: + # middle is patronymic → already in Western order, do not rotate + n = self.hn("Ivan Ivanovich Ivanov") + assert n.first == "Ivan" + assert n.middle == "Ivanovich" + assert n.last == "Ivanov" + + def test_middle_is_patronymic_surname_ends_ovich(self) -> None: + # "Roman Arkadyevich Abramovich": middle IS patronymic → guard fires + n = self.hn("Roman Arkadyevich Abramovich") + assert n.first == "Roman" + assert n.middle == "Arkadyevich" + assert n.last == "Abramovich" + + def test_two_token_name(self) -> None: + # 2-token: middle_list is empty → condition fails + n = self.hn("Roman Abramovich") + assert n.first == "Roman" + assert n.last == "Abramovich" + + def test_no_patronymic(self) -> None: + # Three tokens but no patronymic suffix on last → not reordered + n = self.hn("Ivanov Ivan Petrov") + assert n.first == "Ivanov" + assert n.middle == "Ivan" + assert n.last == "Petrov" + + def test_western_name_unchanged(self) -> None: + n = self.hn("John Michael Smith") + assert n.first == "John" + assert n.middle == "Michael" + assert n.last == "Smith" + + def test_comma_guard_last_first_pat(self) -> None: + # "Ivanov, Ivan Ivanovich" — comma means the order was declared + n = self.hn("Ivanov, Ivan Ivanovich") + assert n.first == "Ivan" + assert n.middle == "Ivanovich" + assert 
n.last == "Ivanov" + + def test_comma_guard_patronymic_form_surname(self) -> None: + # Without the comma guard this would wrongly rotate + n = self.hn("Sergeevich, Ivan Petrov") + assert n.last == "Sergeevich" + +class PatronymicNameOrderFlagOffTests(HumanNameTestBase): + """With default Constants (flag=False) nothing changes.""" + + def test_canonical_unchanged(self) -> None: + n = HumanName("Ivanov Ivan Ivanovich") + assert n.first == "Ivanov" + assert n.middle == "Ivan" + assert n.last == "Ivanovich" + + +class PatronymicNameOrderFlagTests(HumanNameTestBase): + + def test_default_is_false(self) -> None: + C = Constants() + assert C.patronymic_name_order is False + + def test_can_set_true_via_constructor(self) -> None: + C = Constants(patronymic_name_order=True) + assert C.patronymic_name_order is True + + def test_does_not_affect_other_instance(self) -> None: + C1 = Constants(patronymic_name_order=True) + C2 = Constants() + assert C1.patronymic_name_order is True + assert C2.patronymic_name_order is False