diff --git a/docs/customize.rst b/docs/customize.rst index 87881c0..2a14314 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -63,6 +63,43 @@ Other editable attributes * :py:obj:`~nameparser.config.Constants.initials_separator` - string placed between consecutive initials within the same name group (after the delimiter). Defaults to ``" "``, so ``"A. K."``; set to ``""`` for compact ``"A.K."``. +Splitting last-name prefix particles +------------------------------------- + +The :py:attr:`~nameparser.parser.HumanName.last_base` and +:py:attr:`~nameparser.parser.HumanName.last_prefixes` properties split the last +name at the boundary between leading prefix particles and the core surname. They +use the same ``PREFIXES`` set, so adding a particle makes the split pick it up +automatically:: + + >>> from nameparser import HumanName + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.prefixes.add('op') + >>> HumanName("Jan op den Berg").last_base + 'Berg' + >>> HumanName("Jan op den Berg").last_prefixes + 'op den' + >>> CONSTANTS.prefixes.remove('op') + +Note the ``remove`` call at the end — ``customize.rst`` examples share global +``CONSTANTS``, so mutations must be reversed to avoid affecting later examples. 
+ +Because ``last_base`` is a plain string property, sorting a list of names by +core surname (ignoring prefix particles like *van*, *de la*) is just a key +function:: + + names = [ + HumanName("Vincent van Gogh"), + HumanName("Juan de la Vega"), + HumanName("John Smith"), + ] + sorted_names = sorted(names, key=lambda n: n.last_base.lower()) + # sort keys: 'gogh', 'smith', 'vega' → van Gogh, Smith, de la Vega + +To sort by first name when two people share the same ``last_base``, add it as +a secondary key:: + + sorted_names = sorted(names, key=lambda n: (n.last_base.lower(), n.first.lower())) Parser Customization Examples ----------------------------- diff --git a/docs/release_log.rst b/docs/release_log.rst index 4720d32..ed5a5f4 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -18,6 +18,7 @@ Release Log - Fix spurious leading space in surnames and empty token in suffix list after ``capitalize()`` with an empty middle or suffix (#164) - Fix extra whitespace before punctuation in ``str()`` output when a ``string_format`` field is empty (closes #139) - Fix ``'apn aprn'`` split into separate ``suffix_acronyms`` entries so each is recognized independently (closes #155) + - Add ``last_base``, ``last_prefixes`` (and ``_list`` variants) plus ``family`` / ``family_prefixes`` aliases for splitting last-name prefix particles (tussenvoegsels) from the core surname (#130, #132) * 1.2.1 - June 19, 2026 - Fix ``initials()`` interpolating the literal ``None`` for empty name parts when ``empty_attribute_default = None`` (e.g. ``"J. None D."``); empty parts now render as an empty string and a fully-empty result returns ``empty_attribute_default`` - Add ``python -m nameparser "Name String"`` command-line helper that prints a parsed name diff --git a/docs/usage.rst b/docs/usage.rst index f744dea..281a803 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -21,6 +21,12 @@ Requires Python 3.10+. 'Q. 
def _split_last(self) -> tuple[list[str], list[str]]:
    """Return ``(prefix_particles, base_words)`` split from the last name.

    Leading words for which ``self.is_prefix`` is true form the prefix
    list; everything from the first non-prefix word onward is the base.

    The base list is empty only when the last name itself is empty. When
    every word of a non-empty last name matches a prefix particle, nothing
    is stripped: all words are returned as the base with an empty prefix
    list (heuristic: a family name is assumed not to consist entirely of
    particles).

    >>> HumanName("Vincent van Gogh")._split_last()
    (['van'], ['Gogh'])
    >>> HumanName("Anh Do")._split_last()
    ([], ['Do'])
    >>> HumanName()._split_last()
    ([], [])
    """
    # Join-then-split flattens any last_list entry that itself contains
    # whitespace, so every word is tested individually.
    words = " ".join(self.last_list).split()
    # Index of the first word that is not a prefix particle. The default
    # of 0 is used when no such word exists, which covers both the empty
    # last name and the all-particles case: words[:0] is [] and words[0:]
    # is the whole last name, i.e. "don't strip anything".
    split_at = next(
        (i for i, word in enumerate(words) if not self.is_prefix(word)),
        0,
    )
    return words[:split_at], words[split_at:]

@property
def last_prefixes_list(self) -> list[str]:
    """
    List of leading prefix particles in the last name (the *tussenvoegsel*).
    Returns ``[]`` when there are none, including the case where every word
    in the last name matches a prefix -- see :py:meth:`_split_last`.

    >>> HumanName("Juan de la Vega").last_prefixes_list
    ['de', 'la']
    """
    return self._split_last()[0]

@property
def last_base_list(self) -> list[str]:
    """
    List of last-name words after stripping leading prefix particles.
    Empty only when the last name itself is empty. When every word matches
    a prefix, no stripping occurs and the full last name is returned -- see
    :py:meth:`_split_last`.

    >>> HumanName("Vincent van Gogh").last_base_list
    ['Gogh']
    >>> HumanName().last_base_list
    []
    """
    return self._split_last()[1]

@property
def last_base(self) -> str:
    """
    The last name with leading prefix particles removed (the core surname).
    For ``"van Gogh"`` this is ``"Gogh"``; for ``"Smith"`` it is ``"Smith"``.
    ``last`` is always unchanged. When every word in the last name matches a
    prefix particle, no stripping occurs and the full last name is returned.
    Falls back to ``empty_attribute_default`` when the last name is empty.

    >>> HumanName("Vincent van Gogh").last_base
    'Gogh'
    >>> HumanName("John Smith").last_base
    'Smith'
    """
    return " ".join(self.last_base_list) or self.C.empty_attribute_default

@property
def last_prefixes(self) -> str:
    """
    The leading prefix particle(s) of the last name (the *tussenvoegsel*).
    Falls back to ``empty_attribute_default`` when there are none,
    including when every word in the last name matches a prefix particle
    (the all-particles guard; see :py:meth:`_split_last`).

    >>> HumanName("Vincent van Gogh").last_prefixes
    'van'
    >>> HumanName("Juan de la Vega").last_prefixes
    'de la'
    """
    return " ".join(self.last_prefixes_list) or self.C.empty_attribute_default

@property
def family(self) -> str:
    """Alias for :py:attr:`last_base`."""
    return self.last_base

@property
def family_prefixes(self) -> str:
    """Alias for :py:attr:`last_prefixes`."""
    return self.last_prefixes
hn) + self.m(hn.last_prefixes, "", hn) + # self.m() coerces [] via `expected or empty_attribute_default`; use assertEqual for empty lists + self.assertEqual(hn.last_prefixes_list, []) + + def test_do_guard_surname_equals_prefix_word(self) -> None: + # "Do" is in PREFIXES; without the guard last_base would be empty + hn = HumanName("Anh Do") + self.m(hn.last_base, "Do", hn) + self.m(hn.last_prefixes, "", hn) + + def test_all_particles_guard(self) -> None: + # Artificial case: last name whose every word is a prefix — must not strip + hn = HumanName("Smith van der") + # last="van der"; both words are prefixes — guard fires, base = full last + self.m(hn.last_base, hn.last, hn) + self.m(hn.last_prefixes, "", hn) + + def test_alias_family_equals_last_base(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.family, hn.last_base, hn) + + def test_alias_family_prefixes_equals_last_prefixes(self) -> None: + hn = HumanName("Vincent van Gogh") + self.m(hn.family_prefixes, hn.last_prefixes, hn) + + def test_da_silva_title_plus_prefix(self) -> None: + hn = HumanName("Dra. Andréia da Silva") + self.m(hn.last_base, "Silva", hn) + self.m(hn.last_prefixes, "da", hn) + + def test_empty_name(self) -> None: + hn = HumanName() + self.m(hn.last_base, "", hn) + self.m(hn.last_prefixes, "", hn) + # self.m() coerces [] via `expected or empty_attribute_default`; use assertEqual for empty lists + self.assertEqual(hn.last_base_list, []) + self.assertEqual(hn.last_prefixes_list, []) + + def test_case_insensitive_prefix_detection(self) -> None: + hn = HumanName("VINCENT VAN GOGH") + self.m(hn.last_prefixes, "VAN", hn) + self.m(hn.last_base, "GOGH", hn)