diff --git a/docs/customize.rst b/docs/customize.rst index 9df179b..fb763ce 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -131,6 +131,41 @@ a secondary key:: sorted_names = sorted(names, key=lambda n: (n.last_base.lower(), n.first.lower())) +First-Name Prefixes +------------------- + +``CONSTANTS.first_name_prefixes`` controls bound given-name prefixes that attach +to the following word to form one first name. By default it contains +``{'abdul', 'abdel', 'abdal', 'abu', 'abou', 'umm'}``. + +Example:: + + >>> from nameparser import HumanName + >>> hn = HumanName("abdul salam ahmed salem") + >>> hn.first, hn.middle, hn.last + ('abdul salam', 'ahmed', 'salem') + +To **disable** the feature entirely:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.first_name_prefixes.clear() + +To **add** a word (e.g. if your data uses ``mohamad`` as a bound prefix):: + + >>> CONSTANTS.first_name_prefixes.add('mohamad') + +To **remove** a single entry:: + + >>> CONSTANTS.first_name_prefixes.remove('umm') + +You can also pass a custom set per ``Constants`` instance:: + + >>> from nameparser.config import Constants + >>> c = Constants(first_name_prefixes={'abu', 'umm'}) + >>> hn2 = HumanName("abu bakr al saud", constants=c) + >>> hn2.first, hn2.last + ('abu bakr', 'al saud') + Parser Customization Examples ----------------------------- @@ -181,7 +216,7 @@ constant so that "Hon" can be parsed as a first name. 
If you don't want to detect any titles at all, you can remove all of them: - >>> CONSTANTS.titles.remove(*CONSTANTS.titles) + >>> CONSTANTS.titles.clear() Adding a Title diff --git a/docs/release_log.rst b/docs/release_log.rst index ca9104d..381d9ed 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -20,6 +20,12 @@ Release Log - Fix ``'apn aprn'`` split into separate ``suffix_acronyms`` entries so each is recognized independently (closes #155) - Add ``last_base``, ``last_prefixes`` (and ``_list`` variants) plus ``family`` / ``family_prefixes`` aliases for splitting last-name prefix particles (tussenvoegsels) from the core surname (#130, #132) - Add ``patronymic_name_order`` flag to ``Constants`` and ``HumanName`` for opt-in detection and reordering of Russian formal-order names (Surname GivenName Patronymic) (#85) + - Add ``first_name_prefixes`` set to ``Constants``; bound Arabic given-name + prefixes (``abdul``, ``abu``, etc.) now join forward to form a single first + name (e.g. ``"abdul salam ahmed salem"`` → ``first="abdul salam"``, + ``middle="ahmed"``, ``last="salem"``). Disable via + ``CONSTANTS.first_name_prefixes.clear()``. **Default-on: changes parsing + output for names with these prefixes.** (#150) * 1.2.1 - June 19, 2026 - Fix ``initials()`` interpolating the literal ``None`` for empty name parts when ``empty_attribute_default = None`` (e.g. ``"J. 
None D."``); empty parts now render as an empty string and a fully-empty result returns ``empty_attribute_default`` - Add ``python -m nameparser "Name String"`` command-line helper that prints a parsed name diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 4d7bae4..c143299 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -37,6 +37,7 @@ from nameparser.util import lc from nameparser.config.prefixes import PREFIXES +from nameparser.config.first_name_prefixes import FIRST_NAME_PREFIXES from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS from nameparser.config.conjunctions import CONJUNCTIONS from nameparser.config.suffixes import SUFFIX_ACRONYMS @@ -86,7 +87,7 @@ def __len__(self) -> int: def add_with_encoding(self, s: str, encoding: str | None = None) -> None: """ - Add the lower case and no-period version of the string to the set. Pass an + Add the lowercased, leading/trailing-periods-stripped version of the string to the set. Pass an explicit `encoding` parameter to specify the encoding of binary strings that are not DEFAULT_ENCODING (UTF-8). """ @@ -96,13 +97,15 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None: encoding = encoding or stdin_encoding or DEFAULT_ENCODING if isinstance(s, bytes): s = s.decode(encoding) - self.elements.add(lc(s)) - if self._on_change: - self._on_change() + normalized = lc(s) + if normalized not in self.elements: + self.elements.add(normalized) + if self._on_change: + self._on_change() def add(self, *strings: str) -> Self: """ - Add the lower case and no-period version of the string arguments to the set. + Add the lowercased, leading/trailing-periods-stripped version of the string arguments to the set. Can pass a list of strings. Returns ``self`` for chaining. 
""" for s in strings: @@ -124,6 +127,14 @@ def remove(self, *strings: str) -> Self: self._on_change() return self + def clear(self) -> Self: + """Remove all entries from the set. Returns ``self`` for chaining.""" + if self.elements: + self.elements.clear() + if self._on_change: + self._on_change() + return self + T = TypeVar('T') @@ -227,8 +238,10 @@ class Constants: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set conjunctions: + :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. + :param set first_name_prefixes: + :py:attr:`~first_name_prefixes.FIRST_NAME_PREFIXES` wrapped with :py:class:`SetManager`. :type capitalization_exceptions: tuple or dict :param capitalization_exceptions: :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`. @@ -243,6 +256,7 @@ class Constants: titles = _CachedUnionMember() first_name_titles: SetManager conjunctions: SetManager + first_name_prefixes: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager _pst: Set[str] | None @@ -377,6 +391,7 @@ def __init__(self, titles: Iterable[str] = TITLES, first_name_titles: Iterable[str] = FIRST_NAME_TITLES, conjunctions: Iterable[str] = CONJUNCTIONS, + first_name_prefixes: Iterable[str] = FIRST_NAME_PREFIXES, capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS, regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES, patronymic_name_order: bool = False, @@ -390,6 +405,7 @@ def __init__(self, self.titles = SetManager(titles) self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) + self.first_name_prefixes = SetManager(first_name_prefixes) self.capitalization_exceptions = 
def is_first_name_prefix(self, piece: str) -> bool:
    """Return True when *piece* is a configured first-name prefix.

    The piece is normalized with ``lc`` before the lookup, then tested for
    membership in ``self.C.first_name_prefixes``.

    :param piece: a single name piece.
    :return: True if the normalized piece is in the first-name-prefix set.
    """
    return lc(piece) in self.C.first_name_prefixes


def _join_first_name_prefix(self, pieces: list[str], reserve_last: bool) -> list[str]:
    """Merge a leading first-name prefix with the piece that follows it.

    Only the first piece that is not a title is examined. When that piece
    is a first-name prefix, it is joined (space-separated) with the next
    piece so the pair is handled as one given name.

    No join is performed when any of the following holds:

    * every piece is a title, or the first non-title piece is not a
      first-name prefix;
    * the prefix is the final piece;
    * the piece after the prefix is a suffix -- joining would swallow it
      into the first name (e.g. pieces ``['abdul', 'jr']``);
    * ``reserve_last`` is true and fewer than two non-suffix pieces follow
      the prefix, so nothing would be left over for the last name.

    :param pieces: name pieces in order. The list is never modified.
    :param reserve_last: True when the last name must also come out of
        ``pieces``; False when it is supplied separately by the caller.
    :return: a new list with the pair merged, or ``pieces`` itself when no
        join applies.
    """
    first_idx = next(
        (i for i, piece in enumerate(pieces) if not self.is_title(piece)),
        None,
    )
    if first_idx is None or not self.is_first_name_prefix(pieces[first_idx]):
        return pieces

    following = pieces[first_idx + 1:]
    if not following:
        return pieces

    target = following[0]
    # A suffix is never the stem of a given name; leave it for suffix handling.
    if self.is_suffix(target):
        return pieces

    if reserve_last:
        # Need the join target plus at least one more non-suffix piece so a
        # last name still exists after the merge.
        non_suffix_following = sum(
            1 for piece in following if not self.is_suffix(piece)
        )
        if non_suffix_following < 2:
            return pieces

    # Build a fresh list rather than editing the caller's list in place.
    joined = f"{pieces[first_idx]} {target}"
    return [*pieces[:first_idx], joined, *following[1:]]
def lc(value: str) -> str:
    """Normalize *value* for case-insensitive comparison.

    The string is lowercased and any periods at its start or end are
    stripped; periods inside the string are kept. A falsy *value* (such as
    ``None`` or ``''``) yields the empty string.
    """
    return value.lower().strip('.') if value else ''
def test_is_first_name_prefix_false(self) -> None: + hn = HumanName("test") + assert not hn.is_first_name_prefix("Ahmed") + + # --- no-comma: basic joining --- + def test_no_comma_basic_join(self) -> None: + hn = HumanName("abdul salam ahmed salem") + self.m(hn.first, "abdul salam", hn) + self.m(hn.middle, "ahmed", hn) + self.m(hn.last, "salem", hn) + + def test_no_comma_three_tokens_no_middle(self) -> None: + hn = HumanName("abdul salam salem") + self.m(hn.first, "abdul salam", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "salem", hn) + + def test_no_comma_guard_two_tokens_no_join(self) -> None: + """Guard: only last name remains after prefix → no join.""" + hn = HumanName("abdul salam") + self.m(hn.first, "abdul", hn) + self.m(hn.last, "salam", hn) + + def test_no_comma_guard_suffix_not_swallowed(self) -> None: + """Guard: prefix + one name + suffix — suffix must not become last.""" + hn = HumanName("abdul salam jr") + self.m(hn.first, "abdul", hn) + self.m(hn.last, "salam", hn) + self.m(hn.suffix, "jr", hn) + + # --- lastname-comma path --- + def test_lastname_comma_join(self) -> None: + hn = HumanName("salem, abdul salam") + self.m(hn.first, "abdul salam", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "salem", hn) + + def test_lastname_comma_join_with_middle(self) -> None: + hn = HumanName("salem, abdul salam ahmed") + self.m(hn.first, "abdul salam", hn) + self.m(hn.middle, "ahmed", hn) + self.m(hn.last, "salem", hn) + + # --- interaction with titles --- + def test_title_kept_prefix_joins(self) -> None: + hn = HumanName("Dr. 
abdul salam ahmed salem") + self.m(hn.title, "Dr.", hn) + self.m(hn.first, "abdul salam", hn) + self.m(hn.middle, "ahmed", hn) + self.m(hn.last, "salem", hn) + + # --- interaction with last-name prefixes --- + def test_abu_bakr_al_baghdadi(self) -> None: + """abu joins forward as first-prefix; al joins forward as last-prefix.""" + hn = HumanName("abu bakr al baghdadi") + self.m(hn.first, "abu bakr", hn) + self.m(hn.last, "al baghdadi", hn) + + # --- interaction with suffixes --- + def test_suffix_kept_prefix_joins(self) -> None: + hn = HumanName("abdul salam ahmed salem jr") + self.m(hn.first, "abdul salam", hn) + self.m(hn.middle, "ahmed", hn) + self.m(hn.last, "salem", hn) + self.m(hn.suffix, "jr", hn) + + # --- guard / no-op --- + def test_mohamad_unchanged(self) -> None: + """mohamad is deliberately not in first_name_prefixes.""" + hn = HumanName("Mohamad Ali Khalil") + self.m(hn.first, "Mohamad", hn) + self.m(hn.middle, "Ali", hn) + self.m(hn.last, "Khalil", hn) + + def test_single_token_already_joined_unchanged(self) -> None: + """abdulsalam is one token — not in the set, no join.""" + hn = HumanName("abdulsalam ahmed salem") + self.m(hn.first, "abdulsalam", hn) + self.m(hn.middle, "ahmed", hn) + self.m(hn.last, "salem", hn) + + def test_prefix_alone_no_join(self) -> None: + """Single-word name that is a prefix: nothing to join.""" + hn = HumanName("abdul") + self.m(hn.first, "abdul", hn) + + def test_lastname_comma_prefix_only_no_join(self) -> None: + """Prefix as sole post-comma token: nothing to join.""" + hn = HumanName("salem, abdul") + self.m(hn.first, "abdul", hn) + self.m(hn.last, "salem", hn) + + def test_mid_name_prefix_becomes_last_prefix(self) -> None: + """abu in non-first position is handled as a last-name prefix, not first-name.""" + hn = HumanName("ahmed abu bakr") + self.m(hn.first, "ahmed", hn) + self.m(hn.last, "abu bakr", hn) + + # --- opt-out --- + def test_opt_out_via_clear(self) -> None: + """Clearing first_name_prefixes restores prior 
behavior.""" + from nameparser.config import Constants + c = Constants(first_name_prefixes=set()) + hn = HumanName("abdul salam ahmed salem", constants=c) + self.m(hn.first, "abdul", hn) + self.m(hn.middle, "salam ahmed", hn) + self.m(hn.last, "salem", hn) +