Skip to content
Merged
37 changes: 36 additions & 1 deletion docs/customize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,41 @@ a secondary key::

sorted_names = sorted(names, key=lambda n: (n.last_base.lower(), n.first.lower()))

First-Name Prefixes
-------------------

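``CONSTANTS.first_name_prefixes`` lists the bound given-name prefixes that attach
to the following word to form one first name. By default it contains
``{'abdul', 'abdel', 'abdal', 'abu', 'abou', 'umm'}``. In a name written without
a comma, a prefix is joined only when at least one further word remains for the
last name, so ``"abdul salam"`` still parses as first name ``abdul`` and last
name ``salam``.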

Example::

>>> from nameparser import HumanName
>>> hn = HumanName("abdul salam ahmed salem")
>>> hn.first, hn.middle, hn.last
('abdul salam', 'ahmed', 'salem')

To **disable** the feature entirely::

>>> from nameparser.config import CONSTANTS
>>> CONSTANTS.first_name_prefixes.clear()

To **add** a word (e.g. if your data uses ``mohamad`` as a bound prefix)::

>>> CONSTANTS.first_name_prefixes.add('mohamad')

To **remove** a single entry::

>>> CONSTANTS.first_name_prefixes.remove('umm')

You can also pass a custom set per ``Constants`` instance::

>>> from nameparser.config import Constants
>>> c = Constants(first_name_prefixes={'abu', 'umm'})
>>> hn2 = HumanName("abu bakr al saud", constants=c)
>>> hn2.first, hn2.last
('abu bakr', 'al saud')

Parser Customization Examples
-----------------------------

Expand Down Expand Up @@ -181,7 +216,7 @@ constant so that "Hon" can be parsed as a first name.

If you don't want to detect any titles at all, you can remove all of them:

>>> CONSTANTS.titles.remove(*CONSTANTS.titles)
>>> CONSTANTS.titles.clear()


Adding a Title
Expand Down
6 changes: 6 additions & 0 deletions docs/release_log.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ Release Log
- Fix ``'apn aprn'`` split into separate ``suffix_acronyms`` entries so each is recognized independently (closes #155)
- Add ``last_base``, ``last_prefixes`` (and ``_list`` variants) plus ``family`` / ``family_prefixes`` aliases for splitting last-name prefix particles (tussenvoegsels) from the core surname (#130, #132)
- Add ``patronymic_name_order`` flag to ``Constants`` and ``HumanName`` for opt-in detection and reordering of Russian formal-order names (Surname GivenName Patronymic) (#85)
- Add ``first_name_prefixes`` set to ``Constants``; bound Arabic given-name
prefixes (``abdul``, ``abu``, etc.) now join forward to form a single first
name (e.g. ``"abdul salam ahmed salem"`` → ``first="abdul salam"``,
``middle="ahmed"``, ``last="salem"``). Disable via
``CONSTANTS.first_name_prefixes.clear()``. **Default-on: changes parsing
output for names with these prefixes.** (#150)
* 1.2.1 - June 19, 2026
- Fix ``initials()`` interpolating the literal ``None`` for empty name parts when ``empty_attribute_default = None`` (e.g. ``"J. None D."``); empty parts now render as an empty string and a fully-empty result returns ``empty_attribute_default``
- Add ``python -m nameparser "Name String"`` command-line helper that prints a parsed name
Expand Down
28 changes: 22 additions & 6 deletions nameparser/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

from nameparser.util import lc
from nameparser.config.prefixes import PREFIXES
from nameparser.config.first_name_prefixes import FIRST_NAME_PREFIXES
from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS
from nameparser.config.conjunctions import CONJUNCTIONS
from nameparser.config.suffixes import SUFFIX_ACRONYMS
Expand Down Expand Up @@ -86,7 +87,7 @@ def __len__(self) -> int:

def add_with_encoding(self, s: str, encoding: str | None = None) -> None:
"""
Add the lower case and no-period version of the string to the set. Pass an
Add the lowercased, leading/trailing-periods-stripped version of the string to the set. Pass an
explicit `encoding` parameter to specify the encoding of binary strings that
are not DEFAULT_ENCODING (UTF-8).
"""
Expand All @@ -96,13 +97,15 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None:
encoding = encoding or stdin_encoding or DEFAULT_ENCODING
if isinstance(s, bytes):
s = s.decode(encoding)
self.elements.add(lc(s))
if self._on_change:
self._on_change()
normalized = lc(s)
if normalized not in self.elements:
self.elements.add(normalized)
if self._on_change:
self._on_change()

def add(self, *strings: str) -> Self:
"""
Add the lower case and no-period version of the string arguments to the set.
Add the lowercased, leading/trailing-periods-stripped version of the string arguments to the set.
Can pass a list of strings. Returns ``self`` for chaining.
"""
for s in strings:
Expand All @@ -124,6 +127,14 @@ def remove(self, *strings: str) -> Self:
self._on_change()
return self

def clear(self) -> Self:
"""Remove all entries from the set. Returns ``self`` for chaining."""
if self.elements:
self.elements.clear()
if self._on_change:
self._on_change()
return self


T = TypeVar('T')

Expand Down Expand Up @@ -227,8 +238,10 @@ class Constants:
:py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`.
:param set suffix_not_acronyms:
:py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`.
:param set conjunctions:
:param set conjunctions:
:py:attr:`conjunctions` wrapped with :py:class:`SetManager`.
:param set first_name_prefixes:
:py:attr:`~first_name_prefixes.FIRST_NAME_PREFIXES` wrapped with :py:class:`SetManager`.
:type capitalization_exceptions: tuple or dict
:param capitalization_exceptions:
:py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`.
Expand All @@ -243,6 +256,7 @@ class Constants:
titles = _CachedUnionMember()
first_name_titles: SetManager
conjunctions: SetManager
first_name_prefixes: SetManager
capitalization_exceptions: TupleManager[str]
regexes: RegexTupleManager
_pst: Set[str] | None
Expand Down Expand Up @@ -377,6 +391,7 @@ def __init__(self,
titles: Iterable[str] = TITLES,
first_name_titles: Iterable[str] = FIRST_NAME_TITLES,
conjunctions: Iterable[str] = CONJUNCTIONS,
first_name_prefixes: Iterable[str] = FIRST_NAME_PREFIXES,
capitalization_exceptions: TupleManager[str] | Iterable[tuple[str, str]] = CAPITALIZATION_EXCEPTIONS,
regexes: RegexTupleManager | TupleManager[re.Pattern[str]] | Iterable[tuple[str, re.Pattern[str]]] = REGEXES,
patronymic_name_order: bool = False,
Expand All @@ -390,6 +405,7 @@ def __init__(self,
self.titles = SetManager(titles)
self.first_name_titles = SetManager(first_name_titles)
self.conjunctions = SetManager(conjunctions)
self.first_name_prefixes = SetManager(first_name_prefixes)
self.capitalization_exceptions = TupleManager(capitalization_exceptions)
self.regexes = RegexTupleManager(regexes)
self.patronymic_name_order = patronymic_name_order
Expand Down
12 changes: 12 additions & 0 deletions nameparser/config/first_name_prefixes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#: Arabic given-name particles that never stand alone: each one binds to the
#: word after it, and the pair is treated as a single first name (for example
#: "abdul salam" → first name "abdul salam"). The joining happens only in the
#: given-name region, which parallels how
#: :py:data:`~nameparser.config.prefixes.PREFIXES` works for last names.
FIRST_NAME_PREFIXES: set[str] = {
    'abdal',
    'abdel',
    'abdul',
    'abou',
    'abu',
    'umm',
}
35 changes: 34 additions & 1 deletion nameparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ def is_conjunction(self, piece: str) -> bool:

def is_prefix(self, piece: str) -> bool:
"""
Lowercase and no periods version of piece is in the
Lowercased, leading/trailing-periods-stripped version of piece is in the
:py:data:`~nameparser.config.prefixes.PREFIXES` set.
"""
if isinstance(piece, list):
Expand All @@ -561,6 +561,37 @@ def is_prefix(self, piece: str) -> bool:
else:
return lc(piece) in self.C.prefixes

def is_first_name_prefix(self, piece: str) -> bool:
    """
    Report whether ``piece`` is a bound first-name prefix.

    The piece is normalized with ``lc`` (lowercased, leading/trailing
    periods stripped) and looked up in
    :py:attr:`~nameparser.config.Constants.first_name_prefixes`.
    """
    normalized = lc(piece)
    return normalized in self.C.first_name_prefixes

def _join_first_name_prefix(self, pieces: list[str], reserve_last: bool) -> list[str]:
    """Join a first-name prefix to its following piece.

    Finds the first non-title piece; if it is in ``first_name_prefixes``,
    merges it with the next piece into one space-separated piece. The
    merge is skipped when any of the following holds:

    * every piece is a title, or the first non-title piece is not a
      first-name prefix;
    * there is no following piece to join;
    * the following piece is itself a suffix, so that e.g. the post-comma
      pieces ``["abdul", "jr"]`` keep ``jr`` available as a suffix instead
      of producing the first name ``"abdul jr"``;
    * ``reserve_last`` is True and fewer than two non-suffix pieces follow
      the prefix, i.e. joining would leave nothing for the last name.

    :param pieces: name pieces to inspect; modified in place when a join
        happens.
    :param bool reserve_last: True when the last name must still be taken
        from ``pieces`` after the join; False when the last name is held
        elsewhere and every remaining piece may be consumed.
    :return: the same ``pieces`` list object, joined or untouched.
    """
    fi = next((i for i, p in enumerate(pieces) if not self.is_title(p)), None)
    if fi is None or not self.is_first_name_prefix(pieces[fi]):
        return pieces
    next_i = fi + 1
    # Nothing to join, or the join target is a suffix that must stay a suffix.
    if next_i >= len(pieces) or self.is_suffix(pieces[next_i]):
        return pieces
    if reserve_last:
        # Count non-suffix pieces from next_i onward; need >= 2 so the join
        # target and at least one last-name piece both exist.
        non_suffix_remaining = sum(
            1 for p in pieces[next_i:] if not self.is_suffix(p)
        )
        if non_suffix_remaining <= 1:
            return pieces
    # Replace the two pieces with their joined form, keeping the list object.
    pieces[fi:next_i + 1] = [f"{pieces[fi]} {pieces[next_i]}"]
    return pieces

def is_roman_numeral(self, value: str) -> bool:
"""
Matches the ``roman_numeral`` regular expression in
Expand Down Expand Up @@ -827,6 +858,7 @@ def parse_full_name(self) -> None:
# part[0]

pieces = self.parse_pieces(parts)
pieces = self._join_first_name_prefix(pieces, reserve_last=True)
p_len = len(pieces)
for i, piece in enumerate(pieces):
try:
Expand Down Expand Up @@ -909,6 +941,7 @@ def parse_full_name(self) -> None:
# parts[0], parts[1], parts[2:...]

log.debug("post-comma pieces: %s", str(post_comma_pieces))
post_comma_pieces = self._join_first_name_prefix(post_comma_pieces, reserve_last=False)

# lastname part may have suffixes in it
lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
Expand Down
2 changes: 1 addition & 1 deletion nameparser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def lc(value: str) -> str:
    """Lowercase and strip leading/trailing periods to normalize for comparison.

    Periods inside the string are kept; only those at either end are
    removed. Any falsy ``value`` (empty string, ``None``) yields ``''``.
    """
    return value.lower().strip('.') if value else ''
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
"titles",
"first_name_titles",
"conjunctions",
"first_name_prefixes",
"capitalization_exceptions",
"regexes",
)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def test_chain_multiple_arguments(self) -> None:
self.m(hn.middle, "Hon", hn)
self.m(hn.last, "Solo", hn)

def test_clear_removes_all_entries(self) -> None:
    """With the title set cleared, "Ms" is parsed as the first name, not a title."""
    # NOTE(review): constants=None presumably gives this instance its own
    # Constants so the clear() below does not leak into other tests — confirm.
    hn = HumanName("Ms Hon Solo", constants=None)
    hn.C.titles.clear()
    # Parse again now that the title set is empty.
    hn.parse_full_name()
    self.m(hn.first, "Ms", hn)
    self.m(hn.middle, "Hon", hn)
    self.m(hn.last, "Solo", hn)

def test_empty_attribute_default(self) -> None:
from nameparser.config import CONSTANTS
_orig = CONSTANTS.empty_attribute_default
Expand Down
118 changes: 118 additions & 0 deletions tests/test_first_name_prefixes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from nameparser import HumanName
from tests.base import HumanNameTestBase


class FirstNamePrefixesTestCase(HumanNameTestBase):
    """Parsing tests for bound first-name prefixes (``first_name_prefixes``).

    Covers the membership check, joining in the no-comma and lastname-comma
    formats, the guards that skip a join, interaction with titles, last-name
    prefixes and suffixes, and opting out with an empty prefix set.
    """

    def test_is_first_name_prefix_true(self) -> None:
        """A default prefix is recognized regardless of case."""
        hn = HumanName("test")
        assert hn.is_first_name_prefix("Abdul")

    def test_is_first_name_prefix_false(self) -> None:
        """An ordinary given name is not reported as a prefix."""
        hn = HumanName("test")
        assert not hn.is_first_name_prefix("Ahmed")

    # --- no-comma: basic joining ---
    def test_no_comma_basic_join(self) -> None:
        """Prefix joins the next word; the rest split into middle and last."""
        hn = HumanName("abdul salam ahmed salem")
        self.m(hn.first, "abdul salam", hn)
        self.m(hn.middle, "ahmed", hn)
        self.m(hn.last, "salem", hn)

    def test_no_comma_three_tokens_no_middle(self) -> None:
        """Three tokens: joined first name plus last name, empty middle."""
        hn = HumanName("abdul salam salem")
        self.m(hn.first, "abdul salam", hn)
        self.m(hn.middle, "", hn)
        self.m(hn.last, "salem", hn)

    def test_no_comma_guard_two_tokens_no_join(self) -> None:
        """Guard: only last name remains after prefix → no join."""
        hn = HumanName("abdul salam")
        self.m(hn.first, "abdul", hn)
        self.m(hn.last, "salam", hn)

    def test_no_comma_guard_suffix_not_swallowed(self) -> None:
        """Guard: prefix + one name + suffix — suffix must not become last."""
        hn = HumanName("abdul salam jr")
        self.m(hn.first, "abdul", hn)
        self.m(hn.last, "salam", hn)
        self.m(hn.suffix, "jr", hn)

    # --- lastname-comma path ---
    def test_lastname_comma_join(self) -> None:
        """Lastname-comma format: both post-comma words join into the first name."""
        hn = HumanName("salem, abdul salam")
        self.m(hn.first, "abdul salam", hn)
        self.m(hn.middle, "", hn)
        self.m(hn.last, "salem", hn)

    def test_lastname_comma_join_with_middle(self) -> None:
        """Lastname-comma format: the word after the joined pair is the middle name."""
        hn = HumanName("salem, abdul salam ahmed")
        self.m(hn.first, "abdul salam", hn)
        self.m(hn.middle, "ahmed", hn)
        self.m(hn.last, "salem", hn)

    # --- interaction with titles ---
    def test_title_kept_prefix_joins(self) -> None:
        """A leading title is kept as the title and the prefix after it still joins."""
        hn = HumanName("Dr. abdul salam ahmed salem")
        self.m(hn.title, "Dr.", hn)
        self.m(hn.first, "abdul salam", hn)
        self.m(hn.middle, "ahmed", hn)
        self.m(hn.last, "salem", hn)

    # --- interaction with last-name prefixes ---
    def test_abu_bakr_al_baghdadi(self) -> None:
        """abu joins forward as first-prefix; al joins forward as last-prefix."""
        hn = HumanName("abu bakr al baghdadi")
        self.m(hn.first, "abu bakr", hn)
        self.m(hn.last, "al baghdadi", hn)

    # --- interaction with suffixes ---
    def test_suffix_kept_prefix_joins(self) -> None:
        """A trailing suffix stays a suffix while the prefix joins as usual."""
        hn = HumanName("abdul salam ahmed salem jr")
        self.m(hn.first, "abdul salam", hn)
        self.m(hn.middle, "ahmed", hn)
        self.m(hn.last, "salem", hn)
        self.m(hn.suffix, "jr", hn)

    # --- guard / no-op ---
    def test_mohamad_unchanged(self) -> None:
        """mohamad is deliberately not in first_name_prefixes."""
        hn = HumanName("Mohamad Ali Khalil")
        self.m(hn.first, "Mohamad", hn)
        self.m(hn.middle, "Ali", hn)
        self.m(hn.last, "Khalil", hn)

    def test_single_token_already_joined_unchanged(self) -> None:
        """abdulsalam is one token — not in the set, no join."""
        hn = HumanName("abdulsalam ahmed salem")
        self.m(hn.first, "abdulsalam", hn)
        self.m(hn.middle, "ahmed", hn)
        self.m(hn.last, "salem", hn)

    def test_prefix_alone_no_join(self) -> None:
        """Single-word name that is a prefix: nothing to join."""
        hn = HumanName("abdul")
        self.m(hn.first, "abdul", hn)

    def test_lastname_comma_prefix_only_no_join(self) -> None:
        """Prefix as sole post-comma token: nothing to join."""
        hn = HumanName("salem, abdul")
        self.m(hn.first, "abdul", hn)
        self.m(hn.last, "salem", hn)

    def test_mid_name_prefix_becomes_last_prefix(self) -> None:
        """abu in non-first position is handled as a last-name prefix, not first-name."""
        hn = HumanName("ahmed abu bakr")
        self.m(hn.first, "ahmed", hn)
        self.m(hn.last, "abu bakr", hn)

    # --- opt-out ---
    def test_opt_out_via_clear(self) -> None:
        """An empty first_name_prefixes set restores prior behavior (no join).

        NOTE(review): despite the test name, this builds ``Constants`` with an
        empty set and never calls ``clear()`` on a populated one.
        """
        from nameparser.config import Constants
        c = Constants(first_name_prefixes=set())
        hn = HumanName("abdul salam ahmed salem", constants=c)
        self.m(hn.first, "abdul", hn)
        self.m(hn.middle, "salam ahmed", hn)
        self.m(hn.last, "salem", hn)