From c19ce96167802bcdba6a7c8724e1226d705442be Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 13:40:16 -0700 Subject: [PATCH 1/3] fix: correct suffix boundary lookup for prefixed last names (#100) The prefix-joining loop located the suffix stop boundary with a value-based pieces.index() that searched from position 0. When a token value repeated (a trailing title that is also a suffix acronym, e.g. the second 'dr' in 'dr Vincent van Gogh dr'), it matched the leading occurrence, producing an empty slice that duplicated pieces and corrupted the middle name. Constrain the lookup to start at i + 1, consistent with the sibling next_prefix lookup. Co-Authored-By: Claude Sonnet 4.6 --- nameparser/parser.py | 5 ++++- tests/test_prefixes.py | 12 ++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 73cbd06..739aaba 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -980,7 +980,10 @@ def join_on_conjunctions(self, pieces: list[str], additional_parts_count: int = try: # if there are no more prefixes, look for a suffix to stop at stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) - j = pieces.index(stop_at) + # search from i + 1 so a repeated token earlier in the + # list (e.g. a leading title that is also a suffix + # acronym) is not matched instead of the trailing one + j = pieces.index(stop_at, i + 1) new_piece = ' '.join(pieces[i:j]) pieces = pieces[:i] + [new_piece] + pieces[j:] except StopIteration: diff --git a/tests/test_prefixes.py b/tests/test_prefixes.py index 134f962..cb3bbaa 100644 --- a/tests/test_prefixes.py +++ b/tests/test_prefixes.py @@ -43,6 +43,18 @@ def test_prefix_before_two_part_last_name_with_acronym_suffix(self) -> None: self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "M.D.", hn) + def test_title_before_and_after_prefixed_last_name(self) -> None: + # Issue #100: a repeated title/suffix token ("dr") before AND after a + # prefixed last name used to corrupt the middle name into + # " dr Vincent van" because the suffix-boundary lookup matched the + # LEADING "dr" instead of the trailing one. + hn = HumanName("dr Vincent van Gogh dr") + self.m(hn.title, "dr", hn) + self.m(hn.first, "Vincent", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "van Gogh", hn) + self.m(hn.suffix, "dr", hn) + def test_two_part_last_name_with_suffix_comma(self) -> None: hn = HumanName("pennie von bergen wessels, III") self.m(hn.first, "pennie", hn) From 7eb356db8e38ee65de10a562d31bc25cb78890f0 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 13:40:36 -0700 Subject: [PATCH 2/3] test: guard against #108 exponential blow-up on repeated prefixes Co-Authored-By: Claude Sonnet 4.6 --- tests/test_prefixes.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_prefixes.py b/tests/test_prefixes.py index cb3bbaa..198c48f 100644 --- a/tests/test_prefixes.py +++ b/tests/test_prefixes.py @@ -55,6 +55,16 @@ def test_title_before_and_after_prefixed_last_name(self) -> None: self.m(hn.last, "van Gogh", hn) self.m(hn.suffix, "dr", hn) + def test_many_repeated_prefixes_does_not_blow_up(self) -> None: + # Issue #108: a name with a long run of repeated prefixes used to grow + # the pieces list exponentially and exhaust memory. Guard against a + # regression: this must parse quickly and not raise. If an exponential + # code path is reintroduced, this test will hang (CI timeout catches it). + name = "Jan " + "van der " * 30 + "Berg" + hn = HumanName(name) + self.assertFalse(hn.unparsable) + self.m(hn.first, "Jan", hn) + def test_two_part_last_name_with_suffix_comma(self) -> None: hn = HumanName("pennie von bergen wessels, III") self.m(hn.first, "pennie", hn) From 2f4e126a1e58702cf1d9e43734f09f306f493449 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 29 Jun 2026 13:57:43 -0700 Subject: [PATCH 3/3] review: improve comment accuracy and test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix inline comment in join_on_conjunctions: clarify that filter() finds the value in pieces[i+1:] but index() searches from 0 by default, and drop the misleading "title" framing (the token only needs to satisfy is_suffix, not is_title) - Add test for two-word prefix collision ("van der") — different loop iteration count than the single-word case - Add test with a genuine middle name alongside the repeated token, since the pre-fix bug corrupted the middle field specifically - Add @pytest.mark.timeout(2) to the #108 guard so the timeout is enforced locally and in CI, not just by CI job limits - Assert hn.last contains "Berg" in the #108 guard to catch silent last-name corruption - Add pytest-timeout dev dependency - Resolve pre-existing stash conflict in docs/resources.rst (keep upstream) Co-Authored-By: Claude Sonnet 4.6 --- nameparser/parser.py | 8 +-- pyproject.toml | 3 +- tests/test_prefixes.py | 30 +++++++++-- uv.lock | 118 +++++++++++++++++++++++------------------ 4 files changed, 100 insertions(+), 59 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 739aaba..067b586 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -980,9 +980,11 @@ def join_on_conjunctions(self, pieces: list[str], additional_parts_count: int = try: # if there are no more prefixes, look for a suffix to stop at stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) - # search from i + 1 so a repeated token earlier in the - # list (e.g. a leading title that is also a suffix - # acronym) is not matched instead of the trailing one + # search from i + 1: filter() finds the value of stop_at + # in pieces[i+1:] but pieces.index() without a start + # argument searches from 0, so an earlier occurrence of + # the same token (e.g. a suffix token that also appears + # before the prefix) would be matched instead. j = pieces.index(stop_at, i + 1) new_piece = ' '.join(pieces[i:j]) pieces = pieces[:i] + [new_piece] + pieces[j:] diff --git a/pyproject.toml b/pyproject.toml index c8d7932..86da3e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dev = [ "dill (>=0.2.5)", "sphinx (>=8)", "mypy (>=2.1)", - "ruff (>=0.15)" + "ruff (>=0.15)", + "pytest-timeout>=2.4.0", ] [tool.mypy] diff --git a/tests/test_prefixes.py b/tests/test_prefixes.py index 198c48f..0d18449 100644 --- a/tests/test_prefixes.py +++ b/tests/test_prefixes.py @@ -1,3 +1,5 @@ +import pytest + from nameparser import HumanName from tests.base import HumanNameTestBase @@ -55,15 +57,37 @@ def test_title_before_and_after_prefixed_last_name(self) -> None: self.m(hn.last, "van Gogh", hn) self.m(hn.suffix, "dr", hn) + def test_suffix_token_collision_with_two_word_prefix(self) -> None: + # Same fix as #100 but with a two-word prefix ("van der"). Exercises a + # different iteration count through the prefix-joining loop. + hn = HumanName("dr Vincent van der Gogh dr") + self.m(hn.title, "dr", hn) + self.m(hn.first, "Vincent", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "van der Gogh", hn) + self.m(hn.suffix, "dr", hn) + + def test_title_before_and_after_prefixed_last_name_with_middle(self) -> None: + # The pre-fix bug corrupted the middle field; verify it is not disturbed + # when a genuine middle name is present alongside the repeated token. + hn = HumanName("dr Vincent James van Gogh dr") + self.m(hn.title, "dr", hn) + self.m(hn.first, "Vincent", hn) + self.m(hn.middle, "James", hn) + self.m(hn.last, "van Gogh", hn) + self.m(hn.suffix, "dr", hn) + + @pytest.mark.timeout(2) def test_many_repeated_prefixes_does_not_blow_up(self) -> None: # Issue #108: a name with a long run of repeated prefixes used to grow - # the pieces list exponentially and exhaust memory. Guard against a - # regression: this must parse quickly and not raise. If an exponential - # code path is reintroduced, this test will hang (CI timeout catches it). + # the pieces list exponentially and exhaust memory. The 2-second timeout + # enforces this locally and in CI — if the test hangs, an exponential + # regression has been reintroduced. name = "Jan " + "van der " * 30 + "Berg" hn = HumanName(name) self.assertFalse(hn.unparsable) self.m(hn.first, "Jan", hn) + self.assertIn("Berg", hn.last) def test_two_part_last_name_with_suffix_comma(self) -> None: hn = HumanName("pennie von bergen wessels, III") diff --git a/uv.lock b/uv.lock index c01beca..339eca6 100644 --- a/uv.lock +++ b/uv.lock @@ -229,7 +229,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ @@ -525,6 +525,7 @@ dev = [ { name = "dill" }, { name = "mypy" }, { name = "pytest" }, + { name = "pytest-timeout" }, { name = "ruff" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, @@ -539,6 +540,7 @@ dev = [ { name = "dill", specifier = ">=0.2.5" }, { name = "mypy", specifier = ">=2.1" }, { name = "pytest", specifier = ">=8" }, + { name = "pytest-timeout", specifier = ">=2.4.0" }, { name = "ruff", specifier = ">=0.15" }, { name = "sphinx", specifier = ">=8" }, ] @@ -597,6 +599,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8b/5a/ba30a81239b909821b3153e303e7def45178bf353da4f72380e6c5e8793b/pytest-9.1.0-py3-none-any.whl", hash = "sha256:8ebb0e7888bdf2bdfc602ec51f8f62d50200af37356c74e503c79a94f5c81f32", size = 386453, upload-time = "2026-06-13T18:52:44.045Z" }, ] +[[package]] +name = "pytest-timeout" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, +] + [[package]] name = "requests" version = "2.34.2" @@ -663,23 +677,23 @@ resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ - { name = "alabaster", marker = "python_full_version < '3.11'" }, - { name = "babel", marker = "python_full_version < '3.11'" }, - { name = "colorama", marker = "python_full_version < '3.11' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "imagesize", marker = "python_full_version < '3.11'" }, - { name = "jinja2", marker = "python_full_version < '3.11'" }, - { name = "packaging", marker = "python_full_version < '3.11'" }, - { name = "pygments", marker = "python_full_version < '3.11'" }, - { name = "requests", marker = "python_full_version < '3.11'" }, - { name = "snowballstemmer", marker = "python_full_version < '3.11'" }, - { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.11'" }, - { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.11'" }, - { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.11'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.11'" }, - { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.11'" }, - { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.11'" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "alabaster" }, + { name = "babel" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" } }, + { name = "imagesize" }, + { name = "jinja2" }, + { name = "packaging" }, + { name = "pygments" }, + { name = "requests" }, + { name = "snowballstemmer" }, + { name = "sphinxcontrib-applehelp" }, + { name = "sphinxcontrib-devhelp" }, + { name = "sphinxcontrib-htmlhelp" }, + { name = "sphinxcontrib-jsmath" }, + { name = "sphinxcontrib-qthelp" }, + { name = "sphinxcontrib-serializinghtml" }, + { name = "tomli" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611, upload-time = "2024-10-13T20:27:13.93Z" } wheels = [ @@ -694,23 +708,23 @@ resolution-markers = [ "python_full_version == '3.11.*'", ] dependencies = [ - { name = "alabaster", marker = "python_full_version == '3.11.*'" }, - { name = "babel", marker = "python_full_version == '3.11.*'" }, - { name = "colorama", marker = "python_full_version == '3.11.*' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*'" }, - { name = "imagesize", marker = "python_full_version == '3.11.*'" }, - { name = "jinja2", marker = "python_full_version == '3.11.*'" }, - { name = "packaging", marker = "python_full_version == '3.11.*'" }, - { name = "pygments", marker = "python_full_version == '3.11.*'" }, - { name = "requests", marker = "python_full_version == '3.11.*'" }, - { name = "roman-numerals", marker = "python_full_version == '3.11.*'" }, - { name = "snowballstemmer", marker = "python_full_version == '3.11.*'" }, - { name = "sphinxcontrib-applehelp", marker = "python_full_version == '3.11.*'" }, - { name = "sphinxcontrib-devhelp", marker = "python_full_version == '3.11.*'" }, - { name = "sphinxcontrib-htmlhelp", marker = "python_full_version == '3.11.*'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.11.*'" }, - { name = "sphinxcontrib-qthelp", marker = "python_full_version == '3.11.*'" }, - { name = "sphinxcontrib-serializinghtml", marker = "python_full_version == '3.11.*'" }, + { name = "alabaster" }, + { name = "babel" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" } }, + { name = "imagesize" }, + { name = "jinja2" }, + { name = "packaging" }, + { name = "pygments" }, + { name = "requests" }, + { name = "roman-numerals" }, + { name = "snowballstemmer" }, + { name = "sphinxcontrib-applehelp" }, + { name = "sphinxcontrib-devhelp" }, + { name = "sphinxcontrib-htmlhelp" }, + { name = "sphinxcontrib-jsmath" }, + { name = "sphinxcontrib-qthelp" }, + { name = "sphinxcontrib-serializinghtml" }, ] sdist = { url = "https://files.pythonhosted.org/packages/42/50/a8c6ccc36d5eacdfd7913ddccd15a9cee03ecafc5ee2bc40e1f168d85022/sphinx-9.0.4.tar.gz", hash = "sha256:594ef59d042972abbc581d8baa577404abe4e6c3b04ef61bd7fc2acbd51f3fa3", size = 8710502, upload-time = "2025-12-04T07:45:27.343Z" } wheels = [ @@ -726,23 +740,23 @@ resolution-markers = [ "python_full_version >= '3.12' and python_full_version < '3.15'", ] dependencies = [ - { name = "alabaster", marker = "python_full_version >= '3.12'" }, - { name = "babel", marker = "python_full_version >= '3.12'" }, - { name = "colorama", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, - { name = "imagesize", marker = "python_full_version >= '3.12'" }, - { name = "jinja2", marker = "python_full_version >= '3.12'" }, - { name = "packaging", marker = "python_full_version >= '3.12'" }, - { name = "pygments", marker = "python_full_version >= '3.12'" }, - { name = "requests", marker = "python_full_version >= '3.12'" }, - { name = "roman-numerals", marker = "python_full_version >= '3.12'" }, - { name = "snowballstemmer", marker = "python_full_version >= '3.12'" }, - { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.12'" }, - { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.12'" }, - { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.12'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.12'" }, - { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.12'" }, - { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.12'" }, + { name = "alabaster" }, + { name = "babel" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" } }, + { name = "imagesize" }, + { name = "jinja2" }, + { name = "packaging" }, + { name = "pygments" }, + { name = "requests" }, + { name = "roman-numerals" }, + { name = "snowballstemmer" }, + { name = "sphinxcontrib-applehelp" }, + { name = "sphinxcontrib-devhelp" }, + { name = "sphinxcontrib-htmlhelp" }, + { name = "sphinxcontrib-jsmath" }, + { name = "sphinxcontrib-qthelp" }, + { name = "sphinxcontrib-serializinghtml" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cd/bd/f08eb0f4eed5c83f1ba2a3bd18f7745a2b1525fad70660a1c00224ec468a/sphinx-9.1.0.tar.gz", hash = "sha256:7741722357dd75f8190766926071fed3bdc211c74dd2d7d4df5404da95930ddb", size = 8718324, upload-time = "2025-12-31T15:09:27.646Z" } wheels = [