From 561e99c1055368a22453701b4e58dc44d8467304 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Wed, 24 Jun 2026 09:10:36 +0200 Subject: [PATCH 1/2] Speed up matching of case-insensitive character sets Handle IN_IGNORE, IN_UNI_IGNORE and IN_LOC_IGNORE in SRE(count) so that a repeated case-insensitive set (e.g. [a-z]+ with re.I) scans inline instead of falling back to the per-character match loop. About 2x faster. Co-Authored-By: Claude Opus 4.8 (1M context) --- Modules/_sre/sre_lib.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 6e6ae46f05a50f..71eb7541d35ba5 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -213,6 +213,29 @@ SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount) ptr++; break; + case SRE_OP_IN_IGNORE: + /* repeated set, case-insensitive (ascii) */ + TRACE(("|%p|%p|COUNT IN_IGNORE\n", pattern, ptr)); + while (ptr < end && SRE(charset)(state, pattern + 2, + (SRE_CODE) sre_lower_ascii(*ptr))) + ptr++; + break; + + case SRE_OP_IN_UNI_IGNORE: + /* repeated set, case-insensitive (unicode) */ + TRACE(("|%p|%p|COUNT IN_UNI_IGNORE\n", pattern, ptr)); + while (ptr < end && SRE(charset)(state, pattern + 2, + (SRE_CODE) sre_lower_unicode(*ptr))) + ptr++; + break; + + case SRE_OP_IN_LOC_IGNORE: + /* repeated set, case-insensitive (locale) */ + TRACE(("|%p|%p|COUNT IN_LOC_IGNORE\n", pattern, ptr)); + while (ptr < end && SRE(charset_loc_ignore)(state, pattern + 2, *ptr)) + ptr++; + break; + case SRE_OP_ANY: /* repeated dot wildcard. */ TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); From 0741652d16c43c351eea202376cb389794a851cd Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Wed, 1 Jul 2026 21:25:44 +0200 Subject: [PATCH 2/2] news entry --- .../next/Library/2026-07-01-12-00-00.gh-issue-152054.Ci7Set.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-07-01-12-00-00.gh-issue-152054.Ci7Set.rst diff --git a/Misc/NEWS.d/next/Library/2026-07-01-12-00-00.gh-issue-152054.Ci7Set.rst b/Misc/NEWS.d/next/Library/2026-07-01-12-00-00.gh-issue-152054.Ci7Set.rst new file mode 100644 index 00000000000000..7298995ab1c120 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-01-12-00-00.gh-issue-152054.Ci7Set.rst @@ -0,0 +1,2 @@ +Speed up matching of case-insensitive character sets in :mod:`re`, such as +``[a-z]+`` used with the :const:`re.IGNORECASE` flag. Patch by Pieter Eendebak.