Skip to content

Commit 7233606

Browse files
gh-152100: Fuse set-operation character classes into a single charset
Add a compile-time optimization pass (Lib/re/_optimizer.py) that rewrites set-operation character classes into a single character set where the engine's charset() representation allows it. charset() treats every NEGATE as a polarity toggle, so a mid-list NEGATE expresses set difference and a flat run expresses union. Set difference -- [A--B], emitted by the parser as A(?<![B]) -- fuses into the charset [NEGATE] B [NEGATE] A, matching A minus B in one test instead of a charset match plus a lookbehind rescan. _optimize_charset is made segment-aware so the interior NEGATE compiles correctly. A union with a non-flat operand, such as [0-9||[a-z--b]], is emitted by the parser as a BRANCH that it cannot merge. Once its alternatives are all one-character matchers, their item lists are concatenated into a single IN. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 11c241e commit 7233606

3 files changed

Lines changed: 130 additions & 7 deletions

File tree

Lib/re/_compiler.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from ._casefix import _EXTRA_CASES
1717
from ._optimizer import (
1818
_combine_flags, _compile_charset, _optimize_charset, _compile_info,
19-
_simple, _CHARSET_ALL, _CODEBITS, MAXCODE,
19+
_simple, _CHARSET_ALL, _CODEBITS, MAXCODE, optimize,
2020
)
2121

2222
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@@ -219,6 +219,10 @@ def isstring(obj):
219219
def _code(p, flags):
220220

221221
flags = p.state.flags | flags
222+
223+
# run the optimizer passes over the parsed pattern
224+
optimize(p)
225+
222226
code = []
223227

224228
# compile info block

Lib/re/_optimizer.py

Lines changed: 114 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,39 @@ def _compile_charset(charset, flags, code):
5656
emit(FAILURE)
5757

5858
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    # internal: optimize character set.
    #
    # Every NEGATE item flips the matching polarity, so the NEGATE markers
    # cut the set into runs of alternating polarity: a NEGATE at index 0
    # complements the whole class, one further in expresses set difference.
    # Each run is a plain union and is optimized independently; the NEGATE
    # markers themselves stay where they were.
    #
    # Returns (items, hascased), the same shape _optimize_charset_segment
    # returns.
    cuts = [pos for pos, (code, _arg) in enumerate(charset) if code is NEGATE]
    if cuts == [0] or not cuts:
        # No NEGATE at all, or only a leading one: the whole list is a single
        # union (optionally complemented), handled in one call.
        return _optimize_charset_segment(charset, iscased, fixup, fixes)

    # Interior NEGATE present. The whole-set shortcuts are disabled through
    # _allow_anyall=False because they replace the entire output and must
    # not fire on a single run.
    end = len(charset)
    pieces = []
    any_cased = False
    begin = 0
    for cut in cuts + [end]:
        if begin < cut:  # nothing to optimize between adjacent boundaries
            piece, piece_cased = _optimize_charset_segment(
                charset[begin:cut], iscased, fixup, fixes,
                _allow_anyall=False)
            pieces += piece
            any_cased |= piece_cased
        if cut != end:
            # re-insert the boundary marker that delimited this run
            pieces.append((NEGATE, None))
        begin = cut + 1
    return pieces, any_cased
88+
89+
def _optimize_charset_segment(charset, iscased=None, fixup=None, fixes=None,
90+
_allow_anyall=True):
91+
# internal: optimize one NEGATE-free union of character-set members
6092
out = []
6193
tail = []
6294
charmap = bytearray(256)
@@ -94,7 +126,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
94126
charmap[i] = 1
95127
elif op is NEGATE:
96128
out.append((op, av))
97-
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
129+
elif op is CATEGORY and _allow_anyall and tail and (CATEGORY, CH_NEGATE[av]) in tail:
98130
# Optimize [\s\S] etc.
99131
out = [] if out else _CHARSET_ALL
100132
return out, False
@@ -395,3 +427,83 @@ def _compile_info(code, pattern, flags):
395427
elif charset:
396428
_compile_charset(charset, flags, code)
397429
code[skip] = len(code) - skip
430+
431+
# Difference-fusion peephole: rewrite [A--B]-style A(?<![B]) into a single
432+
# charset (see the engine's NEGATE polarity toggle).
433+
def _subpatterns(op, av):
    # Generate the nested SubPatterns held by one (op, av) item so the caller
    # can recurse into them. Items of any other opcode produce nothing.
    if op is BRANCH:
        nested = av[1]
    elif op in (ASSERT, ASSERT_NOT):
        nested = (av[1],)
    elif op is SUBPATTERN:
        nested = (av[3],)
    elif op is ATOMIC_GROUP:
        nested = (av,)
    elif op in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT):
        nested = (av[2],)
    elif op is GROUPREF_EXISTS:
        # av[1] (the "yes" branch) is always yielded; av[2] (the "no"
        # branch) only when it is not None.
        nested = (av[1],) if av[2] is None else (av[1], av[2])
    else:
        nested = ()
    yield from nested
449+
450+
def _fuse_branch(av):
    # Try to collapse a BRANCH into the item list of one charset.
    #
    # Every alternative in av[1] must be a single charset according to
    # _parser._flat_items (complemented charsets allowed); otherwise None is
    # returned and the BRANCH is left alone. The result is the concatenation
    # of the alternatives' items. At most one alternative may contain a
    # NEGATE, and its items are placed last; a second NEGATE-bearing
    # alternative also yields None.
    plain = []
    negated = None
    for alternative in av[1]:
        members = _parser._flat_items(alternative.data, True)
        if members is None:
            return None
        carries_negate = False
        for code, _arg in members:
            if code is NEGATE:
                carries_negate = True
                break
        if not carries_negate:
            plain.extend(members)
            continue
        if negated is not None:
            # two NEGATE-bearing alternatives cannot share one charset
            return None
        negated = members
    if negated is None:
        return plain
    return plain + negated
469+
470+
def _fuse_difference(data):
    # Rewrite, in place, every run
    #     <flat charset A> (?<![B1]) (?<![B2]) ...
    # into the single item
    #     (IN, [NEGATE] B1 B2 ... [NEGATE] A)
    # A negative lookbehind is fused only when its body is a flat charset and
    # the item just before the run is a flat charset too; anything else is
    # copied through unchanged.
    result = []
    minuend = None   # items of A for the fused charset sitting at result[-1]
    removed = None   # B items gathered so far; None while no fusion is open
    for op, av in data:
        fusable = None
        if op is ASSERT_NOT and av[0] < 0:  # a negative lookbehind
            fusable = _parser._flat_items(av[1].data)
        if fusable is not None:
            if removed is None and result:
                # First lookbehind of a potential run: check whether the
                # preceding item qualifies as A.
                minuend = _parser._flat_items([result[-1]])
                if minuend is not None:
                    removed = []
            if removed is not None:
                removed += fusable
                # Rebuild the fused charset with every B collected so far.
                result[-1] = (IN, [(NEGATE, None)] + removed
                              + [(NEGATE, None)] + minuend)
                continue
        # Not fusable: close any open run and keep the item as is.
        minuend = removed = None
        result.append((op, av))
    data[:] = result
495+
496+
def _walk(seq):
    # Optimize one item sequence in place, children first: recurse into each
    # item's nested sub-patterns, replace any BRANCH that _fuse_branch can
    # fold with a single IN, then run difference fusion over the sequence.
    for pos, (code, arg) in enumerate(seq):
        for child in _subpatterns(code, arg):
            _walk(child.data)
        if code is not BRANCH:
            continue
        merged = _fuse_branch(arg)
        if merged is not None:
            seq[pos] = (IN, merged)
    _fuse_difference(seq)
505+
506+
def optimize(pattern):
    """Run the optimizer passes over *pattern*, mutating it, and return it.

    The passes operate on ``pattern.data``; the same *pattern* object that
    was passed in is handed back to the caller.
    """
    items = pattern.data
    _walk(items)
    return pattern

Lib/re/_parser.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -516,14 +516,19 @@ def _charset_node(items):
516516
return items[0]
517517
return (IN, items)
518518

519-
def _flat_items(elements):
520-
# The items if `elements` is a single flat charset (no complement), else
521-
# None -- the dual of _charset_node: a lone LITERAL or CATEGORY is an item.
519+
def _flat_items(elements, complement=False):
    # Return the set items when `elements` is exactly one flat charset, else
    # None. A lone item whose opcode is in _SETITEMCODES counts as a
    # one-item charset; an IN yields its own item list. An IN that contains
    # a NEGATE is accepted only when `complement` is true.
    if len(elements) != 1:
        return None
    only = elements[0]
    op, av = only
    if op in _SETITEMCODES:
        return [only]
    if op is not IN:
        return None
    if not complement and any(code is NEGATE for code, _arg in av):
        return None
    return av
529534

@@ -677,6 +682,8 @@ def _parse_charset(source, state, nested):
677682
# [A--B] -> A (?<![B]) difference
678683
# [A&&B] -> A (?<=[B]) intersection
679684
# [A||B] -> [AB] or (?:A|B) union
685+
# A flat-operand difference [A--B] is later fused back into a single charset
686+
# by Lib/re/_optimizer.py (see that module).
680687
# Operators chain left-to-right with no precedence. A leading '^' negates by
681688
# De Morgan, pushing the negation into the operands (no lookahead needed):
682689
# [^A--B] -> [^A] | B ; [^A&&B] -> [^A] | [^B] ; [^A||B] -> [^A] && [^B]

0 commit comments

Comments
 (0)