gh-152100: Fuse set-operation character classes into a single charset

serhiy-storchaka · claude · serhiy-storchaka · commit 723360672db2 · 2026-06-25T18:16:51.000+03:00
Add a compile-time optimization pass (Lib/re/_optimizer.py) that rewrites
set-operation character classes into a single character set where the
engine's charset() representation allows it.  charset() treats every NEGATE
as a polarity toggle, so a mid-list NEGATE expresses set difference and a
flat run expresses union.

Set difference -- [A--B], emitted by the parser as A(?&lt;![B]) -- fuses into
the charset [NEGATE] B [NEGATE] A, matching A minus B in one test instead of
a charset match plus a lookbehind rescan.  _optimize_charset is made
segment-aware so the interior NEGATE compiles correctly.

A union with a non-flat operand, such as [0-9||[a-z--b]], is emitted by the
parser as a BRANCH that it cannot merge.  Once its alternatives are all
one-character matchers, their item lists are concatenated into a single IN.

Co-Authored-By: Claude Opus 4.8 &lt;noreply@anthropic.com&gt;
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
@@ -16,7 +16,7 @@
 from ._casefix import _EXTRA_CASES
 from ._optimizer import (
     _combine_flags, _compile_charset, _optimize_charset, _compile_info,
-    _simple, _CHARSET_ALL, _CODEBITS, MAXCODE,
+    _simple, _CHARSET_ALL, _CODEBITS, MAXCODE, optimize,
 )
 
 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@@ -219,6 +219,10 @@ def isstring(obj):
 def _code(p, flags):
 
     flags = p.state.flags | flags
+
+    # run the optimizer passes over the parsed pattern
+    optimize(p)
+
     code = []
 
     # compile info block
diff --git a/Lib/re/_optimizer.py b/Lib/re/_optimizer.py
@@ -56,7 +56,39 @@ def _compile_charset(charset, flags, code):
     emit(FAILURE)
 
 def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
-    # internal: optimize character set
+    # internal: optimize character set.
+    #
+    # The engine's charset() walk toggles polarity on every NEGATE (see
+    # Modules/_sre/sre_lib.h), so NEGATE markers split the set into
+    # alternating-polarity segments: a leading NEGATE is a complemented class
+    # [^...], an interior one is set difference (RL1.3).  Each segment is a
+    # plain union, optimized on its own with the NEGATE boundaries kept in place.
+    negates = [i for i, (op, _av) in enumerate(charset) if op is NEGATE]
+    if not negates or negates == [0]:
+        # Fast path: a plain union, optionally complemented as a whole -- every
+        # charset the parser produces today, optimized as before.
+        return _optimize_charset_segment(charset, iscased, fixup, fixes)
+
+    # Optimize each NEGATE-delimited run on its own.  _allow_anyall is off: the
+    # [\s\S] -> ANY_ALL / [^\s\S] -> empty shortcuts rewrite a whole set and
+    # would inject or drop a NEGATE mid-segment.
+    out = []
+    hascased = False
+    start = 0
+    for i in negates + [len(charset)]:
+        if i > start:                  # skip an empty run (e.g. a leading NEGATE)
+            opt, cased = _optimize_charset_segment(
+                charset[start:i], iscased, fixup, fixes, _allow_anyall=False)
+            out.extend(opt)
+            hascased |= cased
+        if i < len(charset):
+            out.append((NEGATE, None))
+        start = i + 1
+    return out, hascased
+
+def _optimize_charset_segment(charset, iscased=None, fixup=None, fixes=None,
+                              _allow_anyall=True):
+    # internal: optimize one NEGATE-free union of character-set members
     out = []
     tail = []
     charmap = bytearray(256)
@@ -94,7 +126,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                             charmap[i] = 1
                 elif op is NEGATE:
                     out.append((op, av))
-                elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
+                elif op is CATEGORY and _allow_anyall and tail and (CATEGORY, CH_NEGATE[av]) in tail:
                     # Optimize [\s\S] etc.
                     out = [] if out else _CHARSET_ALL
                     return out, False
@@ -395,3 +427,83 @@ def _compile_info(code, pattern, flags):
     elif charset:
         _compile_charset(charset, flags, code)
     code[skip] = len(code) - skip
+
+# Difference-fusion peephole: rewrite [A--B]-style A(?<![B]) into a single
+# charset (see the engine's NEGATE polarity toggle).
+def _subpatterns(op, av):
+    # Yield the nested SubPatterns of one item, to recurse into.
+    if op is BRANCH:
+        yield from av[1]
+    elif op in (ASSERT, ASSERT_NOT):
+        yield av[1]
+    elif op is SUBPATTERN:
+        yield av[3]
+    elif op is ATOMIC_GROUP:
+        yield av
+    elif op in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT):
+        yield av[2]
+    elif op is GROUPREF_EXISTS:
+        yield av[1]                 # the "yes" branch is always present
+        if av[2] is not None:       # the "no" branch is optional
+            yield av[2]
+
+def _fuse_branch(av):
+    # Fold a BRANCH of one-character matchers into a single charset: their union
+    # is the concatenation of the item lists.  charset() lets only the final
+    # polarity segment subtract, so at most one alternative may be
+    # complement-bearing (carry a NEGATE) and it must trail; two would cross
+    # (e.g. [a-z--b]||[a-z--c]) and are left as a BRANCH.
+    items = []
+    tail = None
+    for sp in av[1]:
+        cs = _parser._flat_items(sp.data, True)
+        if cs is None:
+            return None
+        if any(op is NEGATE for op, _av in cs):
+            if tail is not None:
+                return None
+            tail = cs
+        else:
+            items += cs
+    return items if tail is None else items + tail
+
+def _fuse_difference(data):
+    # Replace  <flat charset A> (?<![B1]) (?<![B2]) ...  with the single charset
+    # [NEGATE] B1 B2 ... [NEGATE] A.  Each negative lookbehind over a flat
+    # charset subtracts its set from the character A matches.
+    out = []
+    head = None        # _flat_items(A) for the fused difference now at out[-1]
+    subtrahend = None  # its accumulated B items, or None when not fusing
+    for op, av in data:
+        if op is ASSERT_NOT and av[0] < 0:          # a negative lookbehind
+            b = _parser._flat_items(av[1].data)
+            if b is not None:
+                if subtrahend is None and out:
+                    # the first lookbehind of a run: only now is it worth
+                    # checking whether the preceding item A is a flat charset.
+                    head = _parser._flat_items([out[-1]])
+                    if head is not None:
+                        subtrahend = []
+                if subtrahend is not None:
+                    subtrahend += b
+                    out[-1] = (IN, [(NEGATE, None)] + subtrahend
+                                   + [(NEGATE, None)] + head)
+                    continue
+        head = subtrahend = None
+        out.append((op, av))
+    data[:] = out
+
+def _walk(seq):
+    for i, (op, av) in enumerate(seq):
+        for sub in _subpatterns(op, av):
+            _walk(sub.data)
+        if op is BRANCH:
+            items = _fuse_branch(av)
+            if items is not None:
+                seq[i] = (IN, items)
+    _fuse_difference(seq)
+
+def optimize(pattern):
+    """Rewrite a parsed pattern in place and return it."""
+    _walk(pattern.data)
+    return pattern
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
@@ -516,14 +516,19 @@ def _charset_node(items):
         return items[0]
     return (IN, items)
 
-def _flat_items(elements):
-    # The items if `elements` is a single flat charset (no complement), else
-    # None -- the dual of _charset_node: a lone LITERAL or CATEGORY is an item.
+def _flat_items(elements, complement=False):
+    # The items if `elements` is a single flat charset, else None -- the dual
+    # of _charset_node: a lone LITERAL or CATEGORY is an item.  A complemented
+    # charset (a NEGATE-bearing IN) qualifies only when `complement` is true.
     if len(elements) == 1:
         op, av = elements[0]
         if op in _SETITEMCODES:
             return [elements[0]]
-        if op is IN and all(o is not NEGATE for o, _av in av):
+        if op is IN:
+            if not complement:
+                for o, _av in av:
+                    if o is NEGATE:
+                        return None
             return av
     return None
 
@@ -677,6 +682,8 @@ def _parse_charset(source, state, nested):
     #   [A--B]  ->  A (?<![B])           difference
     #   [A&&B]  ->  A (?<=[B])           intersection
     #   [A||B]  ->  [AB] or (?:A|B)      union
+    # A flat-operand difference [A--B] is later fused back into a single charset
+    # by Lib/re/_optimizer.py (see that module).
     # Operators chain left-to-right with no precedence.  A leading '^' negates by
     # De Morgan, pushing the negation into the operands (no lookahead needed):
     #   [^A--B] -> [^A] | B ; [^A&&B] -> [^A] | [^B] ; [^A||B] -> [^A] && [^B]