@@ -56,7 +56,39 @@ def _compile_charset(charset, flags, code):
5656 emit (FAILURE )
5757
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    # internal: optimize character set.
    #
    # NEGATE items act as polarity boundaries: per the engine's charset()
    # walk (Modules/_sre/sre_lib.h), every NEGATE flips the match polarity,
    # so a leading NEGATE expresses a complemented class [^...] and an
    # interior NEGATE expresses set difference.  The items between two
    # boundaries form a plain union; each such run is optimized separately
    # and the boundaries are re-emitted where they were.
    #
    # Returns (optimized_items, hascased), the same shape that
    # _optimize_charset_segment returns.
    boundaries = [pos for pos, (op, _av) in enumerate(charset)
                  if op is NEGATE]
    if boundaries in ([], [0]):
        # No NEGATE at all, or a single leading one: the set is one union
        # (possibly complemented as a whole), so the segment optimizer can
        # handle it directly with all of its shortcuts enabled.
        return _optimize_charset_segment(charset, iscased, fixup, fixes)

    # General case: walk the runs between boundaries.  The whole-set
    # shortcuts ([\s\S] -> ANY_ALL, [^\s\S] -> empty) are disabled through
    # _allow_anyall=False because they replace an entire set and so could
    # add or remove a NEGATE in the middle of the sequence.
    total = len(charset)
    result = []
    any_cased = False
    run_start = 0
    for stop in boundaries + [total]:
        if stop > run_start:
            # Non-empty run; an empty one occurs e.g. before a leading
            # NEGATE and contributes nothing.
            piece, piece_cased = _optimize_charset_segment(
                charset[run_start:stop], iscased, fixup, fixes,
                _allow_anyall=False)
            result.extend(piece)
            any_cased |= piece_cased
        if stop < total:
            # 'stop' is a real NEGATE position, not the end sentinel.
            result.append((NEGATE, None))
        run_start = stop + 1
    return result, any_cased
88+
89+ def _optimize_charset_segment (charset , iscased = None , fixup = None , fixes = None ,
90+ _allow_anyall = True ):
91+ # internal: optimize one NEGATE-free union of character-set members
6092 out = []
6193 tail = []
6294 charmap = bytearray (256 )
@@ -94,7 +126,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
94126 charmap [i ] = 1
95127 elif op is NEGATE :
96128 out .append ((op , av ))
97- elif op is CATEGORY and tail and (CATEGORY , CH_NEGATE [av ]) in tail :
129+ elif op is CATEGORY and _allow_anyall and tail and (CATEGORY , CH_NEGATE [av ]) in tail :
98130 # Optimize [\s\S] etc.
99131 out = [] if out else _CHARSET_ALL
100132 return out , False
@@ -395,3 +427,83 @@ def _compile_info(code, pattern, flags):
395427 elif charset :
396428 _compile_charset (charset , flags , code )
397429 code [skip ] = len (code ) - skip
430+
431+ # Difference-fusion peephole: rewrite [A--B]-style A(?<![B]) into a single
432+ # charset (see the engine's NEGATE polarity toggle).
def _subpatterns(op, av):
    # Generator over the SubPattern objects nested directly inside one
    # (op, av) item, so a caller can recurse into them.  Items whose opcode
    # is not listed below yield nothing.
    if op is BRANCH:
        # av[1] holds the alternatives.
        yield from av[1]
        return
    if op in (ASSERT, ASSERT_NOT):
        yield av[1]
        return
    if op is SUBPATTERN:
        yield av[3]
        return
    if op is ATOMIC_GROUP:
        # The argument is itself the nested pattern.
        yield av
        return
    if op in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT):
        yield av[2]
        return
    if op is GROUPREF_EXISTS:
        # The "yes" branch is always there; the "no" branch may be None.
        yield av[1]
        no_branch = av[2]
        if no_branch is not None:
            yield no_branch
449+
def _fuse_branch(av):
    # Try to collapse a BRANCH whose alternatives are all one-character
    # matchers into the item list of a single charset.
    #
    # The union of the alternatives is the concatenation of their item
    # lists.  Only one alternative may contain a NEGATE, and its items are
    # placed last so that its polarity changes cannot affect the plain
    # alternatives.  Two NEGATE-bearing alternatives (for example
    # [a-z--b]||[a-z--c]) cannot be expressed this way.
    #
    # Returns the fused item list, or None when the BRANCH must be kept
    # (an alternative is not a flat charset, or more than one carries a
    # NEGATE).
    plain = []
    negated = None
    for alternative in av[1]:
        members = _parser._flat_items(alternative.data, True)
        if members is None:
            return None
        if not any(op is NEGATE for op, _av in members):
            plain.extend(members)
            continue
        if negated is not None:
            # A second NEGATE-bearing alternative: give up.
            return None
        negated = members
    if negated is None:
        return plain
    return plain + negated
469+
def _fuse_difference(data):
    # Rewrite, in place, every run of the form
    #     <flat charset A> (?<![B1]) (?<![B2]) ...
    # into one IN item whose members are
    #     NEGATE B1 B2 ... NEGATE A
    # Each negative lookbehind over a flat charset removes that set from
    # the characters A may match.  Items that are not part of such a run
    # are copied through unchanged.
    marker = (NEGATE, None)
    fused = []
    minuend = None   # flat items of A for the difference sitting at fused[-1]
    removed = None   # accumulated B items; None while no run is being fused
    for op, av in data:
        flat = None
        if op is ASSERT_NOT and av[0] < 0:
            # A negative lookbehind: see whether its body is a flat charset.
            flat = _parser._flat_items(av[1].data)
        if flat is not None:
            if removed is None and fused:
                # First lookbehind of a potential run: only now check
                # whether the item just before it is a flat charset.
                minuend = _parser._flat_items([fused[-1]])
                if minuend is not None:
                    removed = []
            if removed is not None:
                removed.extend(flat)
                # Replace A (or the previously fused difference) with the
                # difference that also excludes this lookbehind's set.
                fused[-1] = (IN, [marker] + removed + [marker] + minuend)
                continue
        # Anything else ends the current run and is emitted as is.
        minuend = removed = None
        fused.append((op, av))
    data[:] = fused
495+
def _walk(seq):
    # Apply the peephole rewrites to one item sequence, bottom-up: nested
    # sub-patterns are processed first, then fusible BRANCH items at this
    # level are replaced by IN items, and finally charset/lookbehind runs
    # in this sequence are fused.  'seq' is modified in place.
    for index, (op, av) in enumerate(seq):
        for child in _subpatterns(op, av):
            _walk(child.data)
        if op is not BRANCH:
            continue
        merged = _fuse_branch(av)
        if merged is not None:
            seq[index] = (IN, merged)
    _fuse_difference(seq)
505+
def optimize(pattern):
    """Apply the peephole rewrites to a parsed pattern.

    The item list in ``pattern.data`` is rewritten in place; the same
    *pattern* object is returned for convenience.
    """
    _walk(pattern.data)
    return pattern
0 commit comments