diff --git a/PyMemoryEditor/util/scan.py b/PyMemoryEditor/util/scan.py
index ffa346b..9dbe0fc 100644
--- a/PyMemoryEditor/util/scan.py
+++ b/PyMemoryEditor/util/scan.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
+import re
import struct
import sys
from bisect import bisect_left
@@ -206,6 +207,57 @@ def scan_memory_for_exact_value(
yield offset
+def _scan_string_ordered(
+ data: bytes,
+ end: int,
+ target_value_size: int,
+ lo_byte: int,
+ hi_byte: int,
+ boundary_bytes: frozenset,
+ predicate,
+) -> Generator[int, None, None]:
+ """C-accelerated string scan for ordered comparisons (>, <, >=, <=, between).
+
+ Strings compare big-endian, so a fixed-width window can only satisfy an
+ ordered comparison when its *first* byte lies in ``[lo_byte, hi_byte]``.
+ Those candidate positions are located with a regex byte class; its C engine
+ skips non-candidate bytes (e.g. long NUL runs, whenever ``lo_byte > 0``) far
+ faster than a per-offset Python loop — the same idea that makes the EXACT path's
+ ``bytes.find`` fast. A candidate is accepted outright unless its first byte
+ *ties* a comparison bound (``boundary_bytes``), in which case the full window
+ is decoded and checked with ``predicate``. Yields offsets in ascending order
+ (``re.finditer`` walks left to right), identical to the byte-by-byte loop.
+
+ ``lo_byte > hi_byte`` denotes an empty candidate range — it arises for a
+ reversed VALUE_BETWEEN (``start > end``), where the byte-by-byte loop's
+ ``start <= v <= end`` also matches nothing. Return empty rather than let
+ ``re`` raise "bad character range" on a ``[hi-lo]`` class.
+ """
+ if end <= 0 or lo_byte > hi_byte:
+ return
+
+ # re.escape keeps every byte (incl. class-specials like ] ^ - [ and NUL)
+ # literal inside the class; the unescaped `-` between them is the range op.
+ # Compile with NO flags: on a bytes pattern `[lo-hi]` is the exact inclusive
+ # ordinal range. In particular do NOT pass re.IGNORECASE — it folds ASCII
+ # case *inside* a class, so a range overlapping A-Z/a-z would match the
+ # opposite case too and silently return non-matching offsets.
+ matcher = re.compile(
+ b"[" + re.escape(bytes((lo_byte,))) + b"-" + re.escape(bytes((hi_byte,))) + b"]"
+ )
+
+ for match in matcher.finditer(data):
+ offset = match.start()
+ if offset >= end:
+ break
+ if data[offset] in boundary_bytes:
+ value = int.from_bytes(data[offset : offset + target_value_size], "big")
+ if predicate(value):
+ yield offset
+ else:
+ yield offset
+
+
def scan_memory(
memory_region_data: Sequence,
memory_region_data_size: int,
@@ -236,14 +288,25 @@ def scan_memory(
# narrowing for the downstream int.from_bytes / struct.unpack calls.
byte_order: _ByteOrder = cast(_ByteOrder, "big" if is_string else sys.byteorder)
+ # First byte of each target, used by the string fast path below to build the
+ # candidate-byte regex class. Captured here where `target_value`'s type is
+ # narrowed (tuple vs bytes); `None` means "empty target, no fast path".
+ first_byte: Optional[int]
+ start_first_byte: Optional[int]
+ end_first_byte: Optional[int]
if isinstance(target_value, tuple):
start_target_value = _decode_target(target_value[0], byte_order, pytype)
end_target_value = _decode_target(target_value[1], byte_order, pytype)
target_value_decoded: Union[int, float] = 0
+ first_byte = None
+ start_first_byte = target_value[0][0] if target_value[0] else None
+ end_first_byte = target_value[1][0] if target_value[1] else None
else:
target_value_decoded = _decode_target(target_value, byte_order, pytype)
start_target_value = 0
end_target_value = 0
+ first_byte = target_value[0] if target_value else None
+ start_first_byte = end_first_byte = None
fmt = None if is_string else _struct_format(byte_order, target_value_size, pytype)
@@ -334,6 +397,37 @@ def scan_memory(
int_from_bytes = int.from_bytes
signed = pytype is int
+ # Fast path for ordered string comparisons. Strings compare big-endian, so a
+ # window can only match when its first byte falls in a known range; a regex
+ # byte-class prefilter finds those candidates in C, skipping the huge NUL
+ # runs of reserved memory instead of stepping every byte in Python. Numerics
+ # with unusual sizes (3/6/7) decode in host byte order and fall through unchanged.
+ if is_string:
+ spec = None
+ if first_byte is not None and scan_type is ScanTypesEnum.BIGGER_THAN:
+ spec = (first_byte, 0xFF, frozenset((first_byte,)),
+ lambda v: v > target_value_decoded)
+ elif first_byte is not None and scan_type is ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE:
+ spec = (first_byte, 0xFF, frozenset((first_byte,)),
+ lambda v: v >= target_value_decoded)
+ elif first_byte is not None and scan_type is ScanTypesEnum.SMALLER_THAN:
+ spec = (0x00, first_byte, frozenset((first_byte,)),
+ lambda v: v < target_value_decoded)
+ elif first_byte is not None and scan_type is ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE:
+ spec = (0x00, first_byte, frozenset((first_byte,)),
+ lambda v: v <= target_value_decoded)
+ elif (
+ scan_type is ScanTypesEnum.VALUE_BETWEEN
+ and start_first_byte is not None
+ and end_first_byte is not None
+ ):
+ spec = (start_first_byte, end_first_byte,
+ frozenset((start_first_byte, end_first_byte)),
+ lambda v: start_target_value <= v <= end_target_value)
+ if spec is not None:
+ yield from _scan_string_ordered(data, end, target_value_size, *spec)
+ return
+
if scan_type is ScanTypesEnum.EXACT_VALUE:
for offset in range(0, end, step):
value = int_from_bytes(
diff --git a/docs/guide/searching.md b/docs/guide/searching.md
index 0aedcb8..506f1c0 100644
--- a/docs/guide/searching.md
+++ b/docs/guide/searching.md
@@ -198,10 +198,13 @@ missing.
## Scan acceleration (the `speed` extra)
By default every scan runs in pure Python, with the hottest paths already
-delegated to C primitives (`bytes.find` for exact matches, `struct.iter_unpack`
-to decode a region). What stays in Python is the per-value **comparison loop**
-of the ordered scans (`BIGGER_THAN`, `SMALLER_THAN`, `VALUE_BETWEEN`, …): for a
-multi-megabyte region it boxes and compares millions of values one at a time.
+delegated to C primitives: `bytes.find` for exact matches, `struct.iter_unpack`
+to decode a region, and a **regex byte-class prefilter** for ordered *string*
+comparisons (`BIGGER_THAN` / `SMALLER_THAN` / `VALUE_BETWEEN` on `str`), which
+skips the long runs of non-matching bytes in C instead of stepping every offset.
+What stays in Python is the per-value **comparison loop** of the ordered
+*numeric* scans: for a multi-megabyte region it boxes and compares millions of
+values one at a time.
Installing the optional [`speed`](../installation.md#install-with-scan-acceleration-speed)
extra replaces that loop with a single vectorized NumPy comparison:
@@ -241,7 +244,8 @@ emitting matches.
| Scenario | Typical speedup |
| Selective scan of a large region (few matches — the usual first scan / refine step) | 10–60× |
Scan where most values match (e.g. > 0 on mostly-positive data) | ~2× (result building dominates) |
-str / bytes scans, or unusual widths (3/6/7 bytes) | no change (no NumPy fast path; pure-Python loop) |
+| `str` ordered scans (`>`, `>=`, `<`, `<=`, between) | no NumPy fast path — instead C-accelerated by the regex byte-class prefilter (independent of the `speed` extra) |
+| `bytes` scans, or unusual widths (3/6/7 bytes) | no change (no NumPy fast path; pure-Python loop) |
EXACT_VALUE via search_by_value | already bytes.find in C — NumPy not used |
@@ -262,9 +266,14 @@ for address in process.search_by_value(str, 6, "PLAYER"):
print(hex(address))
```
-For `bytes`, comparison ordering depends on your system's `byteorder` —
-something to keep in mind when using `BIGGER_THAN` / `SMALLER_THAN` on raw
-bytes.
+Ordering for the comparison modes differs by type:
+
+- **`str`** compares the UTF-8 bytes **lexicographically** (big-endian), so
+ `"AA" < "AB" < "B"`. The shorter of two values is NUL-padded to `bufflength`
+ before comparing, and a reversed `VALUE_BETWEEN` range (`start > end`) simply
+ matches nothing.
+- **`bytes`** compares using your system's `byteorder` — something to keep in
+ mind when using `BIGGER_THAN` / `SMALLER_THAN` on raw bytes.
```{seealso}
- [Pattern scan](pattern-scan.md) — find data by **shape** with regex and AOB
diff --git a/tests/test_scan.py b/tests/test_scan.py
index 89e853f..413f89a 100644
--- a/tests/test_scan.py
+++ b/tests/test_scan.py
@@ -374,3 +374,64 @@ def test_scan_memory_double_bigger_than_negative():
# -1.0 (offset 8), 1.0 (16), 3.0 (24) match; -3.0 (offset 0) does not.
assert results == [8, 16, 24]
+
+
+# --- String ordered-comparison fast path (regex byte-class prefilter) ---------
+#
+# These exercise the prefilter through scan_memory with hand-checked expected
+# offsets, complementing the property-based equivalence tests in test_scan_properties.py.
+
+
+def _scan_str(data, target, size, scan_type):
+ return list(scan_memory(data, len(data), target, size, scan_type, str))
+
+
+def test_scan_string_bigger_than_first_byte_dominates():
+ # 2-byte windows, step 1. Target "MA" (0x4D41). Accept windows > it.
+ data = b"AAZZMAMBLZ"
+ results = _scan_str(data, b"MA", 2, ScanTypesEnum.BIGGER_THAN)
+ # Windows (big-endian) and whether > "MA": AA<,AZ<,ZZ>,ZM>,MA=,AM<,MB>,BL<,LZ<
+ assert results == [2, 3, 6]
+
+
+def test_scan_string_smaller_than_includes_low_bytes():
+ data = b"AAMAZZ"
+ results = _scan_str(data, b"MA", 2, ScanTypesEnum.SMALLER_THAN)
+ # AA<,AM<,MA=,AZ<,ZZ> -> offsets 0,1,3 are smaller.
+ assert results == [0, 1, 3]
+
+
+def test_scan_string_value_between_skips_noise():
+ # Only windows whose value lands in ["EA","WZ"] inclusive should match.
+ data = b"AB" + b"EM" + b"ZZ" + b"WZ" + b" "
+ results = _scan_str(data, (b"EA", b"WZ"), 2, ScanTypesEnum.VALUE_BETWEEN)
+ # offsets: 0 AB(no) 1 BE(no) 2 EM(yes) 3 MZ(yes) 4 ZZ(no) 5 ZW(no) 6 WZ(yes)
+ # 7 Z?(no) 8 ' '..(no)
+ assert results == [2, 3, 6]
+
+
+def test_scan_string_value_between_reversed_range_is_empty():
+ """Regression: a reversed range (start > end) must yield nothing, not crash.
+
+ The fast path builds a regex class ``[start_byte-end_byte]``; a reversed
+ range would compile to ``[hi-lo]`` and raise ``re.error: bad character
+ range``. The byte-by-byte loop returns [] for start > end, so the fast path
+ must too.
+ """
+ data = b"MMMMMM"
+ assert _scan_str(data, (b"ZZ", b"AA"), 2, ScanTypesEnum.VALUE_BETWEEN) == []
+ # Reversed but sharing a first byte still resolves to empty.
+ assert _scan_str(data, (b"MZ", b"MA"), 2, ScanTypesEnum.VALUE_BETWEEN) == []
+
+
+def test_scan_string_regex_special_bytes_as_bounds():
+ """Bytes that are special inside a regex class (]^-\\[) must be literal."""
+ data = bytes([0x5D, 0x5E, 0x2D, 0x5C, 0x5B, 0x41, 0xFF]) # ] ^ - \\ [ A 0xff
+ # 1-byte windows with a regex-special bound: BIGGER_THAN_OR_EXACT_VALUE '-' (0x2D).
+ # No byte in `data` is below 0x2D, so every offset matches.
+ results = _scan_str(data, b"\x2d", 1, ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE)
+ assert results == [0, 1, 2, 3, 4, 5, 6]
+ # SMALLER_THAN ']' (0x5D): bytes < 0x5D are
+ # '-'(0x2d=off2), '\\'(0x5c=off3), '['(0x5b=off4), 'A'(0x41=off5).
+ results = _scan_str(data, b"\x5d", 1, ScanTypesEnum.SMALLER_THAN)
+ assert results == [2, 3, 4, 5]
diff --git a/tests/test_scan_properties.py b/tests/test_scan_properties.py
index 1cecfe7..ab59ac4 100644
--- a/tests/test_scan_properties.py
+++ b/tests/test_scan_properties.py
@@ -68,6 +68,98 @@ def _int_payload(draw):
return size, b"".join(struct.pack(fmt, v) for v in values), struct.pack(fmt, target)
+# Ordered string comparisons that scan_memory routes through the regex
+# byte-class fast path (NOT_* are dense and keep the byte-by-byte loop).
+_ORDERED_STRING_SCAN_TYPES = (
+ ScanTypesEnum.BIGGER_THAN,
+ ScanTypesEnum.SMALLER_THAN,
+ ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE,
+ ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE,
+)
+
+# Bias the byte alphabet toward the values most likely to trip the fast path:
+# the lexicographic extremes (0x00 / 0xff), boundary ties, and the bytes that
+# are special inside a regex character class ([ ] ^ - \ etc.).
+_TRICKY_BYTES = st.sampled_from([0x00, 0xFF] + list(b"[]^-\\&~|ABC"))
+_ANY_BYTE = st.integers(min_value=0, max_value=255)
+
+
+@st.composite
+def _string_payload(draw):
+ size = draw(st.integers(min_value=1, max_value=8))
+ count = draw(st.integers(min_value=0, max_value=40))
+ byte = st.one_of(_TRICKY_BYTES, _ANY_BYTE)
+ data = bytes(draw(st.lists(byte, min_size=count, max_size=count)))
+ target = bytes(draw(st.lists(byte, min_size=size, max_size=size)))
+ return size, data, target
+
+
+def _scan_string_slow(data, size, target_value, scan_type):
+ """Reference: byte-by-byte big-endian string scan (the pre-fast-path loop)."""
+ end = len(data) - size + 1
+ results = []
+ if isinstance(target_value, tuple):
+ lo = int.from_bytes(target_value[0], "big")
+ hi = int.from_bytes(target_value[1], "big")
+ else:
+ target = int.from_bytes(target_value, "big")
+ for offset in range(0, max(end, 0)):
+ value = int.from_bytes(data[offset : offset + size], "big")
+ if scan_type is ScanTypesEnum.BIGGER_THAN and value > target:
+ results.append(offset)
+ elif scan_type is ScanTypesEnum.SMALLER_THAN and value < target:
+ results.append(offset)
+ elif scan_type is ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE and value >= target:
+ results.append(offset)
+ elif scan_type is ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE and value <= target:
+ results.append(offset)
+ elif scan_type is ScanTypesEnum.VALUE_BETWEEN and lo <= value <= hi:
+ results.append(offset)
+ return results
+
+
+@settings(
+ suppress_health_check=[HealthCheck.too_slow],
+ deadline=None,
+ max_examples=300,
+)
+@given(
+ payload=_string_payload(),
+ scan_type=st.sampled_from(_ORDERED_STRING_SCAN_TYPES),
+)
+def test_ordered_string_scan_matches_reference(payload, scan_type):
+ """Regex byte-class fast path must agree with the byte-by-byte reference.
+
+ Strings step by one byte and compare big-endian, so the fast path uses a
+ first-byte prefilter; this checks it yields exactly the same offsets across
+ boundary ties and regex-special bytes.
+ """
+ size, data, target = payload
+ fast = list(scan_memory(data, len(data), target, size, scan_type, str))
+ slow = _scan_string_slow(data, size, target, scan_type)
+ assert fast == slow
+
+
+@settings(
+ suppress_health_check=[HealthCheck.too_slow],
+ deadline=None,
+ max_examples=300,
+)
+@given(payload=_string_payload())
+def test_value_between_string_matches_reference(payload):
+ """VALUE_BETWEEN over strings (the search_by_value_between path) must match."""
+ size, data, a = payload
+ # Build a valid [lo, hi] range from two same-width byte strings.
+ b = bytes((x + 1) % 256 for x in a)
+ lo, hi = (a, b) if a <= b else (b, a)
+ target = (lo, hi)
+ fast = list(
+ scan_memory(data, len(data), target, size, ScanTypesEnum.VALUE_BETWEEN, str)
+ )
+ slow = _scan_string_slow(data, size, target, ScanTypesEnum.VALUE_BETWEEN)
+ assert fast == slow
+
+
@st.composite
def _float_payload(draw):
size = draw(st.sampled_from(_FLOAT_SIZES))