diff --git a/PyMemoryEditor/util/scan.py b/PyMemoryEditor/util/scan.py index ffa346b..9dbe0fc 100644 --- a/PyMemoryEditor/util/scan.py +++ b/PyMemoryEditor/util/scan.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import re import struct import sys from bisect import bisect_left @@ -206,6 +207,57 @@ def scan_memory_for_exact_value( yield offset +def _scan_string_ordered( + data: bytes, + end: int, + target_value_size: int, + lo_byte: int, + hi_byte: int, + boundary_bytes: frozenset, + predicate, +) -> Generator[int, None, None]: + """C-accelerated string scan for ordered comparisons (>, <, >=, <=, between). + + Strings compare big-endian, so a fixed-width window can only satisfy an + ordered comparison when its *first* byte lies in ``[lo_byte, hi_byte]``. + Those candidate positions are located with a regex byte class whose C engine + skips non-candidate bytes (incl. NUL runs whenever ``lo_byte > 0``) much faster + than a per-offset Python loop — the same idea that makes the EXACT path's + ``bytes.find`` fast. A candidate is accepted outright unless its first byte + *ties* a comparison bound (``boundary_bytes``), in which case the full window + is decoded and checked with ``predicate``. Yields offsets in ascending order + (``re.finditer`` walks left to right), identical to the byte-by-byte loop. + + ``lo_byte > hi_byte`` denotes an empty candidate range — it arises for a + reversed VALUE_BETWEEN (``start > end``), where the byte-by-byte loop's + ``start <= v <= end`` also matches nothing. Return empty rather than let + ``re`` raise "bad character range" on a ``[hi-lo]`` class. + """ + if end <= 0 or lo_byte > hi_byte: + return + + # re.escape keeps every byte (incl. class-specials like ] ^ - [ and NUL) + # literal inside the class; the unescaped `-` between them is the range op. + # Compile with NO flags: on a bytes pattern `[lo-hi]` is the exact inclusive + # ordinal range. 
In particular do NOT pass re.IGNORECASE — it folds ASCII + # case *inside* a class, so a range overlapping A-Z/a-z would match the + # opposite case too and silently return non-matching offsets. + matcher = re.compile( + b"[" + re.escape(bytes((lo_byte,))) + b"-" + re.escape(bytes((hi_byte,))) + b"]" + ) + + for match in matcher.finditer(data): + offset = match.start() + if offset >= end: + break + if data[offset] in boundary_bytes: + value = int.from_bytes(data[offset : offset + target_value_size], "big") + if predicate(value): + yield offset + else: + yield offset + + def scan_memory( memory_region_data: Sequence, memory_region_data_size: int, @@ -236,14 +288,25 @@ def scan_memory( # narrowing for the downstream int.from_bytes / struct.unpack calls. byte_order: _ByteOrder = cast(_ByteOrder, "big" if is_string else sys.byteorder) + # First byte of each target, used by the string fast path below to build the + # candidate-byte regex class. Captured here where `target_value`'s type is + # narrowed (tuple vs bytes); `None` means "empty target, no fast path". + first_byte: Optional[int] + start_first_byte: Optional[int] + end_first_byte: Optional[int] if isinstance(target_value, tuple): start_target_value = _decode_target(target_value[0], byte_order, pytype) end_target_value = _decode_target(target_value[1], byte_order, pytype) target_value_decoded: Union[int, float] = 0 + first_byte = None + start_first_byte = target_value[0][0] if target_value[0] else None + end_first_byte = target_value[1][0] if target_value[1] else None else: target_value_decoded = _decode_target(target_value, byte_order, pytype) start_target_value = 0 end_target_value = 0 + first_byte = target_value[0] if target_value else None + start_first_byte = end_first_byte = None fmt = None if is_string else _struct_format(byte_order, target_value_size, pytype) @@ -334,6 +397,37 @@ def scan_memory( int_from_bytes = int.from_bytes signed = pytype is int + # Fast path for ordered string comparisons. 
Strings compare big-endian, so a + # window can only match when its first byte falls in a known range; a regex + # byte-class prefilter finds those candidates in C, skipping bytes + # outside that range instead of stepping every byte in Python. Numerics + # with unusual sizes (3/6/7) decode in sys.byteorder and fall through unchanged. + if is_string: + spec = None + if first_byte is not None and scan_type is ScanTypesEnum.BIGGER_THAN: + spec = (first_byte, 0xFF, frozenset((first_byte,)), + lambda v: v > target_value_decoded) + elif first_byte is not None and scan_type is ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE: + spec = (first_byte, 0xFF, frozenset((first_byte,)), + lambda v: v >= target_value_decoded) + elif first_byte is not None and scan_type is ScanTypesEnum.SMALLER_THAN: + spec = (0x00, first_byte, frozenset((first_byte,)), + lambda v: v < target_value_decoded) + elif first_byte is not None and scan_type is ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE: + spec = (0x00, first_byte, frozenset((first_byte,)), + lambda v: v <= target_value_decoded) + elif ( + scan_type is ScanTypesEnum.VALUE_BETWEEN + and start_first_byte is not None + and end_first_byte is not None + ): + spec = (start_first_byte, end_first_byte, + frozenset((start_first_byte, end_first_byte)), + lambda v: start_target_value <= v <= end_target_value) + if spec is not None: + yield from _scan_string_ordered(data, end, target_value_size, *spec) + return + if scan_type is ScanTypesEnum.EXACT_VALUE: for offset in range(0, end, step): value = int_from_bytes( diff --git a/docs/guide/searching.md b/docs/guide/searching.md index 0aedcb8..506f1c0 100644 --- a/docs/guide/searching.md +++ b/docs/guide/searching.md @@ -198,10 +198,13 @@ missing. ## Scan acceleration (the `speed` extra) By default every scan runs in pure Python, with the hottest paths already -delegated to C primitives (`bytes.find` for exact matches, `struct.iter_unpack` -to decode a region). 
What stays in Python is the per-value **comparison loop** -of the ordered scans (`BIGGER_THAN`, `SMALLER_THAN`, `VALUE_BETWEEN`, …): for a -multi-megabyte region it boxes and compares millions of values one at a time. +delegated to C primitives: `bytes.find` for exact matches, `struct.iter_unpack` +to decode a region, and a **regex byte-class prefilter** for ordered *string* +comparisons (`BIGGER_THAN` / `SMALLER_THAN` / `VALUE_BETWEEN` on `str`), which +skips the long runs of non-matching bytes in C instead of stepping every offset. +What stays in Python is the per-value **comparison loop** of the ordered +*numeric* scans: for a multi-megabyte region it boxes and compares millions of +values one at a time. Installing the optional [`speed`](../installation.md#install-with-scan-acceleration-speed) extra replaces that loop with a single vectorized NumPy comparison: @@ -241,7 +244,8 @@ emitting matches. ScenarioTypical speedup Selective scan of a large region (few matches — the usual first scan / refine step)10–60× Scan where most values match (e.g. > 0 on mostly-positive data)~2× (result building dominates) -str / bytes scans, or unusual widths (3/6/7 bytes)no change (no NumPy fast path; pure-Python loop) +str ordered scans (>, <, between)no NumPy fast path — instead C-accelerated by the regex byte-class prefilter (independent of the speed extra) +bytes scans, or unusual widths (3/6/7 bytes)no change (no NumPy fast path; pure-Python loop) EXACT_VALUE via search_by_valuealready bytes.find in C — NumPy not used @@ -262,9 +266,14 @@ for address in process.search_by_value(str, 6, "PLAYER"): print(hex(address)) ``` -For `bytes`, comparison ordering depends on your system's `byteorder` — -something to keep in mind when using `BIGGER_THAN` / `SMALLER_THAN` on raw -bytes. +Ordering for the comparison modes differs by type: + +- **`str`** compares the UTF-8 bytes **lexicographically** (big-endian), so + `"AA" < "AB" < "B"`. 
The shorter of two values is NUL-padded to `bufflength` + before comparing, and a reversed `VALUE_BETWEEN` range (`start > end`) simply + matches nothing. +- **`bytes`** compares using your system's `byteorder` — something to keep in + mind when using `BIGGER_THAN` / `SMALLER_THAN` on raw bytes. ```{seealso} - [Pattern scan](pattern-scan.md) — find data by **shape** with regex and AOB diff --git a/tests/test_scan.py b/tests/test_scan.py index 89e853f..413f89a 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -374,3 +374,64 @@ def test_scan_memory_double_bigger_than_negative(): # -1.0 (offset 8), 1.0 (16), 3.0 (24) match; -3.0 (offset 0) does not. assert results == [8, 16, 24] + + +# --- String ordered-comparison fast path (regex byte-class prefilter) --------- +# +# These exercise the prefilter directly with hand-checked expected offsets, on +# top of the property-based equivalence tests in test_scan_properties.py. + + +def _scan_str(data, target, size, scan_type): + return list(scan_memory(data, len(data), target, size, scan_type, str)) + + +def test_scan_string_bigger_than_first_byte_dominates(): + # 2-byte windows, step 1. Target "MA" (0x4D41). Accept windows > it. + data = b"AAZZMAMBLZ" + results = _scan_str(data, b"MA", 2, ScanTypesEnum.BIGGER_THAN) + # Windows (big-endian) and whether > "MA": AA<,AZ<,ZZ>,ZM>,MA=,AM<,MB>,BL<,LZ< + assert results == [2, 3, 6] + + +def test_scan_string_smaller_than_includes_low_bytes(): + data = b"AAMAZZ" + results = _scan_str(data, b"MA", 2, ScanTypesEnum.SMALLER_THAN) + # AA<,AM<,MA=,AZ<,ZZ> -> offsets 0,1,3 are smaller. + assert results == [0, 1, 3] + + +def test_scan_string_value_between_skips_noise(): + # Only windows whose value lands in ["EA","WZ"] inclusive should match. 
+ data = b"AB" + b"EM" + b"ZZ" + b"WZ" + b" " + results = _scan_str(data, (b"EA", b"WZ"), 2, ScanTypesEnum.VALUE_BETWEEN) + # offsets: 0 AB(no) 1 BE(no) 2 EM(yes) 3 MZ(yes) 4 ZZ(no) 5 ZW(no) 6 WZ(yes) + # 7 Z?(no) 8 ' '..(no) + assert results == [2, 3, 6] + + +def test_scan_string_value_between_reversed_range_is_empty(): + """Regression: a reversed range (start > end) must yield nothing, not crash. + + The fast path builds a regex class ``[start_byte-end_byte]``; a reversed + range would compile to ``[hi-lo]`` and raise ``re.error: bad character + range``. The byte-by-byte loop returns [] for start > end, so the fast path + must too. + """ + data = b"MMMMMM" + assert _scan_str(data, (b"ZZ", b"AA"), 2, ScanTypesEnum.VALUE_BETWEEN) == [] + # Reversed but sharing a first byte still resolves to empty. + assert _scan_str(data, (b"MZ", b"MA"), 2, ScanTypesEnum.VALUE_BETWEEN) == [] + + +def test_scan_string_regex_special_bytes_as_bounds(): + """Bytes that are special inside a regex class (]^-\\[) must be literal.""" + data = bytes([0x5D, 0x5E, 0x2D, 0x5C, 0x5B, 0x41, 0xFF]) # ] ^ - \\ [ A 0xff + # 1-byte windows, BIGGER_THAN_OR_EXACT_VALUE with a regex-special bound: + # every byte in data is >= '-' (0x2D), so all seven offsets match. + results = _scan_str(data, b"\x2d", 1, ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE) + assert results == [0, 1, 2, 3, 4, 5, 6] + # SMALLER_THAN ']' (0x5D): bytes < 0x5D are + # '-'(0x2d=off2), '\\'(0x5c=off3), '['(0x5b=off4), 'A'(0x41=off5). 
+ results = _scan_str(data, b"\x5d", 1, ScanTypesEnum.SMALLER_THAN) + assert results == [2, 3, 4, 5] diff --git a/tests/test_scan_properties.py b/tests/test_scan_properties.py index 1cecfe7..ab59ac4 100644 --- a/tests/test_scan_properties.py +++ b/tests/test_scan_properties.py @@ -68,6 +68,98 @@ def _int_payload(draw): return size, b"".join(struct.pack(fmt, v) for v in values), struct.pack(fmt, target) +# Ordered string comparisons that scan_memory routes through the regex +# byte-class fast path (NOT_* are dense and keep the byte-by-byte loop). +_ORDERED_STRING_SCAN_TYPES = ( + ScanTypesEnum.BIGGER_THAN, + ScanTypesEnum.SMALLER_THAN, + ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE, + ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE, +) + +# Bias the byte alphabet toward the values most likely to trip the fast path: +# the lexicographic extremes (0x00 / 0xff), boundary ties, and the bytes that +# are special inside a regex character class ([ ] ^ - \ etc.). +_TRICKY_BYTES = st.sampled_from([0x00, 0xFF] + list(b"[]^-\\&~|ABC")) +_ANY_BYTE = st.integers(min_value=0, max_value=255) + + +@st.composite +def _string_payload(draw): + size = draw(st.integers(min_value=1, max_value=8)) + count = draw(st.integers(min_value=0, max_value=40)) + byte = st.one_of(_TRICKY_BYTES, _ANY_BYTE) + data = bytes(draw(st.lists(byte, min_size=count, max_size=count))) + target = bytes(draw(st.lists(byte, min_size=size, max_size=size))) + return size, data, target + + +def _scan_string_slow(data, size, target_value, scan_type): + """Reference: byte-by-byte big-endian string scan (the pre-fast-path loop).""" + end = len(data) - size + 1 + results = [] + if isinstance(target_value, tuple): + lo = int.from_bytes(target_value[0], "big") + hi = int.from_bytes(target_value[1], "big") + else: + target = int.from_bytes(target_value, "big") + for offset in range(0, max(end, 0)): + value = int.from_bytes(data[offset : offset + size], "big") + if scan_type is ScanTypesEnum.BIGGER_THAN and value > target: + 
results.append(offset) + elif scan_type is ScanTypesEnum.SMALLER_THAN and value < target: + results.append(offset) + elif scan_type is ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE and value >= target: + results.append(offset) + elif scan_type is ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE and value <= target: + results.append(offset) + elif scan_type is ScanTypesEnum.VALUE_BETWEEN and lo <= value <= hi: + results.append(offset) + return results + + +@settings( + suppress_health_check=[HealthCheck.too_slow], + deadline=None, + max_examples=300, +) +@given( + payload=_string_payload(), + scan_type=st.sampled_from(_ORDERED_STRING_SCAN_TYPES), +) +def test_ordered_string_scan_matches_reference(payload, scan_type): + """Regex byte-class fast path must agree with the byte-by-byte reference. + + Strings step by one byte and compare big-endian, so the fast path uses a + first-byte prefilter; this checks it yields exactly the same offsets across + boundary ties and regex-special bytes. + """ + size, data, target = payload + fast = list(scan_memory(data, len(data), target, size, scan_type, str)) + slow = _scan_string_slow(data, size, target, scan_type) + assert fast == slow + + +@settings( + suppress_health_check=[HealthCheck.too_slow], + deadline=None, + max_examples=300, +) +@given(payload=_string_payload()) +def test_value_between_string_matches_reference(payload): + """VALUE_BETWEEN over strings (the search_by_value_between path) must match.""" + size, data, a = payload + # Build a valid [lo, hi] range from two same-width byte strings. + b = bytes((x + 1) % 256 for x in a) + lo, hi = (a, b) if a <= b else (b, a) + target = (lo, hi) + fast = list( + scan_memory(data, len(data), target, size, ScanTypesEnum.VALUE_BETWEEN, str) + ) + slow = _scan_string_slow(data, size, target, ScanTypesEnum.VALUE_BETWEEN) + assert fast == slow + + @st.composite def _float_payload(draw): size = draw(st.sampled_from(_FLOAT_SIZES))