Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions PyMemoryEditor/util/scan.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

import re
import struct
import sys
from bisect import bisect_left
Expand Down Expand Up @@ -206,6 +207,57 @@ def scan_memory_for_exact_value(
yield offset


def _scan_string_ordered(
data: bytes,
end: int,
target_value_size: int,
lo_byte: int,
hi_byte: int,
boundary_bytes: frozenset,
predicate,
) -> Generator[int, None, None]:
"""C-accelerated string scan for ordered comparisons (>, <, >=, <=, between).

Strings compare big-endian, so a fixed-width window can only satisfy an
ordered comparison when its *first* byte lies in ``[lo_byte, hi_byte]``.
Those candidate positions are located with a regex byte class whose C engine
skips the long NUL runs of reserved/zeroed memory orders of magnitude faster
than a per-offset Python loop — the same idea that makes the EXACT path's
``bytes.find`` fast. A candidate is accepted outright unless its first byte
*ties* a comparison bound (``boundary_bytes``), in which case the full window
is decoded and checked with ``predicate``. Yields offsets in ascending order
(``re.finditer`` walks left to right), identical to the byte-by-byte loop.

``lo_byte > hi_byte`` denotes an empty candidate range — it arises for a
reversed VALUE_BETWEEN (``start > end``), where the byte-by-byte loop's
``start <= v <= end`` also matches nothing. Return empty rather than let
``re`` raise "bad character range" on a ``[hi-lo]`` class.
"""
if end <= 0 or lo_byte > hi_byte:
return

# re.escape keeps every byte (incl. class-specials like ] ^ - [ and NUL)
# literal inside the class; the unescaped `-` between them is the range op.
# Compile with NO flags: on a bytes pattern `[lo-hi]` is the exact inclusive
# ordinal range. In particular do NOT pass re.IGNORECASE — it folds ASCII
# case *inside* a class, so a range overlapping A-Z/a-z would match the
# opposite case too and silently return non-matching offsets.
matcher = re.compile(
b"[" + re.escape(bytes((lo_byte,))) + b"-" + re.escape(bytes((hi_byte,))) + b"]"
)

for match in matcher.finditer(data):
offset = match.start()
if offset >= end:
break
if data[offset] in boundary_bytes:
value = int.from_bytes(data[offset : offset + target_value_size], "big")
if predicate(value):
yield offset
else:
yield offset


def scan_memory(
memory_region_data: Sequence,
memory_region_data_size: int,
Expand Down Expand Up @@ -236,14 +288,25 @@ def scan_memory(
# narrowing for the downstream int.from_bytes / struct.unpack calls.
byte_order: _ByteOrder = cast(_ByteOrder, "big" if is_string else sys.byteorder)

# First byte of each target, used by the string fast path below to build the
# candidate-byte regex class. Captured here where `target_value`'s type is
# narrowed (tuple vs bytes); `None` means "empty target, no fast path".
first_byte: Optional[int]
start_first_byte: Optional[int]
end_first_byte: Optional[int]
if isinstance(target_value, tuple):
start_target_value = _decode_target(target_value[0], byte_order, pytype)
end_target_value = _decode_target(target_value[1], byte_order, pytype)
target_value_decoded: Union[int, float] = 0
first_byte = None
start_first_byte = target_value[0][0] if target_value[0] else None
end_first_byte = target_value[1][0] if target_value[1] else None
else:
target_value_decoded = _decode_target(target_value, byte_order, pytype)
start_target_value = 0
end_target_value = 0
first_byte = target_value[0] if target_value else None
start_first_byte = end_first_byte = None

fmt = None if is_string else _struct_format(byte_order, target_value_size, pytype)

Expand Down Expand Up @@ -334,6 +397,37 @@ def scan_memory(
int_from_bytes = int.from_bytes
signed = pytype is int

# Fast path for ordered string comparisons. Strings compare big-endian, so a
# window can only match when its first byte falls in a known range; a regex
# byte-class prefilter finds those candidates in C, skipping the huge NUL
# runs of reserved memory instead of stepping every byte in Python. Numerics
# with unusual sizes (3/6/7) decode little-endian and fall through unchanged.
if is_string:
spec = None
if first_byte is not None and scan_type is ScanTypesEnum.BIGGER_THAN:
spec = (first_byte, 0xFF, frozenset((first_byte,)),
lambda v: v > target_value_decoded)
elif first_byte is not None and scan_type is ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE:
spec = (first_byte, 0xFF, frozenset((first_byte,)),
lambda v: v >= target_value_decoded)
elif first_byte is not None and scan_type is ScanTypesEnum.SMALLER_THAN:
spec = (0x00, first_byte, frozenset((first_byte,)),
lambda v: v < target_value_decoded)
elif first_byte is not None and scan_type is ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE:
spec = (0x00, first_byte, frozenset((first_byte,)),
lambda v: v <= target_value_decoded)
elif (
scan_type is ScanTypesEnum.VALUE_BETWEEN
and start_first_byte is not None
and end_first_byte is not None
):
spec = (start_first_byte, end_first_byte,
frozenset((start_first_byte, end_first_byte)),
lambda v: start_target_value <= v <= end_target_value)
if spec is not None:
yield from _scan_string_ordered(data, end, target_value_size, *spec)
return

if scan_type is ScanTypesEnum.EXACT_VALUE:
for offset in range(0, end, step):
value = int_from_bytes(
Expand Down
25 changes: 17 additions & 8 deletions docs/guide/searching.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,13 @@ missing.
## Scan acceleration (the `speed` extra)

By default every scan runs in pure Python, with the hottest paths already
delegated to C primitives (`bytes.find` for exact matches, `struct.iter_unpack`
to decode a region). What stays in Python is the per-value **comparison loop**
of the ordered scans (`BIGGER_THAN`, `SMALLER_THAN`, `VALUE_BETWEEN`, …): for a
multi-megabyte region it boxes and compares millions of values one at a time.
delegated to C primitives: `bytes.find` for exact matches, `struct.iter_unpack`
to decode a region, and a **regex byte-class prefilter** for ordered *string*
comparisons (`BIGGER_THAN` / `SMALLER_THAN` / `VALUE_BETWEEN` on `str`), which
skips the long runs of non-matching bytes in C instead of stepping every offset.
What stays in Python is the per-value **comparison loop** of the ordered
*numeric* scans: for a multi-megabyte region it boxes and compares millions of
values one at a time.

Installing the optional [`speed`](../installation.md#install-with-scan-acceleration-speed)
extra replaces that loop with a single vectorized NumPy comparison:
Expand Down Expand Up @@ -241,7 +244,8 @@ emitting matches.
<tr><th>Scenario</th><th>Typical speedup</th></tr>
<tr><td>Selective scan of a large region (few matches — the usual first scan / refine step)</td><td><b>10–60×</b></td></tr>
<tr><td>Scan where most values match (e.g. <code>&gt; 0</code> on mostly-positive data)</td><td>~2× (result building dominates)</td></tr>
<tr><td><code>str</code> / <code>bytes</code> scans, or unusual widths (3/6/7 bytes)</td><td>no change (no NumPy fast path; pure-Python loop)</td></tr>
<tr><td><code>str</code> ordered scans (<code>&gt;</code>, <code>&lt;</code>, <code>between</code>)</td><td>no NumPy fast path — instead C-accelerated by the regex byte-class prefilter (independent of the <code>speed</code> extra)</td></tr>
<tr><td><code>bytes</code> scans, or unusual widths (3/6/7 bytes)</td><td>no change (no NumPy fast path; pure-Python loop)</td></tr>
<tr><td><code>EXACT_VALUE</code> via <code>search_by_value</code></td><td>already <code>bytes.find</code> in C — NumPy not used</td></tr>
</table>

Expand All @@ -262,9 +266,14 @@ for address in process.search_by_value(str, 6, "PLAYER"):
print(hex(address))
```

For `bytes`, comparison ordering depends on your system's `byteorder` —
something to keep in mind when using `BIGGER_THAN` / `SMALLER_THAN` on raw
bytes.
Ordering for the comparison modes differs by type:

- **`str`** compares the UTF-8 bytes **lexicographically** (big-endian), so
`"AA" < "AB" < "B"`. A value shorter than `bufflength` is NUL-padded to
`bufflength` before comparing, and a reversed `VALUE_BETWEEN` range
(`start > end`) simply matches nothing.
- **`bytes`** compares using your system's `byteorder` — something to keep in
mind when using `BIGGER_THAN` / `SMALLER_THAN` on raw bytes.

```{seealso}
- [Pattern scan](pattern-scan.md) — find data by **shape** with regex and AOB
Expand Down
61 changes: 61 additions & 0 deletions tests/test_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,64 @@ def test_scan_memory_double_bigger_than_negative():

# -1.0 (offset 8), 1.0 (16), 3.0 (24) match; -3.0 (offset 0) does not.
assert results == [8, 16, 24]


# --- String ordered-comparison fast path (regex byte-class prefilter) ---------
#
# These exercise the prefilter directly with hand-checked expected offsets, on
# top of the property-based equivalence tests in test_scan_properties.py.


def _scan_str(data, target, size, scan_type):
    """Run a ``str``-typed ``scan_memory`` over all of *data* and return the offsets as a list."""
    offsets = scan_memory(data, len(data), target, size, scan_type, str)
    return list(offsets)


def test_scan_string_bigger_than_first_byte_dominates():
    """BIGGER_THAN over 2-byte windows keeps only windows strictly above "MA"."""
    # Windows slide one byte at a time and compare big-endian against
    # "MA" (0x4D41):
    #   0 AA<  1 AZ<  2 ZZ>  3 ZM>  4 MA=  5 AM<  6 MB>  7 BL<  8 LZ<
    matches = _scan_str(b"AAZZMAMBLZ", b"MA", 2, ScanTypesEnum.BIGGER_THAN)
    assert matches == [2, 3, 6]


def test_scan_string_smaller_than_includes_low_bytes():
    """SMALLER_THAN over 2-byte windows keeps only windows strictly below "MA"."""
    # Windows: 0 AA<  1 AM<  2 MA=  3 AZ<  4 ZZ>  -> offsets 0, 1 and 3 qualify.
    matches = _scan_str(b"AAMAZZ", b"MA", 2, ScanTypesEnum.SMALLER_THAN)
    assert matches == [0, 1, 3]


def test_scan_string_value_between_skips_noise():
    """VALUE_BETWEEN keeps only windows inside the inclusive range ["EA", "WZ"]."""
    buffer = b"".join((b"AB", b"EM", b"ZZ", b"WZ", b" "))
    bounds = (b"EA", b"WZ")
    matches = _scan_str(buffer, bounds, 2, ScanTypesEnum.VALUE_BETWEEN)
    # offsets: 0 AB(no) 1 BE(no) 2 EM(yes) 3 MZ(yes) 4 ZZ(no) 5 ZW(no) 6 WZ(yes);
    # anything after that starts with 'Z' or a space and is out of range.
    assert matches == [2, 3, 6]


def test_scan_string_value_between_reversed_range_is_empty():
    """Regression: a reversed range (start > end) must yield nothing, not crash.

    The fast path builds a regex class ``[start_byte-end_byte]``; with the
    bounds reversed that would be ``[hi-lo]`` and raise ``re.error: bad
    character range``. The byte-by-byte loop returns [] for start > end, so
    the fast path has to as well.
    """
    buffer = b"MMMMMM"
    reversed_ranges = (
        (b"ZZ", b"AA"),
        # Reversed but sharing a first byte: must still resolve to empty.
        (b"MZ", b"MA"),
    )
    for bounds in reversed_ranges:
        assert _scan_str(buffer, bounds, 2, ScanTypesEnum.VALUE_BETWEEN) == []


def test_scan_string_regex_special_bytes_as_bounds():
    """Bytes with special meaning inside a regex class must be treated literally.

    Covers ``]``, ``^``, ``-``, backslash and ``[`` as both data and bounds.
    """
    # offsets:      0 ']'  1 '^'  2 '-'  3 '\'  4 '['  5 'A'  6 0xff
    buffer = bytes((0x5D, 0x5E, 0x2D, 0x5C, 0x5B, 0x41, 0xFF))

    # 1-byte windows >= '-' (0x2D): no byte in the buffer is below 0x2D, so
    # every offset matches.
    at_least_dash = _scan_str(
        buffer, b"\x2d", 1, ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE
    )
    assert at_least_dash == [0, 1, 2, 3, 4, 5, 6]

    # 1-byte windows < ']' (0x5D): '-' (0x2d, offset 2), backslash (0x5c,
    # offset 3), '[' (0x5b, offset 4) and 'A' (0x41, offset 5).
    below_bracket = _scan_str(buffer, b"\x5d", 1, ScanTypesEnum.SMALLER_THAN)
    assert below_bracket == [2, 3, 4, 5]
92 changes: 92 additions & 0 deletions tests/test_scan_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,98 @@ def _int_payload(draw):
return size, b"".join(struct.pack(fmt, v) for v in values), struct.pack(fmt, target)


# Ordered string comparisons that scan_memory routes through the regex
# byte-class fast path (NOT_* are dense and keep the byte-by-byte loop).
# VALUE_BETWEEN is not listed: it takes a (start, end) tuple target and is
# covered by test_value_between_string_matches_reference instead.
_ORDERED_STRING_SCAN_TYPES = (
    ScanTypesEnum.BIGGER_THAN,
    ScanTypesEnum.SMALLER_THAN,
    ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE,
    ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE,
)

# Bias the byte alphabet toward the values most likely to trip the fast path:
# the lexicographic extremes (0x00 / 0xff), boundary ties, and the bytes that
# are special inside a regex character class ([ ] ^ - \ etc.).
_TRICKY_BYTES = st.sampled_from([0x00, 0xFF] + list(b"[]^-\\&~|ABC"))
# Any byte value 0-255; combined with _TRICKY_BYTES via st.one_of in
# _string_payload so the full range stays reachable.
_ANY_BYTE = st.integers(min_value=0, max_value=255)


@st.composite
def _string_payload(draw):
    """Draw ``(size, data, target)`` for a string scan.

    ``size`` is the window width (1-8), ``data`` is a buffer of 0-40 bytes and
    ``target`` is exactly ``size`` bytes. Bytes come from the tricky alphabet
    or from the full 0-255 range.
    """
    byte = st.one_of(_TRICKY_BYTES, _ANY_BYTE)

    def exactly(length):
        # Fixed-length byte string drawn from the mixed alphabet.
        return bytes(draw(st.lists(byte, min_size=length, max_size=length)))

    # Draw order (size, count, data, target) is kept stable on purpose.
    size = draw(st.integers(min_value=1, max_value=8))
    count = draw(st.integers(min_value=0, max_value=40))
    data = exactly(count)
    target = exactly(size)
    return size, data, target


def _scan_string_slow(data, size, target_value, scan_type):
"""Reference: byte-by-byte big-endian string scan (the pre-fast-path loop)."""
end = len(data) - size + 1
results = []
if isinstance(target_value, tuple):
lo = int.from_bytes(target_value[0], "big")
hi = int.from_bytes(target_value[1], "big")
else:
target = int.from_bytes(target_value, "big")
for offset in range(0, max(end, 0)):
value = int.from_bytes(data[offset : offset + size], "big")
if scan_type is ScanTypesEnum.BIGGER_THAN and value > target:
results.append(offset)
elif scan_type is ScanTypesEnum.SMALLER_THAN and value < target:
results.append(offset)
elif scan_type is ScanTypesEnum.BIGGER_THAN_OR_EXACT_VALUE and value >= target:
results.append(offset)
elif scan_type is ScanTypesEnum.SMALLER_THAN_OR_EXACT_VALUE and value <= target:
results.append(offset)
elif scan_type is ScanTypesEnum.VALUE_BETWEEN and lo <= value <= hi:
results.append(offset)
return results


@settings(
    max_examples=300,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow],
)
@given(
    payload=_string_payload(),
    scan_type=st.sampled_from(_ORDERED_STRING_SCAN_TYPES),
)
def test_ordered_string_scan_matches_reference(payload, scan_type):
    """The regex byte-class fast path must agree with the byte-by-byte reference.

    Strings step one byte at a time and compare big-endian, so the fast path
    prefilters on the first byte. This checks that it produces exactly the
    reference offsets, including on boundary ties and regex-special bytes.
    """
    size, data, target = payload
    expected = _scan_string_slow(data, size, target, scan_type)
    actual = list(scan_memory(data, len(data), target, size, scan_type, str))
    assert actual == expected


@settings(
    max_examples=300,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow],
)
@given(payload=_string_payload())
def test_value_between_string_matches_reference(payload):
    """VALUE_BETWEEN over strings (the search_by_value_between path) must match."""
    size, data, seed = payload
    # Derive a second same-width bound by bumping every byte (wrapping at 256),
    # then order the pair so it forms a valid [lo, hi] range.
    shifted = bytes((byte + 1) % 256 for byte in seed)
    target = tuple(sorted((seed, shifted)))

    expected = _scan_string_slow(data, size, target, ScanTypesEnum.VALUE_BETWEEN)
    actual = list(
        scan_memory(data, len(data), target, size, ScanTypesEnum.VALUE_BETWEEN, str)
    )
    assert actual == expected


@st.composite
def _float_payload(draw):
size = draw(st.sampled_from(_FLOAT_SIZES))
Expand Down
Loading