diff --git a/Cargo.toml b/Cargo.toml index 312f46d..7547f1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", + "crates/hash-sorted-map/benchmarks", ] resolver = "2" diff --git a/crates/bpe/benchmarks/equivalence.rs b/crates/bpe/benchmarks/equivalence.rs index 4019602..d325dbf 100644 --- a/crates/bpe/benchmarks/equivalence.rs +++ b/crates/bpe/benchmarks/equivalence.rs @@ -30,7 +30,7 @@ fn test_compare_dictionary() { hugging_tokens.remove(added_token); } let mut hugging_tokens: Vec<_> = hugging_tokens.into_iter().collect(); - hugging_tokens.sort_by(|(_, a), (_, b)| a.cmp(b)); + hugging_tokens.sort_by_key(|(_, a)| *a); let hugging_tokens: Vec<_> = hugging_tokens .into_iter() .map(|(token, _)| token.chars().map(char_to_byte).collect()) diff --git a/crates/hash-sorted-map/Cargo.toml b/crates/hash-sorted-map/Cargo.toml new file mode 100644 index 0000000..84ffa02 --- /dev/null +++ b/crates/hash-sorted-map/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "hash-sorted-map" +authors = ["The blackbird team "] +version = "0.1.0" +edition = "2021" +description = "A hash map with hash-ordered iteration and linear-time merge, designed for search-index term maps." +repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["hashmap", "sorted", "merge", "simd"] +categories = ["algorithms", "data-structures"] diff --git a/crates/hash-sorted-map/OPTIMIZATIONS.md b/crates/hash-sorted-map/OPTIMIZATIONS.md new file mode 100644 index 0000000..0b04520 --- /dev/null +++ b/crates/hash-sorted-map/OPTIMIZATIONS.md @@ -0,0 +1,171 @@ +# HashSortedMap vs. Rust Swiss Table (hashbrown): Optimization Analysis + +## Executive Summary + +`HashSortedMap` is a Swiss-table-inspired hash map that uses **overflow +chaining** (instead of open addressing), **SIMD group scanning** (NEON/SSE2), +a **slot-hint fast path**, and an **optimized growth strategy**. It is generic +over key type, value type, and hash builder. + +This document analyzes the design trade-offs versus +[hashbrown](https://github.com/rust-lang/hashbrown) and records the +experimental results that guided the current design. + +--- + +## Architecture Comparison + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ hashbrown Swiss Table │ +│ │ +│ Single contiguous allocation (SoA): │ +│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... CT_n] [CT_extra] │ +│ data control bytes (mirrored) │ +│ │ +│ • Open addressing, triangular probing │ +│ • 16-byte groups (SSE2) or 8-byte groups (NEON/generic) │ +│ • EMPTY / DELETED / FULL tag states │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ HashSortedMap │ +│ │ +│ Vec> where each Group (AoS): │ +│ { ctrl: [u8; 8], keys: [MaybeUninit; 8], │ +│ values: [MaybeUninit; 8], overflow: u32 } │ +│ │ +│ • Overflow chaining (linked groups) │ +│ • 8-byte groups with NEON/SSE2/scalar SIMD scan │ +│ • EMPTY / FULL tag states only (insertion-only, no deletion) │ +│ • Slot-hint fast path │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Optimizations Investigated + +### 1. 
SIMD Group Scanning ✅ Implemented
+
+Platform-specific SIMD for control byte matching:
+- **aarch64**: NEON `vceq_u8` + `vreinterpret_u64_u8` (8-byte groups)
+- **x86_64**: SSE2 `_mm_cmpeq_epi8` + `_mm_movemask_epi8` (16-byte groups)
+- **Fallback**: Scalar u64 zero-byte detection trick
+
+**Benchmark result**: ~5% faster than scalar on Apple M-series. The gain is
+modest because the slot-hint fast path often skips the group scan entirely.
+
+### 2. Open Addressing with Triangular Probing ❌ Rejected
+
+Open addressing is not really an option for this hash map: probed entries land outside their home group, which breaks hash-ordered iteration and therefore efficient sorting.
+Additionally, we observed no performance improvement compared to the linked overflow-buffer approach.
+The biggest benefit of triangular probing is that it supports a much higher load factor, i.e. lower memory consumption — which is not our main concern here.
+
+**Benchmark result**: **40% slower** than overflow chaining. With the AoS
+layout, each group is ~112 bytes, so probing to the next group jumps over
+large memory regions. Overflow chaining with the slot-hint fast path is
+faster because most inserts land in the first group.
+
+### 3. SoA Memory Layout ❌ Rejected
+
+Tested a SoA variant (`SoaHashSortedMap`) with separate control byte and
+key/value arrays, combined with triangular probing.
+
+**Benchmark result**: **Slowest variant** — even slower than AoS open
+addressing. The two-Vec SoA layout doubles TLB/cache pressure versus
+hashbrown's single-allocation layout. Without the single-allocation trick,
+SoA is worse than AoS for this use case.
+
+### 4. Capacity Sizing ✅ Implemented
+
+Without correct sizing, even a pre-sized map always paid the penalty of a grow operation.
+
+**Fix**: Changed to ~70% max load factor. This was the **single biggest improvement** — HashSortedMap went from 2× slower to matching hashbrown.
+
+### 5. Optimized Growth ✅ Implemented
+
+The original `grow()` called the full `insert()` for each element (including
+duplicate checking and overflow traversal). hashbrown uses:
+- `find_insert_index` (skip duplicate check)
+- `ptr::copy_nonoverlapping` (raw memory copy)
+- Bulk counter updates
+
+**Fix**: Added `insert_for_grow()` that skips duplicate checking, uses raw
+pointer copies, and iterates occupied slots via bitmask.
+
+**Benchmark result**: Growth is now **2× faster** than hashbrown (4.8 µs vs
+9.8 µs for 3 resize rounds).
+
+### 6. Branch Prediction Hints ⚠️ Mixed Results
+
+Added `likely()`/`unlikely()` annotations and `#[cold] #[inline(never)]` on
+the overflow path.
+
+**Benchmark result**: Helped the scalar version (~2–6% faster) but **hurt the
+SIMD version** by pessimizing NEON code generation. Removed from the SIMD
+implementation, kept in the scalar version.
+
+### 7. Slot Hint Fast Path (Unique to HashSortedMap)
+
+HashSortedMap checks a preferred slot before scanning the group:
+```rust
+let hint = slot_hint(hash); // 3–4 bits of the hash → slot index (8- or 16-slot groups)
+if ctrl[hint] == EMPTY { /* direct insert */ }
+if ctrl[hint] == tag && keys[hint] == key { /* direct hit */ }
+```
+
+hashbrown does **not** have this optimization — it always does a full SIMD
+group scan. The performance difference is probably explained by the different overflow strategies and load factors.
+
+### 8. 
Overflow Reserve Sizing ✅ Validated + +Tested overflow reserves from 0% to 100% of primary groups: + +| Reserve | Growth scenario (µs) | +|---------|----------------------| +| m/8 (12.5%, default) | 8.04 | +| m/4 (25%) | 8.33 | +| m/2 (50%) | 8.93 | +| m/1 (100%) | 10.31 | +| 0 (grow immediately) | 6.96 | + +**Conclusion**: Smaller reserves are faster — growing early is cheaper than +traversing overflow chains. + +### 9. IdentityHasher Fix ✅ Implemented + +The original `IdentityHasher` zero-extended u32 to u64, putting zeros in the +top 32 bits. Since hashbrown derives the 7-bit tag from `hash >> 57`, every +entry got the same tag — completely defeating control byte filtering. + +**Fix**: Use `folded_multiply` to expand u32 keys to u64 with independent +entropy in both halves. Also changed trigram generation to use +`folded_multiply` instead of murmur3. + +--- + +## Optimizations Not Implemented (and Why) + +| Optimization | Reason | +|---------------------------------|------------------------------------------| +| **Tombstone / DELETED support** | Insertion-only map — no deletions needed | +| **In-place rehashing** | No tombstones to reclaim | +| **Control byte mirroring** | Not needed with overflow chaining (no wrap-around) | +| **Custom allocator support** | Out of scope for benchmarking | +| **Over-allocation utilization** | Uses `Vec` (no raw allocator control) | + +--- + +## Summary of Impact + +| Change | Effect on insert time | +|----------------------------|------------------------------| +| Capacity sizing fix | **−50%** (biggest win) | +| Optimized growth path | **−10%** on growth scenarios | +| SIMD group scanning | **−5%** | +| Branch hints (scalar only) | **−2–6%** | +| IdentityHasher fix | Enabled fair comparison | + +The current HashSortedMap **matches hashbrown+FxHash** on pre-sized inserts, +**beats all hashbrown variants** on overwrites, and has **2× faster growth**. diff --git a/crates/hash-sorted-map/README.md b/crates/hash-sorted-map/README.md new file mode 100644 index 0000000..ebd5ef6 --- /dev/null +++ b/crates/hash-sorted-map/README.md @@ -0,0 +1,89 @@ +# hash-sorted-map + +A hash map whose groups are ordered by hash prefix, enabling efficient +sorted-order iteration and linear-time merging of two maps. + +## Motivation + +In a search index, each document produces a **term map** (term → frequency). +At index time, term maps from many documents must be **merged** into a single +posting list, and the result is **serialized in hash-key order** so that +lookups can use a skip-list approach, leveraging the hash ordering to +efficiently jump to the right region of the serialized data. + +A conventional hash map stores entries in arbitrary order, so merging two maps +requires collecting, sorting, and reshuffling all entries — an expensive step +that dominates indexing time for large term maps typical of code search, where +documents contain massive numbers of tokens. + +`HashSortedMap` avoids this by organizing its groups by hash prefix. +Iterating through the groups in order yields entries sorted by their hashed +keys, which means: + +- **Merging** two maps is a single linear scan (like merge-sort's merge step). +- **Serialization** in hash-key order requires no extra sorting or copying. + +## Design + +`HashSortedMap` is a Swiss-table-inspired hash map that uses: + +- **Overflow chaining** instead of open addressing — groups that fill up link + to overflow groups rather than probing into neighbours. 
+- **Slot hint** — a preferred slot index derived from the hash, checked before + scanning the group. Gives a direct hit on most inserts at low load. +- **SIMD group scanning** — uses NEON on aarch64, SSE2 on x86\_64, and a + scalar fallback elsewhere to scan 8–16 control bytes in parallel. +- **AoS group layout** — each group stores its control bytes, keys, and values + together, keeping a single insert's data within 1–2 cache lines. +- **Optimized growth** — during resize, elements are re-inserted without + duplicate checking and copied via raw pointers. +- **Generic key/value/hasher** — supports any `K: Hash + Eq`, any + `S: BuildHasher`, and `Borrow`-based lookups. + +## Benchmark results + +All benchmarks insert 1000 random trigram hashes (scrambled with +`folded_multiply`) into maps with various configurations. Measured on Apple +M-series (aarch64). + +### Insert 1000 trigrams — pre-sized, no growth + +| Rank | Map | Time (µs) | vs best | +|------|-----|-----------|---------| +| 🥇 | FoldHashMap | 2.44 | — | +| 🥈 | FxHashMap | 2.61 | +7% | +| 🥉 | hashbrown::HashMap | 2.67 | +9% | +| 4 | **HashSortedMap** | **2.71** | +11% | +| 5 | hashbrown+Identity | 2.74 | +12% | +| 6 | std::HashMap+FNV | 3.27 | +34% | +| 7 | AHashMap | 3.22 | +32% | +| 8 | std::HashMap | 8.49 | +248% | + +### Re-insert same keys (all overwrites) + +| Map | Time (µs) | +|-----|-----------| +| **HashSortedMap** | **2.36** ✅ | +| hashbrown+Identity | 2.58 | + +### Growth from small (`with_capacity(128)`, 3 resize rounds) + +| Map | Time (µs) | Growth penalty | +|-----|-----------|----------------| +| **HashSortedMap** | **4.85** | +2.14 | +| hashbrown+Identity | 9.77 | +7.03 | + +### Key takeaways + +- **HashSortedMap matches the fastest hashbrown configurations** on pre-sized + first-time inserts and is **the fastest for overwrites**. +- **Growth is ~2× faster** than hashbrown thanks to the optimized + `insert_for_grow` path that skips duplicate checking and uses raw copies. +- The remaining gap to FoldHashMap (~11%) comes from foldhash's extremely + efficient hash function that pipelines well with hashbrown's SIMD scan. + +## Running + +```sh +cargo bench --bench hashmap_insert +``` diff --git a/crates/hash-sorted-map/benchmarks/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml new file mode 100644 index 0000000..9ee37dc --- /dev/null +++ b/crates/hash-sorted-map/benchmarks/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "hash-sorted-map-benchmarks" +edition = "2021" + +[lib] +path = "lib.rs" +test = false + +[[bench]] +name = "performance" +path = "performance.rs" +harness = false +test = false + +[dependencies] +hash-sorted-map = { path = ".." } +criterion = "0.8" +rand = "0.10" +rustc-hash = "2" +ahash = "0.8" +hashbrown = "0.15" +foldhash = "0.1" +fnv = "1" diff --git a/crates/hash-sorted-map/benchmarks/lib.rs b/crates/hash-sorted-map/benchmarks/lib.rs new file mode 100644 index 0000000..b80c3e4 --- /dev/null +++ b/crates/hash-sorted-map/benchmarks/lib.rs @@ -0,0 +1,46 @@ +use std::hash::{BuildHasherDefault, Hasher}; + +use rand::RngExt; + +const ARBITRARY0: u64 = 0x243f6a8885a308d3; + +/// Folded multiply: full u64×u64→u128, then XOR the two halves. +#[inline(always)] +pub fn folded_multiply(x: u64, y: u64) -> u64 { + let full = (x as u128).wrapping_mul(y as u128); + (full as u64) ^ ((full >> 64) as u64) +} + +/// A hasher that passes through u32 keys without hashing, suitable for +/// keys that are already well-distributed. 
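+///
+/// `write_u32` duplicates the 32-bit key into both halves of the 64-bit hash, so
+/// that `HashSortedMap`'s group index (taken from the top bits) and its control-byte
+/// tag (taken from the low bits) both see real key entropy. Illustrative round
+/// trip (values chosen for the example):
+///
+/// ```
+/// use std::hash::Hasher;
+/// use hash_sorted_map_benchmarks::IdentityHasher;
+///
+/// let mut h = IdentityHasher::default();
+/// h.write_u32(0xDEAD_BEEF);
+/// assert_eq!(h.finish(), 0xDEAD_BEEF_DEAD_BEEF);
+/// ```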
+#[derive(Default)] +pub struct IdentityHasher(u64); + +impl Hasher for IdentityHasher { + fn write(&mut self, _bytes: &[u8]) { + unimplemented!("IdentityHasher only supports write_u32"); + } + fn write_u32(&mut self, i: u32) { + self.0 = (i as u64) | ((i as u64) << 32); + } + fn finish(&self) -> u64 { + self.0 + } +} + +pub type IdentityBuildHasher = BuildHasherDefault; + +/// Generate `n` random trigrams as well-distributed u32 hashes. +/// Each trigram is packed into a u32, then scrambled with folded_multiply. +pub fn random_trigram_hashes(n: usize) -> Vec { + let mut rng = rand::rng(); + (0..n) + .map(|_| { + let a = rng.random_range(b'a'..=b'z') as u32; + let b = rng.random_range(b'a'..=b'z') as u32; + let c = rng.random_range(b'a'..=b'z') as u32; + let packed = a | (b << 8) | (c << 16); + folded_multiply(packed as u64, ARBITRARY0) as u32 + }) + .collect() +} diff --git a/crates/hash-sorted-map/benchmarks/performance.rs b/crates/hash-sorted-map/benchmarks/performance.rs new file mode 100644 index 0000000..5a04801 --- /dev/null +++ b/crates/hash-sorted-map/benchmarks/performance.rs @@ -0,0 +1,301 @@ +use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use hash_sorted_map::HashSortedMap; +use hash_sorted_map_benchmarks::{random_trigram_hashes, IdentityBuildHasher}; + +fn trigrams() -> Vec { + random_trigram_hashes(1000) +} + +fn bench_insert(c: &mut Criterion) { + let trigrams = trigrams(); + let mut group = c.benchmark_group("presized_insert_1000_trigrams"); + + group.bench_function("std::HashMap", |b| { + b.iter_batched( + || std::collections::HashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("hashbrown::HashMap", |b| { + b.iter_batched( + || hashbrown::HashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("FxHashMap", |b| { + b.iter_batched( + || rustc_hash::FxHashMap::with_capacity_and_hasher(trigrams.len(), Default::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("AHashMap", |b| { + b.iter_batched( + || ahash::AHashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("FoldHashMap", |b| { + b.iter_batched( + || hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + foldhash::fast::FixedState::default(), + ), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("std::HashMap+FNV", |b| { + b.iter_batched( + || { + std::collections::HashMap::with_capacity_and_hasher( + trigrams.len(), + fnv::FnvBuildHasher::default(), + ) + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("hashbrown+Identity", |b| { + b.iter_batched( + || { + hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ) + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + 
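+    // HashSortedMap is measured with the same IdentityBuildHasher as the
+    // "hashbrown+Identity" baseline above: the trigram keys are already
+    // scrambled with folded_multiply, so both tables see identical hash values
+    // and the comparison isolates table layout rather than hash quality.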
group.bench_function("HashSortedMap", |b| { + b.iter_batched( + || { + HashSortedMap::with_capacity_and_hasher( + trigrams.len(), + IdentityBuildHasher::default(), + ) + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_reinsert(c: &mut Criterion) { + let trigrams = trigrams(); + let mut group = c.benchmark_group("reinsert_1000_trigrams"); + + group.bench_function("hashbrown+Identity", |b| { + b.iter_batched( + || { + let mut map = + hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i + 1000); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("HashSortedMap", |b| { + b.iter_batched( + || { + let mut map = HashSortedMap::with_capacity_and_hasher( + trigrams.len(), + IdentityBuildHasher::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i + 1000); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_grow(c: &mut Criterion) { + let trigrams = trigrams(); + let mut group = c.benchmark_group("grow_from_128_insert_1000_trigrams"); + + group.bench_function("hashbrown+Identity", |b| { + b.iter_batched( + || { + hashbrown::HashMap::::with_capacity_and_hasher( + 128, + Default::default(), + ) + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("HashSortedMap", |b| { + b.iter_batched( + || HashSortedMap::with_capacity_and_hasher(128, IdentityBuildHasher::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_count(c: &mut Criterion) { + let trigrams = trigrams(); + let mut counted_trigrams = Vec::with_capacity(trigrams.len() * 4); + for _ in 0..4 { + counted_trigrams.extend_from_slice(&trigrams); + } + + let mut group = c.benchmark_group("count_4000_trigrams_get_or_default"); + + group.bench_function("hashbrown+Identity entry()", |b| { + b.iter_batched( + || { + hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ) + }, + |mut map| { + for &key in &counted_trigrams { + *map.entry(key).or_insert(0) += 1; + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("HashSortedMap get_or_default", |b| { + b.iter_batched( + || { + HashSortedMap::::with_capacity_and_hasher( + trigrams.len(), + IdentityBuildHasher::default(), + ) + }, + |mut map| { + for &key in &counted_trigrams { + *map.get_or_default(key) += 1; + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("HashSortedMap entry().or_default()", |b| { + b.iter_batched( + || { + HashSortedMap::::with_capacity_and_hasher( + trigrams.len(), + IdentityBuildHasher::default(), + ) + }, + |mut map| { + for &key in &counted_trigrams { + *map.entry(key).or_default() += 1; + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_insert, + bench_reinsert, + bench_grow, + bench_count +); +criterion_main!(benches); diff --git a/crates/hash-sorted-map/src/group_ops.rs 
b/crates/hash-sorted-map/src/group_ops.rs new file mode 100644 index 0000000..a1b92ec --- /dev/null +++ b/crates/hash-sorted-map/src/group_ops.rs @@ -0,0 +1,177 @@ +// Platform-dependent group size: 16 on x86_64 (SSE2), 8 everywhere else. +#[cfg(target_arch = "x86_64")] +pub const GROUP_SIZE: usize = 16; +#[cfg(not(target_arch = "x86_64"))] +pub const GROUP_SIZE: usize = 8; + +/// Maximum safe fill ratio (keys / primary slots) that keeps overflow within +/// the 12.5% reserve budget at p95 confidence. Derived from simulation. +#[cfg(target_arch = "x86_64")] +pub const MAX_FILL: f64 = 0.71; +#[cfg(not(target_arch = "x86_64"))] +pub const MAX_FILL: f64 = 0.67; + +pub const CTRL_EMPTY: u8 = 0x00; + +#[cfg(target_arch = "x86_64")] +pub type Mask = u32; +#[cfg(not(target_arch = "x86_64"))] +pub type Mask = u64; + +// ── SIMD group operations ─────────────────────────────────────────────────── + +#[cfg(target_arch = "x86_64")] +mod arch { + #[cfg(target_arch = "x86")] + use core::arch::x86; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as x86; + + use super::{Mask, GROUP_SIZE}; + + #[inline(always)] + pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask { + unsafe { + let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i); + let cmp = x86::_mm_cmpeq_epi8(group, x86::_mm_set1_epi8(tag as i8)); + x86::_mm_movemask_epi8(cmp) as u32 + } + } + + #[inline(always)] + pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { + match_tag(ctrl, super::CTRL_EMPTY) + } + + /// Mask of slots whose ctrl byte has the high bit set (occupied). + /// Uses SSE2 `_mm_movemask_epi8` which extracts the top bit of each byte. + #[inline(always)] + pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask { + unsafe { + let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i); + x86::_mm_movemask_epi8(group) as u32 + } + } + + #[inline(always)] + pub fn lowest(mask: Mask) -> usize { + mask.trailing_zeros() as usize + } + + #[inline(always)] + pub fn clear_slot(mask: Mask, slot: usize) -> Mask { + mask & !(1u32 << slot) + } + + #[inline(always)] + pub fn next_match(mask: &mut Mask) -> Option { + if *mask == 0 { + return None; + } + let i = lowest(*mask); + *mask &= *mask - 1; + Some(i) + } +} + +#[cfg(target_arch = "aarch64")] +mod arch { + use core::arch::aarch64 as neon; + + use super::{Mask, GROUP_SIZE}; + + #[inline(always)] + pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask { + unsafe { + let group = neon::vld1_u8(ctrl.as_ptr()); + let cmp = neon::vceq_u8(group, neon::vdup_n_u8(tag)); + neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080 + } + } + + #[inline(always)] + pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { + unsafe { + let group = neon::vld1_u8(ctrl.as_ptr()); + let cmp = neon::vceq_u8(group, neon::vdup_n_u8(0)); + neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080 + } + } + + /// Mask of slots whose ctrl byte has the high bit set (occupied). 
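+    ///
+    /// NEON has no byte-wise movemask equivalent to SSE2's `_mm_movemask_epi8`,
+    /// so the mask keeps one `0x80` bit per occupied byte of the 64-bit lane and
+    /// `lowest()` converts a bit position back to a slot index by dividing by 8.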
+ #[inline(always)] + pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask { + unsafe { + let group = neon::vld1_u8(ctrl.as_ptr()); + neon::vget_lane_u64(neon::vreinterpret_u64_u8(group), 0) & 0x8080808080808080 + } + } + + #[inline(always)] + pub fn lowest(mask: Mask) -> usize { + (mask.trailing_zeros() >> 3) as usize + } + + #[inline(always)] + pub fn clear_slot(mask: Mask, slot: usize) -> Mask { + mask & !(0x80u64 << (slot * 8)) + } + + #[inline(always)] + pub fn next_match(mask: &mut Mask) -> Option { + if *mask == 0 { + return None; + } + let i = lowest(*mask); + *mask &= *mask - 1; + Some(i) + } +} + +#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] +mod arch { + use super::{Mask, GROUP_SIZE}; + + #[inline(always)] + pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask { + let word = u64::from_ne_bytes(*ctrl); + let broadcast = 0x0101010101010101u64 * (tag as u64); + let xor = word ^ broadcast; + (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080 + } + + #[inline(always)] + pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { + let word = u64::from_ne_bytes(*ctrl); + !word & 0x8080808080808080 + } + + /// Mask of slots whose ctrl byte has the high bit set (occupied). + #[inline(always)] + pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask { + let word = u64::from_ne_bytes(*ctrl); + word & 0x8080808080808080 + } + + #[inline(always)] + pub fn lowest(mask: Mask) -> usize { + (mask.trailing_zeros() >> 3) as usize + } + + #[inline(always)] + pub fn clear_slot(mask: Mask, slot: usize) -> Mask { + mask & !(0x80u64 << (slot * 8)) + } + + #[inline(always)] + pub fn next_match(mask: &mut Mask) -> Option { + if *mask == 0 { + return None; + } + let i = lowest(*mask); + *mask &= *mask - 1; + Some(i) + } +} + +pub use arch::*; diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs new file mode 100644 index 0000000..26a4ecd --- /dev/null +++ b/crates/hash-sorted-map/src/hash_sorted_map.rs @@ -0,0 +1,815 @@ +use core::mem::MaybeUninit; +use std::borrow::Borrow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::marker::PhantomData; + +use super::group_ops::{self, CTRL_EMPTY, GROUP_SIZE}; + +const NO_OVERFLOW: u32 = u32::MAX; + +// ── Helpers ───────────────────────────────────────────────────────────────── + +#[inline] +fn tag(hash: u64) -> u8 { + (hash as u8) | 0x80 +} + +#[inline] +fn slot_hint(hash: u64) -> usize { + ((hash >> 7) & (GROUP_SIZE as u64 - 1)) as usize +} + +struct Group { + ctrl: [u8; GROUP_SIZE], + keys: [MaybeUninit; GROUP_SIZE], + values: [MaybeUninit; GROUP_SIZE], + overflow: u32, +} + +impl Group { + fn new() -> Self { + Self { + ctrl: [CTRL_EMPTY; GROUP_SIZE], + keys: [const { MaybeUninit::uninit() }; GROUP_SIZE], + values: [const { MaybeUninit::uninit() }; GROUP_SIZE], + overflow: NO_OVERFLOW, + } + } +} + +/// Insertion-only hash map with SIMD group scanning. +/// +/// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere. +/// Generic over key type `K`, value type `V`, and hash builder `S`. 
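+///
+/// Minimal usage sketch (mirrors the unit tests at the bottom of this file):
+///
+/// ```
+/// use hash_sorted_map::HashSortedMap;
+///
+/// let mut counts: HashSortedMap<&str, u32> = HashSortedMap::new();
+/// counts.insert("foo", 1);
+/// *counts.entry("foo").or_default() += 1; // occupied: increments in place
+/// *counts.entry("bar").or_default() += 1; // vacant: inserts 0, then adds 1
+/// assert_eq!(counts.get(&"foo"), Some(&2));
+/// assert_eq!(counts.get(&"bar"), Some(&1));
+/// assert_eq!(counts.len(), 2);
+/// ```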
+pub struct HashSortedMap { + groups: Box<[Group]>, + num_groups: u32, + n_bits: u32, + len: usize, + hash_builder: S, +} + +impl Default for HashSortedMap { + fn default() -> Self { + Self::new() + } +} + +impl HashSortedMap { + pub fn new() -> Self { + Self::with_capacity_and_hasher(0, RandomState::new()) + } + + pub fn with_capacity(capacity: usize) -> Self { + Self::with_capacity_and_hasher(capacity, RandomState::new()) + } +} + +impl HashSortedMap { + pub fn with_hasher(hash_builder: S) -> Self { + Self::with_capacity_and_hasher(0, hash_builder) + } + + pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self { + let adjusted = (capacity as f64 / group_ops::MAX_FILL).ceil() as usize; + let min_groups = (adjusted.div_ceil(GROUP_SIZE)).max(1).next_power_of_two(); + let n_bits = min_groups.trailing_zeros().max(1); + let (groups, num_primary) = Self::alloc_groups(n_bits); + Self { + groups, + num_groups: num_primary, + n_bits, + len: 0, + hash_builder, + } + } + + /// Allocate a fully default-initialized boxed slice sized for `n_bits` primary groups + /// plus the standard 12.5% overflow reserve. Returns the slice and the number of + /// primary groups (which is also the initial in-use count). + fn alloc_groups(n_bits: u32) -> (Box<[Group]>, u32) { + let num_primary = 1usize << n_bits; + let total = num_primary + num_primary / 8 + 1; + let mut groups: Vec> = Vec::with_capacity(total); + groups.resize_with(total, Group::new); + (groups.into_boxed_slice(), num_primary as u32) + } + + #[inline] + fn group_index(&self, hash: u64) -> usize { + (hash >> (64 - self.n_bits)) as usize + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } +} + +impl HashSortedMap { + pub fn insert(&mut self, key: K, value: V) -> Option { + let hash = self.hash_builder.hash_one(&key); + self.insert_hashed(hash, key, value) + } + + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let hash = self.hash_builder.hash_one(key); + self.get_hashed(hash, key) + } + + /// Returns a mutable reference to the value for `key`, inserting `f()` if absent. + #[inline] + pub fn get_or_insert_with V>(&mut self, key: K, f: F) -> &mut V { + self.entry(key).or_insert_with(f) + } + + /// Returns a mutable reference to the value for `key`, inserting `V::default()` if absent. + pub fn get_or_default(&mut self, key: K) -> &mut V + where + V: Default, + { + self.get_or_insert_with(key, V::default) + } + + /// Returns an [`Entry`] for `key`, providing in-place access to its value + /// (insertion, mutation, or read). The lookup chain is walked exactly once; + /// the resulting `VacantEntry` already knows where to write. + #[inline] + pub fn entry(&mut self, key: K) -> Entry<'_, K, V, S> { + let hash = self.hash_builder.hash_one(&key); + match self.find_or_insertion_slot(hash, &key) { + FindResult::Found(ptr) => Entry::Occupied(OccupiedEntry { + // SAFETY: pointer is valid for `'_` (bounded by `&mut self`). + value: unsafe { &mut *ptr }, + }), + FindResult::Vacant(insertion) => Entry::Vacant(VacantEntry { + phantom: PhantomData, + map: self, + hash, + key, + insertion, + }), + } + } + + fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); + loop { + let group = &mut self.groups[gi]; + // Fast path: check preferred slot. 
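+            // `hint` comes from low hash bits that do not feed the group index,
+            // so at moderate load factors the preferred slot is usually either
+            // empty (fresh insert) or already holds this key (overwrite), and
+            // the SIMD group scan below is skipped entirely.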
+ let c = group.ctrl[hint]; + if c == CTRL_EMPTY { + group.ctrl[hint] = tag; + group.keys[hint] = MaybeUninit::new(key); + group.values[hint] = MaybeUninit::new(value); + self.len += 1; + return None; + } + if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key { + let old = std::mem::replace(unsafe { group.values[hint].assume_init_mut() }, value); + return Some(old); + } + // Slow path: SIMD scan group for tag match. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + tag_mask = group_ops::clear_slot(tag_mask, hint); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if unsafe { group.keys[i].assume_init_ref() } == &key { + let old = + std::mem::replace(unsafe { group.values[i].assume_init_mut() }, value); + return Some(old); + } + } + // Check for empty slot in this group. + let empty_mask = group_ops::match_empty(&group.ctrl); + if empty_mask != 0 { + let i = group_ops::lowest(empty_mask); + group.ctrl[i] = tag; + group.keys[i] = MaybeUninit::new(key); + group.values[i] = MaybeUninit::new(value); + self.len += 1; + return None; + } + // Group full — follow or create overflow chain. + let overflow = group.overflow; + if overflow != NO_OVERFLOW { + gi = overflow as usize; + } else { + if self.num_groups as usize == self.groups.len() { + self.grow(); + // n_bits changed; recompute the primary group and retry. + gi = self.group_index(hash); + continue; + } + let new_gi = self.num_groups as usize; + self.num_groups += 1; + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[hint] = tag; + group.keys[hint] = MaybeUninit::new(key); + group.values[hint] = MaybeUninit::new(value); + self.len += 1; + return None; + } + } + } + + fn get_hashed(&self, hash: u64, key: &Q) -> Option<&V> + where + K: Borrow, + Q: Eq + ?Sized, + { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); + + loop { + let group = &self.groups[gi]; + + // Fast path: preferred slot. + let c = group.ctrl[hint]; + if c == tag && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key { + return Some(unsafe { group.values[hint].assume_init_ref() }); + } + + // Slow path: SIMD scan group. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + tag_mask = group_ops::clear_slot(tag_mask, hint); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if unsafe { group.keys[i].assume_init_ref() }.borrow() == key { + return Some(unsafe { group.values[i].assume_init_ref() }); + } + } + + if group_ops::match_empty(&group.ctrl) != 0 { + return None; + } + + if group.overflow == NO_OVERFLOW { + return None; + } + gi = group.overflow as usize; + } + } + + /// Single-walk variant that returns either the found slot or precise + /// information about where to insert. Used by [`entry`]. + /// + /// Returns raw pointers (instead of indices) so the caller can write + /// directly without re-indexing. Pointers remain valid for the lifetime + /// of `&mut self` until any reallocation (`grow`). + fn find_or_insertion_slot(&mut self, hash: u64, key: &K) -> FindResult { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); + + loop { + let group = &mut self.groups[gi]; + + // Fast path: preferred slot. 
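+            // Mirrors the fast path in `insert_hashed`, but reports the location
+            // instead of writing so that `entry()` can decide what to do with it.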
+ let c = group.ctrl[hint]; + if c == CTRL_EMPTY { + return FindResult::Vacant(Insertion::Empty { + group: group as *mut _, + slot: hint, + }); + } + if c == tag && unsafe { group.keys[hint].assume_init_ref() } == key { + return FindResult::Found(group.values[hint].as_mut_ptr()); + } + + // Slow path: SIMD scan group for tag match. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + tag_mask = group_ops::clear_slot(tag_mask, hint); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if unsafe { group.keys[i].assume_init_ref() } == key { + return FindResult::Found(group.values[i].as_mut_ptr()); + } + } + + // Check for empty slot in this group. + let empty_mask = group_ops::match_empty(&group.ctrl); + if empty_mask != 0 { + let i = group_ops::lowest(empty_mask); + return FindResult::Vacant(Insertion::Empty { + group: group as *mut _, + slot: i, + }); + } + + // Group full — follow or report end of chain. + if group.overflow == NO_OVERFLOW { + return FindResult::Vacant(Insertion::NeedsOverflow { + tail: group as *mut _, + }); + } + gi = group.overflow as usize; + } + } + + fn grow(&mut self) { + let old_groups = std::mem::replace( + &mut self.groups, + Vec::>::new().into_boxed_slice(), + ); + let old_num_groups = self.num_groups as usize; + let old_len = self.len; + + self.n_bits += 1; + let (new_groups, num_primary) = Self::alloc_groups(self.n_bits); + self.groups = new_groups; + self.num_groups = num_primary; + self.len = 0; + + for group in &old_groups[..old_num_groups] { + let mut full_mask = group_ops::match_full(&group.ctrl); + while let Some(i) = group_ops::next_match(&mut full_mask) { + let hash = self + .hash_builder + .hash_one(unsafe { group.keys[i].assume_init_ref() }); + self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr()); + } + } + // Group has no Drop (keys/values are MaybeUninit), so dropping + // old_groups runs no destructors but does free the backing buffer. + drop(old_groups); + + debug_assert_eq!(self.len, old_len); + } + + fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) { + let tag = tag(hash); + let mut hint = slot_hint(hash); + let gi = self.group_index(hash); + let mut group = &mut self.groups[gi]; + + loop { + if group.ctrl[hint] == CTRL_EMPTY { + break; + } + let empty_mask = group_ops::match_empty(&group.ctrl); + if empty_mask != 0 { + hint = group_ops::lowest(empty_mask); + break; + } + let overflow = group.overflow; + if overflow != NO_OVERFLOW { + group = &mut self.groups[overflow as usize]; + } else { + let new_gi = self.num_groups as usize; + group.overflow = new_gi as u32; + self.num_groups += 1; + group = &mut self.groups[new_gi]; + break; + } + } + group.ctrl[hint] = tag; + unsafe { + group.keys[hint] + .as_mut_ptr() + .copy_from_nonoverlapping(key_src, 1); + group.values[hint] + .as_mut_ptr() + .copy_from_nonoverlapping(value_src, 1); + } + self.len += 1; + } +} + +// ──────────────────────────────────────────────────────────────────────── +// Entry API +// ──────────────────────────────────────────────────────────────────────── + +/// Result of a single chain walk during `entry()`: either the existing slot +/// for the key or a pre-computed insertion location for a vacant entry. +enum FindResult { + /// Pointer to the existing value. + Found(*mut V), + /// Where to insert if the caller decides to add a new entry. + Vacant(Insertion), +} + +/// Pre-computed insertion location stashed inside [`VacantEntry`] so that +/// `insert()` doesn't need to re-walk the chain. 
Pointers remain valid as +/// long as no reallocation occurs (the grow path re-walks via the slow path). +enum Insertion { + /// An empty slot is waiting at `(group, slot)`. + Empty { + group: *mut Group, + slot: usize, + }, + /// The chain is full; allocate a new overflow group and link via `tail`. + NeedsOverflow { tail: *mut Group }, +} + +/// View into a single entry in a [`HashSortedMap`], either occupied or vacant. +pub enum Entry<'a, K, V, S> { + Occupied(OccupiedEntry<'a, V>), + Vacant(VacantEntry<'a, K, V, S>), +} + +/// View into an occupied entry. +pub struct OccupiedEntry<'a, V> { + value: &'a mut V, +} + +/// View into a vacant entry. Holds the borrow of the map plus the hash, key, +/// and pre-computed insertion slot. +pub struct VacantEntry<'a, K, V, S> { + phantom: PhantomData<&'a mut HashSortedMap>, + map: *mut HashSortedMap, + hash: u64, + key: K, + insertion: Insertion, +} + +impl<'a, K: Hash + Eq, V, S: BuildHasher> Entry<'a, K, V, S> { + /// Insert `default` if vacant; return a mutable reference to the value either way. + #[inline] + pub fn or_insert(self, default: V) -> &'a mut V { + match self { + Entry::Occupied(o) => o.into_mut(), + Entry::Vacant(v) => v.insert(default), + } + } + + /// Insert `f()` if vacant; `f` runs only on the vacant branch. + #[inline] + pub fn or_insert_with V>(self, f: F) -> &'a mut V { + match self { + Entry::Occupied(o) => o.into_mut(), + Entry::Vacant(v) => v.insert(f()), + } + } + + /// Insert `V::default()` if vacant. + #[inline] + pub fn or_default(self) -> &'a mut V + where + V: Default, + { + self.or_insert_with(V::default) + } + + /// Apply `f` to the value if occupied; pass through unchanged otherwise. + #[inline] + pub fn and_modify(self, f: F) -> Self { + match self { + Entry::Occupied(mut o) => { + f(o.get_mut()); + Entry::Occupied(o) + } + v @ Entry::Vacant(_) => v, + } + } +} + +impl<'a, V> OccupiedEntry<'a, V> { + /// Get a shared reference to the value. + #[inline] + pub fn get(&self) -> &V { + &*self.value + } + + /// Get a mutable reference to the value. + #[inline] + pub fn get_mut(&mut self) -> &mut V { + self.value + } + + /// Consume the entry, returning the mutable reference with the entry's lifetime. + #[inline] + pub fn into_mut(self) -> &'a mut V { + self.value + } +} + +impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { + /// Insert `value` and return a mutable reference to it. + /// Writes directly to the slot pre-computed during `entry()`; only re-walks + /// the chain on the rare grow path (where the pre-computed pointers become + /// stale because grow re-allocates the groups buffer). + #[inline] + pub fn insert(self, value: V) -> &'a mut V { + let map = self.map; + let hash = self.hash; + let key = self.key; + + let (group_ptr, slot) = match self.insertion { + Insertion::Empty { group, slot } => (group, slot), + Insertion::NeedsOverflow { tail } => { + let (new_gi, new_group) = unsafe { + let map = &mut *map; + if map.num_groups as usize == map.groups.len() { + return insert_after_grow(map, hash, key, value); + } + let new_gi = map.num_groups as usize; + map.num_groups += 1; + let new_group: *mut Group = &mut map.groups[new_gi]; + (new_gi, new_group) + }; + unsafe { + // SAFETY: `tail` was obtained from `&mut self.groups[..]` and + // remains valid because no reallocation occurred between + // `entry()` and now (we hold the only `&mut self`). 
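+                    // Claiming the overflow group above only bumped `num_groups`;
+                    // the boxed `groups` slice is not reallocated until `grow()`.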
+ (*tail).overflow = new_gi as u32; + } + (new_group, slot_hint(hash)) + } + }; + + let tag = tag(hash); + unsafe { + (*map).len += 1; + // SAFETY: `group_ptr` points into `map.groups` and is valid for `'a`. + let group = &mut *group_ptr; + group.ctrl[slot] = tag; + group.keys[slot] = MaybeUninit::new(key); + group.values[slot] = MaybeUninit::new(value); + group.values[slot].assume_init_mut() + } + } +} + +/// Cold path: the chain was full, the table is at capacity, and we need to +/// grow before inserting. Re-walks via the slow path after grow. +/// +/// After `grow()` doubles `num_primary` (`n_bits += 1`), our key's new +/// primary group can have at most ~half the old chain's keys, so hitting +/// `NeedsOverflow` again would require `GROUP_SIZE` keys to all collide on +/// one extra bit of hash — essentially impossible for any reasonable hash. +/// (`insert_for_grow` relies on the same assumption to skip its own +/// capacity check.) +#[cold] +#[inline(never)] +fn insert_after_grow( + map: &mut HashSortedMap, + hash: u64, + key: K, + value: V, +) -> &mut V { + map.grow(); + match map.find_or_insertion_slot(hash, &key) { + FindResult::Vacant(Insertion::Empty { group, slot }) => { + let tag = tag(hash); + // SAFETY: `group` points into `map.groups` and is valid for `'a`. + unsafe { + let g = &mut *group; + g.ctrl[slot] = tag; + g.keys[slot] = MaybeUninit::new(key); + g.values[slot] = MaybeUninit::new(value); + map.len += 1; + g.values[slot].assume_init_mut() + } + } + // After grow, the new primary group for `key` cannot be full (see + // function docs), and the key wasn't in the table before grow. + FindResult::Vacant(Insertion::NeedsOverflow { .. }) | FindResult::Found(_) => { + unreachable!("post-grow walk must hit an empty slot") + } + } +} + +impl Drop for HashSortedMap { + fn drop(&mut self) { + for group in &mut self.groups[..self.num_groups as usize] { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + unsafe { group.keys[i].assume_init_drop() }; + unsafe { group.values[i].assume_init_drop() }; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::hash::{BuildHasher, Hasher}; + + use super::*; + + #[test] + fn insert_and_get() { + let mut map = HashSortedMap::new(); + map.insert(100, "hello"); + map.insert(200, "world"); + assert_eq!(map.get(&100), Some(&"hello")); + assert_eq!(map.get(&200), Some(&"world")); + assert_eq!(map.get(&999), None); + assert_eq!(map.len(), 2); + } + + #[test] + fn insert_overwrite() { + let mut map = HashSortedMap::new(); + map.insert(42, "a"); + assert_eq!(map.insert(42, "b"), Some("a")); + assert_eq!(map.get(&42), Some(&"b")); + assert_eq!(map.len(), 1); + } + + #[test] + fn grow_preserves_entries() { + let mut map = HashSortedMap::new(); + for i in 0..200u32 { + map.insert(i, i * 10); + } + assert_eq!(map.len(), 200); + for i in 0..200u32 { + assert_eq!(map.get(&i), Some(&(i * 10)), "missing key {i}"); + } + } + + #[test] + fn many_entries() { + let mut map = HashSortedMap::with_capacity(2000); + for i in 0..2000u32 { + map.insert(i.wrapping_mul(2654435761), i); + } + assert_eq!(map.len(), 2000); + for i in 0..2000u32 { + assert_eq!(map.get(&i.wrapping_mul(2654435761)), Some(&i)); + } + } + + #[test] + fn overflow_chain() { + let mut map = HashSortedMap::with_capacity(8); + for i in 0..20u32 { + let key = i | 0xAB000000; + map.insert(key, i); + } + assert_eq!(map.len(), 20); + for i in 0..20u32 { + let key = i | 0xAB000000; + assert_eq!(map.get(&key), Some(&i), "missing key {key:#x}"); + } + } + + #[test] + fn 
grow_on_overflow_exhaustion() { + let mut map = HashSortedMap::with_capacity(1); + let old_n_bits = map.n_bits; + for i in 0..100u32 { + let key = i | 0xFF000000; + map.insert(key, i); + } + assert!(map.n_bits > old_n_bits, "should have grown"); + assert_eq!(map.len(), 100); + for i in 0..100u32 { + let key = i | 0xFF000000; + assert_eq!(map.get(&key), Some(&i), "missing key {key:#x} after grow"); + } + } + + #[test] + fn string_keys() { + let mut map = HashSortedMap::new(); + map.insert("hello".to_string(), 1); + map.insert("world".to_string(), 2); + assert_eq!(map.get("hello"), Some(&1)); + assert_eq!(map.get("world"), Some(&2)); + assert_eq!(map.get("missing"), None); + assert_eq!(map.len(), 2); + + assert_eq!(map.insert("hello".to_string(), 3), Some(1)); + assert_eq!(map.get("hello"), Some(&3)); + assert_eq!(map.len(), 2); + } + + #[test] + fn get_or_default_basics() { + let mut map: HashSortedMap<&str, i32> = HashSortedMap::new(); + // Inserts default (0), then mutates. + *map.get_or_default("a") += 5; + *map.get_or_default("b") += 7; + // Subsequent calls return the existing value. + *map.get_or_default("a") += 3; + assert_eq!(map.get(&"a"), Some(&8)); + assert_eq!(map.get(&"b"), Some(&7)); + assert_eq!(map.len(), 2); + } + + #[test] + fn get_or_insert_with_lazy() { + let mut map: HashSortedMap = HashSortedMap::new(); + let mut call_count = 0; + let mut make = |s: &str| { + call_count += 1; + s.to_string() + }; + // First call: f runs, inserts "first". + assert_eq!( + map.get_or_insert_with(1, || make("first")), + &mut "first".to_string() + ); + // Second call with same key: f does NOT run; returns existing. + assert_eq!( + map.get_or_insert_with(1, || make("second")), + &mut "first".to_string() + ); + // New key: f runs. + assert_eq!( + map.get_or_insert_with(2, || make("third")), + &mut "third".to_string() + ); + assert_eq!(call_count, 2); + assert_eq!(map.len(), 2); + } + + #[test] + fn get_or_default_survives_grow() { + let mut map: HashSortedMap = HashSortedMap::with_capacity(1); + for i in 0..500u32 { + *map.get_or_default(i) = i * 2; + } + assert_eq!(map.len(), 500); + for i in 0..500u32 { + assert_eq!(map.get(&i), Some(&(i * 2)), "missing key {i}"); + } + } + + #[test] + fn entry_or_default_counting() { + // Classic counting workload via Entry API. + let mut map: HashSortedMap<&str, u32> = HashSortedMap::new(); + for word in ["a", "b", "a", "c", "b", "a"] { + *map.entry(word).or_default() += 1; + } + assert_eq!(map.get(&"a"), Some(&3)); + assert_eq!(map.get(&"b"), Some(&2)); + assert_eq!(map.get(&"c"), Some(&1)); + assert_eq!(map.len(), 3); + } + + #[test] + fn entry_or_insert_lazy() { + let mut map: HashSortedMap = HashSortedMap::new(); + let mut call_count = 0; + let mut make = |s: &str| { + call_count += 1; + s.to_string() + }; + // First call: f runs, inserts. + let v = map.entry(1).or_insert_with(|| make("first")); + assert_eq!(v, "first"); + // Second call with same key: f does NOT run. + let v = map.entry(1).or_insert_with(|| make("second")); + assert_eq!(v, "first"); + assert_eq!(call_count, 1); + } + + #[test] + fn entry_and_modify() { + let mut map: HashSortedMap = HashSortedMap::new(); + // Vacant: and_modify is a no-op, then or_insert(0) runs. + *map.entry(7).and_modify(|v| *v *= 10).or_insert(1) += 100; + assert_eq!(map.get(&7), Some(&101)); + // Occupied: and_modify runs, or_insert is skipped. 
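+        // Running value: 101 from the first statement; and_modify doubles it to
+        // 202, then the trailing `+= 1` yields 203.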
+ *map.entry(7).and_modify(|v| *v *= 2).or_insert(99) += 1; + assert_eq!(map.get(&7), Some(&203)); + } + + /// Degenerate hasher that returns a fixed hash code, for forcing collisions. + struct FixedHasher(u64); + + impl Hasher for FixedHasher { + fn finish(&self) -> u64 { + self.0 + } + fn write(&mut self, _bytes: &[u8]) {} + } + + #[derive(Clone)] + struct FixedState(u64); + + impl BuildHasher for FixedState { + type Hasher = FixedHasher; + fn build_hasher(&self) -> FixedHasher { + FixedHasher(self.0) + } + } + + #[test] + fn test_collisions() { + // Tiny initial capacity + all collisions + let mut m = HashSortedMap::with_capacity_and_hasher(1, FixedState(0)); + for i in 0..200u32 { + m.insert(i, i); + } + assert_eq!(m.len(), 200); + for i in 0..200u32 { + assert_eq!(m.get(&i), Some(&i)); + } + } +} diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs new file mode 100644 index 0000000..79dac69 --- /dev/null +++ b/crates/hash-sorted-map/src/lib.rs @@ -0,0 +1,4 @@ +mod group_ops; +mod hash_sorted_map; + +pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacantEntry}; diff --git a/crates/string-offsets/benchmarks/performance.rs b/crates/string-offsets/benchmarks/performance.rs index c4e6cb4..8f62e8f 100644 --- a/crates/string-offsets/benchmarks/performance.rs +++ b/crates/string-offsets/benchmarks/performance.rs @@ -1,5 +1,6 @@ -use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use rand::{rng, RngExt}; +use std::hint::black_box; use string_offsets::{AllConfig, OnlyLines, StringOffsets}; fn only_lines_construction_benchmark(c: &mut Criterion) {