diff --git a/Cargo.toml b/Cargo.toml index 312f46d..4e584bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", + "crates/sparse-ngrams", ] resolver = "2" diff --git a/README.md b/README.md index ae3acce..0dbe85f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. - [`bpe-openai`](crates/bpe-openai): Fast tokenizers for OpenAI token sets based on the `bpe` crate. +- [`sparse-ngrams`](crates/sparse-ngrams): fast sparse n-gram extraction from byte slices. Selects variable-length n-grams (2–8 bytes) deterministically using bigram frequency priorities, suitable for substring search indexes. - [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. ## Background diff --git a/crates/sparse-ngrams/Cargo.toml b/crates/sparse-ngrams/Cargo.toml new file mode 100644 index 0000000..d5bcde0 --- /dev/null +++ b/crates/sparse-ngrams/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "sparse-ngrams" +version = "0.1.0" +edition = "2021" +description = "Fast sparse n-gram extraction from byte slices." 
+repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["ngram", "algorithm", "search", "index"] +categories = ["algorithms", "data-structures", "text-processing"] + +[lib] +bench = false + +[[bench]] +name = "performance" +path = "benchmarks/performance.rs" +harness = false + +[dev-dependencies] +criterion = "0.5" diff --git a/crates/sparse-ngrams/README.md b/crates/sparse-ngrams/README.md new file mode 100644 index 0000000..f12808e --- /dev/null +++ b/crates/sparse-ngrams/README.md @@ -0,0 +1,79 @@ +# sparse-ngrams + +Fast sparse n-gram extraction from byte slices. + +Sparse grams select variable-length n-grams (2–8 bytes) without extracting all possible substrings. The algorithm is deterministic: the same extraction logic applies to every substring, making it suitable for substring search indexes. + +## How it works + +Each consecutive byte pair (bigram) is assigned a frequency-based priority from a precomputed table. An n-gram boundary occurs wherever a bigram has lower priority than all bigrams between it and the previous boundary. This is computed efficiently using a monotone deque or a scan-based approach. + +For a document of N bytes, this produces at most 3(N−1) n-grams: N−1 bigrams, plus up to 2(N−1) algorithmically selected longer n-grams (up to 8 bytes). + +### Selection criterion + +A substring of length 3–8 is emitted as a sparse n-gram if and only if every interior bigram priority is strictly greater than the maximum of the left and right boundary bigram priorities. 
+ +## Usage + +```rust +use sparse_ngrams::{collect_sparse_grams, NGram, MAX_SPARSE_GRAM_SIZE}; + +let input = b"hello world"; +let grams = collect_sparse_grams(input); +for gram in &grams { + assert!(gram.len() >= 2); + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); +} +``` + +## Performance + +Benchmarks on an Apple M1 (15 KB input, `lib.rs` source file): + +| Variant | Throughput | +|---------|-----------| +| `deque` | ~3.5 GB/s | +| `scan` | ~4.9 GB/s | + +The `scan` variant is ~40% faster than the deque variant by replacing the monotone deque with a fixed-size circular buffer and a suffix-minimum scan. + +## Bigram table size + +The priority table maps byte pairs to frequency-based priorities. Increasing the table size (number of ranked bigrams) produces more distinct longer n-grams, but saturates quickly: + +![Unique n-grams vs. table size](images/unique_ngrams_vs_table_size.png) + +| Table size | Unique n-grams | % of max | +|-----------|---------------|----------| +| 100 | 6.2M | 79.4% | +| 200 | 6.7M | 85.9% | +| 400 | 7.1M | 91.1% | +| 800 | 7.5M | 96.2% | +| 1,600 | 7.8M | 99.0% | +| 3,200 | 7.8M | 99.9% | +| 6,400+ | 7.8M | 100% | + +Beyond ~6,400 entries the table saturates — additional bigram rankings produce no new n-grams since all occurring byte pairs already have distinct priorities. + +## Maximum n-gram length + +Increasing the maximum n-gram length produces more unique longer grams, with diminishing returns: + +![Unique n-grams vs. max length](images/unique_ngrams_vs_max_length.png) + +| Max length | Unique n-grams | vs. len=8 | +|-----------|---------------|-----------| +| 2 | 1.4M | 18% | +| 3 | 4.6M | 59% | +| 4 | 5.8M | 74% | +| 6 | 7.1M | 90% | +| 8 | 7.8M | 100% | +| 12 | 8.7M | 111% | +| 16 | 9.2M | 118% | +| 24 | 9.8M | 124% | +| 32 | 10.0M | 128% | +| 48 | 10.3M | 131% | +| 64 | 10.4M | 132% | + +The default of 8 captures most of the discriminative power. 
Going to 16 adds ~18% more unique grams but doubles the scan window; going to 64 adds only ~32% total. diff --git a/crates/sparse-ngrams/benchmarks/performance.rs b/crates/sparse-ngrams/benchmarks/performance.rs new file mode 100644 index 0000000..4a24063 --- /dev/null +++ b/crates/sparse-ngrams/benchmarks/performance.rs @@ -0,0 +1,32 @@ +use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main}; +use sparse_ngrams::{NGram, collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams}; + +fn bench_collect(c: &mut Criterion) { + let inputs: Vec<(&str, Vec<u8>)> = vec![ + ("small_11B", b"hello world".to_vec()), + ( + "medium_900B", + "the quick brown fox jumps over the lazy dog. " + .repeat(20) + .into_bytes(), + ), + ("large_15KB", include_str!("../src/lib.rs").as_bytes().to_vec()), + ]; + + let mut group = c.benchmark_group("collect"); + for (name, input) in &inputs { + let mut buf = vec![NGram::from_bytes(b"xx"); max_sparse_grams(input.len())]; + group.throughput(Throughput::Bytes(input.len() as u64)); + + group.bench_with_input(BenchmarkId::new("deque", name), input, |b, input| { + b.iter(|| collect_sparse_grams_deque(black_box(input), &mut buf)) + }); + group.bench_with_input(BenchmarkId::new("scan", name), input, |b, input| { + b.iter(|| collect_sparse_grams_scan(black_box(input), &mut buf)) + }); + } + group.finish(); +} + +criterion_group!(benches, bench_collect); +criterion_main!(benches); diff --git a/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png b/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png new file mode 100644 index 0000000..d878c2e Binary files /dev/null and b/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png differ diff --git a/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png b/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png new file mode 100644 index 0000000..218143a Binary files /dev/null and 
b/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png differ diff --git a/crates/sparse-ngrams/src/bigrams.bin b/crates/sparse-ngrams/src/bigrams.bin new file mode 100644 index 0000000..d011c90 Binary files /dev/null and b/crates/sparse-ngrams/src/bigrams.bin differ diff --git a/crates/sparse-ngrams/src/deque.rs b/crates/sparse-ngrams/src/deque.rs new file mode 100644 index 0000000..3738ba0 --- /dev/null +++ b/crates/sparse-ngrams/src/deque.rs @@ -0,0 +1,71 @@ +//! Stack-allocated circular buffer (monotone deque). + +use std::mem::MaybeUninit; + +/// Deque element representing two neighboring bytes in the input. +#[derive(Debug, Clone, Copy)] +pub(crate) struct PosStateBytes { + /// Absolute index position between the two bigram characters. + /// I.e. 1 references the very first bigram. + pub index: u32, + pub value: u16, +} + +/// Stack-allocated circular buffer holding up to `CAP` elements. +/// Replaces `VecDeque` — avoids heap allocation and fits in a +/// single cache line for small CAP values. 
+pub(crate) struct FixedDeque<const CAP: usize> { + data: [MaybeUninit<PosStateBytes>; CAP], + start: u8, + len: u8, +} + +impl<const CAP: usize> FixedDeque<CAP> { + pub fn new() -> Self { + Self { + data: [MaybeUninit::uninit(); CAP], + start: 0, + len: 0, + } + } + + #[inline] + pub fn front(&self) -> Option<&PosStateBytes> { + if self.len == 0 { + None + } else { + Some(unsafe { self.data[self.start as usize].assume_init_ref() }) + } + } + + #[inline] + pub fn back(&self) -> Option<&PosStateBytes> { + if self.len == 0 { + None + } else { + let idx = (self.start + self.len - 1) as usize % CAP; + Some(unsafe { self.data[idx].assume_init_ref() }) + } + } + + #[inline] + pub fn pop_front(&mut self) { + debug_assert!(self.len > 0); + self.start = (self.start + 1) % CAP as u8; + self.len -= 1; + } + + #[inline] + pub fn pop_back(&mut self) { + debug_assert!(self.len > 0); + self.len -= 1; + } + + #[inline] + pub fn push_back(&mut self, val: PosStateBytes) { + debug_assert!((self.len as usize) < CAP); + let idx = (self.start + self.len) as usize % CAP; + self.data[idx] = MaybeUninit::new(val); + self.len += 1; + } +} diff --git a/crates/sparse-ngrams/src/extract.rs b/crates/sparse-ngrams/src/extract.rs new file mode 100644 index 0000000..0f845fa --- /dev/null +++ b/crates/sparse-ngrams/src/extract.rs @@ -0,0 +1,350 @@ +//! Core sparse n-gram extraction algorithm. + +use crate::deque::{FixedDeque, PosStateBytes}; +use crate::ngram::{NGram, POLY_HASH_PRIME, POLY_POWERS}; +use crate::table::get_bigram_table; +use crate::MAX_SPARSE_GRAM_SIZE; + +/// Returns the maximum number of sparse n-grams that can be produced from +/// `content_len` bytes of input. Use this to pre-allocate the output slice. +#[inline] +pub const fn max_sparse_grams(content_len: usize) -> usize { + if content_len < 2 { + 0 + } else { + (content_len - 1) * 3 + } +} + +/// Collect all sparse n-grams from the input byte slice into a new [`Vec`]. 
+pub fn collect_sparse_grams(content: &[u8]) -> Vec<NGram> { + let mut buf = vec![NGram::from_rolling_hash(0, 0); max_sparse_grams(content.len())]; + let count = collect_sparse_grams_deque(content, &mut buf); + buf.truncate(count); + buf +} + +/// Deque-based extraction. Writes n-grams into `out` (must have at least +/// [`max_sparse_grams`]`(content.len())` slots). Returns the count written. +/// +/// # Panics +/// +/// Panics if `out` is too small. +pub fn collect_sparse_grams_deque(content: &[u8], out: &mut [NGram]) -> usize { + let n = content.len(); + if n < 2 { + return 0; + } + assert!(out.len() >= max_sparse_grams(n)); + let table = get_bigram_table(); + let mut queue = FixedDeque::<{ MAX_SPARSE_GRAM_SIZE as usize }>::new(); + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE as usize]; + prefix_hashes[1] = content[0] as u32; + let mut w = 0usize; + + for idx in 1..n as u32 { + let mask = MAX_SPARSE_GRAM_SIZE as usize - 1; + let end_hash = prefix_hashes[idx as usize & mask] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx as usize] as u32); + + // Bigram + let bigram_hash = end_hash + .wrapping_sub(prefix_hashes[(idx as usize - 1) & mask].wrapping_mul(POLY_POWERS[2])); + out[w] = NGram::from_rolling_hash(bigram_hash, 2); + w += 1; + + let v1 = + table[content[idx as usize - 1] as usize * 256 + content[idx as usize] as usize]; + + if let Some(begin) = queue.front() { + if idx - begin.index + 1 >= MAX_SPARSE_GRAM_SIZE { + queue.pop_front(); + } + } + while let Some(begin) = queue.back() { + let start = begin.index as usize - 1; + let len = (idx - begin.index + 2) as usize; + let hash = end_hash.wrapping_sub(prefix_hashes[start & mask].wrapping_mul(POLY_POWERS[len])); + out[w] = NGram::from_rolling_hash(hash, len); + w += 1; + if begin.value == v1 { + queue.pop_back(); + break; + } else if begin.value <= v1 { + break; + } + queue.pop_back(); + } + queue.push_back(PosStateBytes { + index: idx, + value: v1, + }); + prefix_hashes[(idx as usize + 1) & mask] 
= end_hash; + } + w +} + +/// Queue-free scan-based extraction. Writes n-grams into `out` (must have at least +/// [`max_sparse_grams`]`(content.len())` slots). Returns the count written. +/// +/// Produces identical output (same order) as [`collect_sparse_grams_deque`]. +/// +/// # Panics +/// +/// Panics if `out` is too small. +pub fn collect_sparse_grams_scan(content: &[u8], out: &mut [NGram]) -> usize { + let n = content.len(); + if n < 2 { + return 0; + } + assert!(out.len() >= max_sparse_grams(n)); + + let table = get_bigram_table(); + const MASK: usize = MAX_SPARSE_GRAM_SIZE as usize - 1; + let mut w = 0usize; + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE as usize]; + prefix_hashes[1] = content[0] as u32; + let mut priorities = [u16::MAX; MAX_SPARSE_GRAM_SIZE as usize]; + for idx in 1..n as u32 { + let end_hash = prefix_hashes[idx as usize & MASK] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx as usize] as u32); + // Bigram + let bigram_hash = end_hash + .wrapping_sub(prefix_hashes[(idx as usize - 1) & MASK].wrapping_mul(POLY_POWERS[2])); + out[w] = NGram::from_rolling_hash(bigram_hash, 2); + w += 1; + let v1 = + table[content[idx as usize - 1] as usize * 256 + content[idx as usize] as usize]; + priorities[idx as usize & MASK] = v1; + let mut running_min = u16::MAX; + for d in 1..=(MAX_SPARSE_GRAM_SIZE - 2) { + if d >= idx { + break; + } + let p = idx.wrapping_sub(d) as usize & MASK; + let v_p = priorities[p]; + if v_p < running_min { + running_min = v_p; + let start = p.wrapping_sub(1) & MASK; + let len = d as usize + 2; + let hash = end_hash.wrapping_sub(prefix_hashes[start].wrapping_mul(POLY_POWERS[len])); + out[w] = NGram::from_rolling_hash(hash, len); + w += 1; + if v_p <= v1 { + break; + } + } + } + prefix_hashes[(idx as usize + 1) & MASK] = end_hash; + } + w +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::table::get_bigram_table; + use std::collections::HashSet; + + fn collect_to_vec(content: &[u8], f: fn(&[u8], &mut 
[NGram]) -> usize) -> Vec<NGram> { + let mut buf = vec![NGram::from_rolling_hash(0, 0); max_sparse_grams(content.len())]; + let count = f(content, &mut buf); + buf.truncate(count); + buf + } + + /// Brute-force reference implementation. + /// + /// Enumerates all substrings of length 2..=MAX_SPARSE_GRAM_SIZE and emits those + /// where every interior bigram priority is strictly greater than `max(left, right)` + /// boundary bigram priority. All bigrams (len=2) are always emitted. + fn brute_force_sparse_grams(content: &[u8]) -> HashSet<NGram> { + let table = get_bigram_table(); + let n = content.len(); + let mut result = HashSet::new(); + if n < 2 { + return result; + } + // All bigrams. + for i in 0..n - 1 { + result.insert(NGram::from_bytes(&content[i..i + 2])); + } + // Longer grams: length 3..=MAX_SPARSE_GRAM_SIZE. + for len in 3..=MAX_SPARSE_GRAM_SIZE as usize { + 'outer: for start in 0..=n.saturating_sub(len) { + if start + len > n { + break; + } + let left = table[content[start] as usize * 256 + content[start + 1] as usize]; + let right = + table[content[start + len - 2] as usize * 256 + content[start + len - 1] as usize]; + let boundary = left.max(right); + // Inner bigrams: bytes [start+1,start+2], ..., [start+len-3,start+len-2] + for k in 1..len - 2 { + let p = table[content[start + k] as usize * 256 + content[start + k + 1] as usize]; + if p <= boundary { + continue 'outer; + } + } + result.insert(NGram::from_bytes(&content[start..start + len])); + } + } + result + } + + #[test] + fn test_empty_input() { + assert!(collect_sparse_grams(b"").is_empty()); + } + + #[test] + fn test_single_byte() { + assert!(collect_sparse_grams(b"a").is_empty()); + } + + #[test] + fn test_two_bytes() { + let grams = collect_sparse_grams(b"ab"); + assert_eq!(grams.len(), 1); + assert_eq!(grams[0], NGram::from_bytes(b"ab")); + } + + #[test] + fn test_three_bytes() { + let grams = collect_sparse_grams(b"abc"); + assert!(grams.len() >= 2); + assert_eq!(grams[0], NGram::from_bytes(b"ab")); + 
assert_eq!(grams[1], NGram::from_bytes(b"bc")); + } + + #[test] + fn test_gram_lengths_bounded() { + let input = b"self.reset_states(the_quick_brown_fox_jumps"; + let grams = collect_sparse_grams(input); + for gram in &grams { + assert!(gram.len() >= 2, "gram too short: {gram:?}"); + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize, "gram too long: {gram:?}"); + } + } + + #[test] + fn test_produces_longer_grams() { + let grams = collect_sparse_grams(b"self.reset_states("); + assert!(grams.iter().any(|g| g.len() > 2)); + } + + #[test] + fn test_max_gram_size_boundary() { + let grams = collect_sparse_grams(b"abcdefgh"); + for gram in &grams { + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); + } + } + + #[test] + fn test_repeated_bytes() { + let grams = collect_sparse_grams(b"aaaaaaaaaa"); + assert!(grams.iter().filter(|g| g.len() == 2).count() >= 9); + } + + #[test] + fn test_gram_count_scales_linearly() { + let input: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect(); + let grams = collect_sparse_grams(&input); + assert!(grams.len() >= input.len() - 1); + assert!(grams.len() <= input.len() * 3); + } + + // -- Equivalence: scan vs deque -- + + #[test] + fn test_scan_equivalence_small() { + for input in [b"" as &[u8], b"x", b"ab", b"abc", b"abcdefgh", b"abcdefghi"] { + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + "mismatch on {:?}", + std::str::from_utf8(input).unwrap_or("?") + ); + } + } + + #[test] + fn test_scan_equivalence_hello_world() { + let input = b"hello world"; + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + ); + } + + #[test] + fn test_scan_equivalence_large() { + let input: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect(); + assert_eq!( + collect_to_vec(&input, collect_sparse_grams_deque), + collect_to_vec(&input, collect_sparse_grams_scan), + ); + } + + #[test] + fn 
test_scan_equivalence_source_code() { + let input = include_bytes!("lib.rs"); + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + ); + } + + // -- Brute-force equivalence -- + + fn assert_matches_brute_force(input: &[u8]) { + let grams = collect_sparse_grams(input); + let actual: HashSet<NGram> = grams.into_iter().collect(); + let expected = brute_force_sparse_grams(input); + let only_actual: Vec<_> = actual.difference(&expected).collect(); + let only_expected: Vec<_> = expected.difference(&actual).collect(); + if !only_actual.is_empty() || !only_expected.is_empty() { + panic!( + "mismatch on input len={}\n  only in algorithm: {:?}\n  only in brute force: {:?}", + input.len(), only_actual, only_expected + ); + } + } + + #[test] + fn test_brute_force_small() { + for input in [b"" as &[u8], b"x", b"ab", b"abc", b"abcd", b"abcdefgh", b"abcdefghi"] { + assert_matches_brute_force(input); + } + } + + #[test] + fn test_brute_force_hello_world() { + assert_matches_brute_force(b"hello world"); + } + + #[test] + fn test_brute_force_repeated() { + assert_matches_brute_force(b"aaaaaaaaaa"); + } + + #[test] + fn test_brute_force_code_snippet() { + assert_matches_brute_force(b"self.reset_states(the_quick_brown_fox_jumps"); + } + + #[test] + fn test_brute_force_diverse() { + let input: Vec<u8> = (0..200).map(|i| (i % 256) as u8).collect(); + assert_matches_brute_force(&input); + } + + #[test] + fn test_brute_force_source_code() { + let input = include_bytes!("lib.rs"); + assert_matches_brute_force(input); + } +} diff --git a/crates/sparse-ngrams/src/lib.rs b/crates/sparse-ngrams/src/lib.rs new file mode 100644 index 0000000..ca2c064 --- /dev/null +++ b/crates/sparse-ngrams/src/lib.rs @@ -0,0 +1,45 @@ +//! Sparse n-gram extraction from byte slices. +//! +//! Sparse grams are a way of selecting variable-length n-grams (longer than 2 bytes) without +//! extracting all possible n-grams. 
The algorithm is deterministic: the same extraction logic +//! works for every substring, so that substring searches are supported. +//! +//! # How it works +//! +//! Each consecutive byte pair (bigram) is assigned a priority based on how frequently it occurs +//! in a large code corpus. A monotone deque tracks potential n-gram boundaries: an n-gram +//! boundary occurs wherever a bigram has lower priority than all bigrams between it and the +//! previous boundary. +//! +//! For a document of N bytes, this produces at most 3(N-1) n-grams: all bigrams plus algorithmically +//! selected longer n-grams (up to [`MAX_SPARSE_GRAM_SIZE`] bytes). +//! +//! # Example +//! +//! ``` +//! use sparse_ngrams::{NGram, collect_sparse_grams, MAX_SPARSE_GRAM_SIZE}; +//! +//! let input = b"hello world"; +//! let grams = collect_sparse_grams(input); +//! assert!(grams.len() > input.len() - 1); +//! for gram in &grams { +//! assert!(gram.len() >= 2); +//! assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); +//! } +//! ``` + +mod deque; +mod extract; +mod ngram; +mod table; + +pub use ngram::NGram; + +/// Number of high-frequency bigrams used to build the priority table. +pub const NUM_FREQUENT_BIGRAMS: usize = 65534; + +/// Maximum length (in bytes) of a sparse n-gram. +pub const MAX_SPARSE_GRAM_SIZE: u32 = 8; + +pub use extract::{collect_sparse_grams, collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams}; + diff --git a/crates/sparse-ngrams/src/murmur.rs b/crates/sparse-ngrams/src/murmur.rs new file mode 100644 index 0000000..b970224 --- /dev/null +++ b/crates/sparse-ngrams/src/murmur.rs @@ -0,0 +1,67 @@ +//! Murmur1 hash function (ported from blackbird_core). 
+ +use std::hash::Hasher; + +trait XorRsh { + fn xor_rsh(self, nbits: u32) -> Self; +} + +impl XorRsh for u32 { + #[inline] + fn xor_rsh(self, nbits: u32) -> u32 { + self ^ (self >> nbits) + } +} + +fn murmur1_hash(bytes: &[u8], seed: u32) -> u32 { + const M: u32 = 0xc6a4_a793; + const R: u32 = 16; + + let mut h = seed ^ (bytes.len() as u32).wrapping_mul(M); + + let chunks_len = bytes.len() / 4 * 4; + for chunk in bytes[..chunks_len].chunks_exact(4) { + let ptr = chunk.as_ptr() as *const u32; + let k = unsafe { ptr.read_unaligned() }; + h = h.wrapping_add(k).wrapping_mul(M).xor_rsh(R); + } + + let mut tail_bytes = [0u8; 4]; + tail_bytes[..bytes.len() - chunks_len].copy_from_slice(&bytes[chunks_len..]); + h = h + .wrapping_add(u32::from_le_bytes(tail_bytes)) + .wrapping_mul(M) + .xor_rsh(R); + + h.wrapping_mul(M).xor_rsh(10).wrapping_mul(M).xor_rsh(17) +} + +/// Hasher implementing the Murmur1 hash function. +struct Murmur1Hasher { + bytes: Vec<u8>, +} + +impl Default for Murmur1Hasher { + fn default() -> Self { + Self { + bytes: Vec::with_capacity(64), + } + } +} + +impl Hasher for Murmur1Hasher { + fn write(&mut self, bytes: &[u8]) { + self.bytes.extend_from_slice(bytes); + } + + fn finish(&self) -> u64 { + murmur1_hash(&self.bytes, 0).into() + } +} + +pub(crate) fn hash_bigram(gram: (u8, u8)) -> u32 { + use std::hash::Hash; + let mut h = Murmur1Hasher::default(); + gram.hash(&mut h); + h.finish() as u32 +} diff --git a/crates/sparse-ngrams/src/ngram.rs b/crates/sparse-ngrams/src/ngram.rs new file mode 100644 index 0000000..5475dcc --- /dev/null +++ b/crates/sparse-ngrams/src/ngram.rs @@ -0,0 +1,132 @@ +//! Compact n-gram representation using a polynomial rolling hash. +//! +//! An [`NGram`] packs both a hash and the byte length into a single `u32`: +//! the upper 24 bits hold the rolling hash and the lower 8 bits hold the length. +//! This makes it suitable as a cheap, fixed-size key for hash maps and sets. 
+ +use std::fmt; + +use crate::MAX_SPARSE_GRAM_SIZE; + +/// Prime for the polynomial rolling hash. +pub(crate) const POLY_HASH_PRIME: u32 = 2_654_435_761; + +/// Precomputed powers of [`POLY_HASH_PRIME`] for rolling-hash range queries. +/// `POLY_POWERS[i] = POLY_HASH_PRIME.pow(i)` (wrapping `u32`). +pub(crate) const POLY_POWERS: [u32; MAX_SPARSE_GRAM_SIZE as usize + 1] = { + let mut p = [0u32; MAX_SPARSE_GRAM_SIZE as usize + 1]; + p[0] = 1; + let mut i = 1; + while i < p.len() { + p[i] = (p[i - 1] as u64 * POLY_HASH_PRIME as u64) as u32; + i += 1; + } + p +}; + +/// A compact n-gram identifier: upper 24 bits are a polynomial rolling hash, +/// lower 8 bits are the byte length of the n-gram. +/// +/// Two `NGram` values are equal iff both their hash and length match, which +/// greatly reduces collision probability compared to a bare hash. +/// +/// # Construction +/// +/// Use [`NGram::from_bytes`] for one-off hashing, or the rolling-hash helpers +/// inside the extraction loop for amortised O(1) computation per n-gram. +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[repr(transparent)] +pub struct NGram(pub(crate) u32); + +impl NGram { + /// Build an `NGram` by hashing the given byte slice from scratch. + pub fn from_bytes(src: &[u8]) -> Self { + let mut hash = 0u32; + for &byte in src { + hash = hash.wrapping_mul(POLY_HASH_PRIME).wrapping_add(byte as u32); + } + Self((hash << 8) | src.len() as u32) + } + + /// Build an `NGram` from a precomputed rolling hash and a length. + #[inline] + pub(crate) fn from_rolling_hash(hash: u32, len: usize) -> Self { + Self((hash << 8) | len as u32) + } + + /// The byte length of the n-gram (stored in the lower 8 bits). + #[inline] + pub fn len(&self) -> usize { + (self.0 & 0xff) as usize + } + + /// Whether this represents an empty gram (should never happen in practice). + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// The raw packed `u32` (hash ≪ 8 | len). 
+ #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl fmt::Debug for NGram { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NGram({:#x}, len={})", self.0 >> 8, self.len()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_bytes_roundtrip() { + let ngram = NGram::from_bytes(b"hello"); + assert_eq!(ngram.len(), 5); + } + + #[test] + fn test_equal_content_equal_ngram() { + assert_eq!(NGram::from_bytes(b"abc"), NGram::from_bytes(b"abc")); + } + + #[test] + fn test_different_content_likely_different() { + assert_ne!(NGram::from_bytes(b"abc"), NGram::from_bytes(b"abd")); + } + + #[test] + fn test_same_hash_different_length() { + // Even if hashes collide, different lengths produce different NGrams. + let a = NGram::from_bytes(b"ab"); + let b = NGram::from_bytes(b"abc"); + assert_ne!(a, b); + } + + #[test] + fn test_rolling_hash_matches_from_bytes() { + let content = b"hello world"; + // Build prefix hashes the same way the extraction loop does. + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE as usize]; + if !content.is_empty() { + prefix_hashes[1] = content[0] as u32; + } + for idx in 1..content.len() { + let end_hash = prefix_hashes[idx & (MAX_SPARSE_GRAM_SIZE as usize - 1)] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx] as u32); + // Check the bigram content[idx-1..idx+1] + let rolling_hash = end_hash + .wrapping_sub(prefix_hashes[(idx - 1) & (MAX_SPARSE_GRAM_SIZE as usize - 1)] + .wrapping_mul(POLY_POWERS[2])); + let rolling = NGram::from_rolling_hash(rolling_hash, 2); + let direct = NGram::from_bytes(&content[idx - 1..idx + 1]); + assert_eq!(rolling, direct, "mismatch at idx={idx}"); + prefix_hashes[(idx + 1) & (MAX_SPARSE_GRAM_SIZE as usize - 1)] = end_hash; + } + } +} diff --git a/crates/sparse-ngrams/src/table.rs b/crates/sparse-ngrams/src/table.rs new file mode 100644 index 0000000..70d55f5 --- /dev/null +++ b/crates/sparse-ngrams/src/table.rs @@ -0,0 +1,41 @@ +//! 
Bigram priority table. +//! +//! Assigns a frequency-based priority to each byte pair, used by the sparse n-gram +//! extraction algorithm to decide where n-gram boundaries fall. + +use std::sync::OnceLock; + +use crate::NUM_FREQUENT_BIGRAMS; + +/// The bigrams in this string are sorted by how frequently they occur in code (descending). +/// Bigrams are separated by null bytes. Only the first [`NUM_FREQUENT_BIGRAMS`] entries +/// receive nonzero priority; all other byte pairs default to 0. +static BIGRAMS_STR: &str = include_str!("bigrams.bin"); + +/// Flat 256×256 lookup table indexed by `a as usize * 256 + b`. +/// Entries default to 0 for bigrams not in the frequency table. +static BIGRAM_TABLE: OnceLock<Box<[u16; 256 * 256]>> = OnceLock::new(); + +/// Returns the bigram priority table. The first call initializes it (thread-safe). +pub(crate) fn get_bigram_table() -> &'static [u16; 256 * 256] { + BIGRAM_TABLE.get_or_init(|| { + let mut table = Box::new([0u16; 256 * 256]); + for (idx, s) in BIGRAMS_STR + .split('\0') + .take(NUM_FREQUENT_BIGRAMS) + .enumerate() + { + let mut chars = s.chars(); + let Some((a, b)) = chars.next().zip(chars.next()) else { + continue; + }; + let a = (a as u8).to_ascii_lowercase(); + let b = (b as u8).to_ascii_lowercase(); + // Higher-frequency bigrams get HIGHER values so they are more often + // encompassed by longer grams. + table[a as usize * 256 + b as usize] = + (NUM_FREQUENT_BIGRAMS - idx) as u16; + } + table + }) +}