41 commits
- 4426de9 most basic elias fano (aneubeck, Oct 7, 2025)
- 9bb6a3c simple operation (aneubeck, Oct 20, 2025)
- 4c70132 benchmark + separate file (aneubeck, Nov 24, 2025)
- 07bf579 refine batch processing (aneubeck, Nov 24, 2025)
- 929ea0a back to decoding a full batch of 32 values (aneubeck, Nov 24, 2025)
- 6c6f387 loop + don't read ahead (aneubeck, Nov 24, 2025)
- 85cfdf3 fix base implementation and try generics (aneubeck, Nov 24, 2025)
- 1cd9cbb try unaligned words (aneubeck, Nov 24, 2025)
- 710102b tune batch size (aneubeck, Nov 24, 2025)
- 2521d57 decode a multiple of 32 (aneubeck, Nov 24, 2025)
- 8d62ad4 add avx version (aneubeck, Nov 25, 2025)
- e36a92a add some buffer (aneubeck, Nov 25, 2025)
- 04b8cff Almost 1GB/sec! (aneubeck, Nov 25, 2025)
- f57f67a 1.4 billion values/sec version (aneubeck, Nov 25, 2025)
- 4a56922 Add intersecting iterator (aneubeck, Dec 1, 2025)
- 5e41049 Create helper.rs (aneubeck, Dec 5, 2025)
- 79f028e run encoding on real data (aneubeck, Dec 10, 2025)
- e51ca63 add vbyte for comparison (aneubeck, Dec 10, 2025)
- 16973f1 add bitpacking (aneubeck, Dec 10, 2025)
- ba02311 remove bin (aneubeck, Dec 10, 2025)
- fa563f7 mapping (aneubeck, Dec 10, 2025)
- 0cbd36f Update reorder_docids.rs (aneubeck, Dec 10, 2025)
- 50307a8 Update encode_pisa.rs (aneubeck, Dec 10, 2025)
- 6dd118c Update encode_pisa.rs (aneubeck, Dec 10, 2025)
- e05a326 Update encode_pisa.rs (aneubeck, Dec 10, 2025)
- 570f681 novel mst sorting (aneubeck, Dec 27, 2025)
- 696c48a speed up transformation (aneubeck, Dec 27, 2025)
- 674c816 multiple ngram implementation (aneubeck, Apr 25, 2026)
- a27a5b9 avx512 (aneubeck, Apr 25, 2026)
- 95bb71c Optimize masked AVX extraction path (aneubeck, Apr 25, 2026)
- 647dee1 inline and simplify scan (aneubeck, Apr 25, 2026)
- 3272bb1 wide_avx attempt (aneubeck, Apr 27, 2026)
- 295816d change priority to u8 and make priority strict in both directions (aneubeck, Apr 27, 2026)
- feceb04 move benches to different folder (aneubeck, May 8, 2026)
- 2521d02 increase priority (aneubeck, May 8, 2026)
- d6bd09f remove the slower implementations (aneubeck, May 8, 2026)
- a540549 refactor tests (aneubeck, May 8, 2026)
- a0b80ea add readme (aneubeck, May 8, 2026)
- 48bfbbb growth by max n-gram len (aneubeck, May 8, 2026)
- 03cb4a7 remove pef crate (aneubeck, May 8, 2026)
- cc73c69 Update README.md (aneubeck, May 8, 2026)
1 change: 1 addition & 0 deletions Cargo.toml
@@ -4,6 +4,7 @@ members = [
"crates/*",
"crates/bpe/benchmarks",
"crates/bpe/tests",
"crates/sparse-ngrams",
]
resolver = "2"

1 change: 1 addition & 0 deletions README.md
@@ -5,6 +5,7 @@ A collection of useful algorithms written in Rust. Currently contains:
- [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters.
- [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents.
- [`bpe-openai`](crates/bpe-openai): Fast tokenizers for OpenAI token sets based on the `bpe` crate.
- [`sparse-ngrams`](crates/sparse-ngrams): fast sparse n-gram extraction from byte slices. Selects variable-length n-grams (2–8 bytes) deterministically using bigram frequency priorities, suitable for substring search indexes.
- [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries.

## Background
20 changes: 20 additions & 0 deletions crates/sparse-ngrams/Cargo.toml
@@ -0,0 +1,20 @@
[package]
name = "sparse-ngrams"
version = "0.1.0"
edition = "2021"
description = "Fast sparse n-gram extraction from byte slices."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["ngram", "algorithm", "search", "index"]
categories = ["algorithms", "data-structures", "text-processing"]

[lib]
bench = false

[[bench]]
name = "performance"
path = "benchmarks/performance.rs"
harness = false

[dev-dependencies]
criterion = "0.5"
79 changes: 79 additions & 0 deletions crates/sparse-ngrams/README.md
@@ -0,0 +1,79 @@
# sparse-ngrams

Fast sparse n-gram extraction from byte slices.

Sparse grams select variable-length n-grams (2–8 bytes) without extracting all possible substrings. The algorithm is deterministic: the same extraction logic applies to every substring, making it suitable for substring search indexes.

## How it works

Each consecutive byte pair (bigram) is assigned a frequency-based priority from a precomputed table. An n-gram boundary occurs wherever a bigram has lower priority than all bigrams between it and the previous boundary. This is computed efficiently using a monotone deque or a scan-based approach.

For a document of N bytes, this produces at most 3(N−1) n-grams: N−1 bigrams, plus up to 2(N−1) algorithmically selected longer n-grams (up to 8 bytes).

### Selection criterion

A substring of length 3–8 is emitted as a sparse n-gram if and only if every interior bigram priority is strictly greater than the maximum of the left and right boundary bigram priorities.
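The criterion above can be stated as a small brute-force reference. This is a hypothetical sketch, not the crate's implementation: `priority` is a toy stand-in for the precomputed bigram frequency table, and treating the empty interior of a length-3 substring as vacuously qualifying is an assumption about the criterion's edge case.

```rust
// Hypothetical brute-force reference for the selection criterion above.
// `priority` is a toy stand-in for the crate's precomputed bigram table.
fn priority(a: u8, b: u8) -> u16 {
    // any deterministic function of the byte pair works for illustration;
    // the real table ranks bigrams by corpus frequency
    ((a as u16) << 8) ^ (b as u16).wrapping_mul(31)
}

/// Emits (start, len) for every substring of length 3..=max_len whose interior
/// bigram priorities all strictly exceed the max of its two boundary bigrams.
fn sparse_grams_bruteforce(input: &[u8], max_len: usize) -> Vec<(usize, usize)> {
    // p[i] is the priority of the bigram (input[i], input[i + 1])
    let p: Vec<u16> = input.windows(2).map(|w| priority(w[0], w[1])).collect();
    let mut out = Vec::new();
    for start in 0..input.len() {
        for len in 3..=max_len.min(input.len() - start) {
            let bound = p[start].max(p[start + len - 2]); // boundary bigrams
            let interior = &p[start + 1..start + len - 2]; // empty when len == 3
            if interior.iter().all(|&q| q > bound) {
                out.push((start, len));
            }
        }
    }
    out
}

fn main() {
    // with the toy priority, "abcd" yields both trigrams but no 4-gram
    assert_eq!(sparse_grams_bruteforce(b"abcd", 4), vec![(0, 3), (1, 3)]);
}
```

The quadratic loop makes the criterion explicit; the crate's deque and scan variants produce the same selection in a single linear pass.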

## Usage

```rust
use sparse_ngrams::{collect_sparse_grams, MAX_SPARSE_GRAM_SIZE};

let input = b"hello world";
let grams = collect_sparse_grams(input);
for gram in &grams {
    assert!(gram.len() >= 2);
    assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize);
}
```

## Performance

Benchmarks on an Apple M1 (15 KB input, `lib.rs` source file):

| Variant | Throughput |
|---------|-----------|
| `deque` | ~3.5 GB/s |
| `scan` | ~4.9 GB/s |

The `scan` variant is ~40% faster than the deque variant by replacing the monotone deque with a fixed-size circular buffer and a suffix-minimum scan.
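The crate's scan variant itself is not shown here, but the underlying idea of answering sliding-window minima without a deque can be sketched with the classic per-block prefix/suffix-minimum trick. This is a hypothetical illustration; the function name and the use of raw `u16` priorities are assumptions.

```rust
// Hypothetical sketch: sliding-window minimum via per-block prefix/suffix
// minima instead of a monotone deque. Each window of width `w` spans at most
// two w-aligned blocks, so min(window) = min(suffix of left block, prefix of
// right block).
fn window_minima_scan(p: &[u16], w: usize) -> Vec<u16> {
    let n = p.len();
    if w == 0 || n < w {
        return Vec::new();
    }
    // prefix[i]: min of p from the start of i's w-aligned block through i
    let mut prefix = p.to_vec();
    for i in 1..n {
        if i % w != 0 {
            prefix[i] = prefix[i].min(prefix[i - 1]);
        }
    }
    // suffix[i]: min of p from i through the end of i's w-aligned block
    let mut suffix = p.to_vec();
    for i in (0..n - 1).rev() {
        if (i + 1) % w != 0 {
            suffix[i] = suffix[i].min(suffix[i + 1]);
        }
    }
    (0..=n - w).map(|i| suffix[i].min(prefix[i + w - 1])).collect()
}

fn main() {
    assert_eq!(window_minima_scan(&[3, 1, 4, 1, 5], 3), vec![1, 1, 1]);
}
```

Both passes are simple forward/backward scans over contiguous memory, which is what makes this formulation friendlier to the CPU than deque bookkeeping.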

## Bigram table size

The priority table maps byte pairs to frequency-based priorities. Increasing the table size (number of ranked bigrams) produces more distinct longer n-grams, but saturates quickly:

![Unique n-grams vs. table size](images/unique_ngrams_vs_table_size.png)

| Table size | Unique n-grams | % of max |
|-----------|---------------|----------|
| 100 | 6.2M | 79.4% |
| 200 | 6.7M | 85.9% |
| 400 | 7.1M | 91.1% |
| 800 | 7.5M | 96.2% |
| 1,600 | 7.8M | 99.0% |
| 3,200 | 7.8M | 99.9% |
| 6,400+ | 7.8M | 100% |

Beyond ~6,400 entries the table saturates — additional bigram rankings produce no new n-grams since all occurring byte pairs already have distinct priorities.

## Maximum n-gram length

Increasing the maximum n-gram length produces more unique longer grams, with diminishing returns:

![Unique n-grams vs. max length](images/unique_ngrams_vs_max_length.png)

| Max length | Unique n-grams | vs. len=8 |
|-----------|---------------|-----------|
| 2 | 1.4M | 18% |
| 3 | 4.6M | 59% |
| 4 | 5.8M | 74% |
| 6 | 7.1M | 90% |
| 8 | 7.8M | 100% |
| 12 | 8.7M | 111% |
| 16 | 9.2M | 118% |
| 24 | 9.8M | 124% |
| 32 | 10.0M | 128% |
| 48 | 10.3M | 131% |
| 64 | 10.4M | 132% |

The default of 8 captures most of the discriminative power. Going to 16 adds ~18% more unique grams but doubles the scan window; going to 64 adds only ~32% total.
32 changes: 32 additions & 0 deletions crates/sparse-ngrams/benchmarks/performance.rs
@@ -0,0 +1,32 @@
use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};
use sparse_ngrams::{NGram, collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams};

fn bench_collect(c: &mut Criterion) {
    let inputs: Vec<(&str, Vec<u8>)> = vec![
        ("small_11B", b"hello world".to_vec()),
        (
            "medium_900B",
            "the quick brown fox jumps over the lazy dog. "
                .repeat(20)
                .into_bytes(),
        ),
        ("large_15KB", include_str!("../src/lib.rs").as_bytes().to_vec()),
    ];

    let mut group = c.benchmark_group("collect");
    for (name, input) in &inputs {
        let mut buf = vec![NGram::from_bytes(b"xx"); max_sparse_grams(input.len())];
        group.throughput(Throughput::Bytes(input.len() as u64));

        group.bench_with_input(BenchmarkId::new("deque", name), input, |b, input| {
            b.iter(|| collect_sparse_grams_deque(black_box(input), &mut buf))
        });
        group.bench_with_input(BenchmarkId::new("scan", name), input, |b, input| {
            b.iter(|| collect_sparse_grams_scan(black_box(input), &mut buf))
        });
    }
    group.finish();
}

criterion_group!(benches, bench_collect);
criterion_main!(benches);
Binary file added crates/sparse-ngrams/src/bigrams.bin
71 changes: 71 additions & 0 deletions crates/sparse-ngrams/src/deque.rs
@@ -0,0 +1,71 @@
//! Stack-allocated circular buffer (monotone deque).

use std::mem::MaybeUninit;

/// Deque element representing two neighboring bytes in the input.
#[derive(Debug, Clone, Copy)]
pub(crate) struct PosStateBytes {
    /// Absolute index position between the two bigram characters,
    /// i.e. 1 references the very first bigram.
    pub index: u32,
    pub value: u16,
}

/// Stack-allocated circular buffer holding up to `CAP` elements.
/// Replaces `VecDeque<PosStateBytes>`: avoids heap allocation and fits in a
/// single cache line for small `CAP` values.
pub(crate) struct FixedDeque<const CAP: usize> {
    data: [MaybeUninit<PosStateBytes>; CAP],
    start: u8,
    len: u8,
}

impl<const CAP: usize> FixedDeque<CAP> {
    pub fn new() -> Self {
        Self {
            data: [MaybeUninit::uninit(); CAP],
            start: 0,
            len: 0,
        }
    }

    #[inline]
    pub fn front(&self) -> Option<&PosStateBytes> {
        if self.len == 0 {
            None
        } else {
            Some(unsafe { self.data[self.start as usize].assume_init_ref() })
        }
    }

    #[inline]
    pub fn back(&self) -> Option<&PosStateBytes> {
        if self.len == 0 {
            None
        } else {
            let idx = (self.start + self.len - 1) as usize % CAP;
            Some(unsafe { self.data[idx].assume_init_ref() })
        }
    }

    #[inline]
    pub fn pop_front(&mut self) {
        debug_assert!(self.len > 0);
        self.start = (self.start + 1) % CAP as u8;
        self.len -= 1;
    }

    #[inline]
    pub fn pop_back(&mut self) {
        debug_assert!(self.len > 0);
        self.len -= 1;
    }

    #[inline]
    pub fn push_back(&mut self, val: PosStateBytes) {
        debug_assert!((self.len as usize) < CAP);
        let idx = (self.start + self.len) as usize % CAP;
        self.data[idx] = MaybeUninit::new(val);
        self.len += 1;
    }
}
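For context on the kind of loop `FixedDeque` is built to accelerate, here is the classic monotone-deque sliding-window minimum, written against std's `VecDeque`. This is a hypothetical sketch (function name and the index-based bookkeeping are illustrative), not the crate's actual boundary-detection code.

```rust
use std::collections::VecDeque;

// Hypothetical sketch of a monotone-deque sliding-window minimum: the deque
// holds indices whose priorities are strictly increasing front to back, so
// the front is always the index of the window minimum.
fn window_min_indices(priorities: &[u16], window: usize) -> Vec<usize> {
    let mut deque: VecDeque<usize> = VecDeque::new();
    let mut minima = Vec::new();
    for (i, &p) in priorities.iter().enumerate() {
        // drop the front once it slides out of the window
        while deque.front().is_some_and(|&j| j + window <= i) {
            deque.pop_front();
        }
        // pop larger-or-equal entries from the back to keep the deque monotone
        while deque.back().is_some_and(|&j| priorities[j] >= p) {
            deque.pop_back();
        }
        deque.push_back(i);
        if i + 1 >= window {
            // index of the minimum priority among the last `window` positions
            minima.push(*deque.front().unwrap());
        }
    }
    minima
}

fn main() {
    assert_eq!(window_min_indices(&[3, 1, 4, 1, 5], 3), vec![1, 3, 3]);
}
```

Each index is pushed and popped at most once, so the loop is amortized O(1) per element; `FixedDeque` keeps that bookkeeping on the stack instead of the heap.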