diff --git a/Cargo.toml b/Cargo.toml index 312f46d..4e584bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", + "crates/sparse-ngrams", ] resolver = "2" diff --git a/README.md b/README.md index ae3acce..0dbe85f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ A collection of useful algorithms written in Rust. Currently contains: - [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters. - [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents. - [`bpe-openai`](crates/bpe-openai): Fast tokenizers for OpenAI token sets based on the `bpe` crate. +- [`sparse-ngrams`](crates/sparse-ngrams): fast sparse n-gram extraction from byte slices. Selects variable-length n-grams (2–8 bytes) deterministically using bigram frequency priorities, suitable for substring search indexes. - [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries. ## Background diff --git a/crates/sparse-ngrams/Cargo.toml b/crates/sparse-ngrams/Cargo.toml new file mode 100644 index 0000000..d5bcde0 --- /dev/null +++ b/crates/sparse-ngrams/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "sparse-ngrams" +version = "0.1.0" +edition = "2021" +description = "Fast sparse n-gram extraction from byte slices." 
+repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["ngram", "algorithm", "search", "index"] +categories = ["algorithms", "data-structures", "text-processing"] + +[lib] +bench = false + +[[bench]] +name = "performance" +path = "benchmarks/performance.rs" +harness = false + +[dev-dependencies] +criterion = "0.5" diff --git a/crates/sparse-ngrams/README.md b/crates/sparse-ngrams/README.md new file mode 100644 index 0000000..f12808e --- /dev/null +++ b/crates/sparse-ngrams/README.md @@ -0,0 +1,79 @@ +# sparse-ngrams + +Fast sparse n-gram extraction from byte slices. + +Sparse grams select variable-length n-grams (2–8 bytes) without extracting all possible substrings. The algorithm is deterministic: the same extraction logic applies to every substring, making it suitable for substring search indexes. + +## How it works + +Each consecutive byte pair (bigram) is assigned a frequency-based priority from a precomputed table. An n-gram boundary occurs wherever a bigram has lower priority than all bigrams between it and the previous boundary. This is computed efficiently using a monotone deque or a scan-based approach. + +For a document of N bytes, this produces at most 3(N−1) n-grams: N−1 bigrams, plus up to 2(N−1) algorithmically selected longer n-grams (up to 8 bytes). + +### Selection criterion + +A substring of length 3–8 is emitted as a sparse n-gram if and only if every interior bigram priority is strictly greater than the maximum of the left and right boundary bigram priorities. 
+ +## Usage + +```rust +use sparse_ngrams::{collect_sparse_grams, NGram, MAX_SPARSE_GRAM_SIZE}; + +let input = b"hello world"; +let grams = collect_sparse_grams(input); +for gram in &grams { + assert!(gram.len() >= 2); + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); +} +``` + +## Performance + +Benchmarks on an Apple M1 (15 KB input, `lib.rs` source file): + +| Variant | Throughput | +|---------|-----------| +| `deque` | ~3.5 GB/s | +| `scan` | ~4.9 GB/s | + +The `scan` variant is ~40% faster than the deque variant by replacing the monotone deque with a fixed-size circular buffer and a suffix-minimum scan. + +## Bigram table size + +The priority table maps byte pairs to frequency-based priorities. Increasing the table size (number of ranked bigrams) produces more distinct longer n-grams, but saturates quickly: + +![Unique n-grams vs. table size](images/unique_ngrams_vs_table_size.png) + +| Table size | Unique n-grams | % of max | +|-----------|---------------|----------| +| 100 | 6.2M | 79.4% | +| 200 | 6.7M | 85.9% | +| 400 | 7.1M | 91.1% | +| 800 | 7.5M | 96.2% | +| 1,600 | 7.8M | 99.0% | +| 3,200 | 7.8M | 99.9% | +| 6,400+ | 7.8M | 100% | + +Beyond ~6,400 entries the table saturates — additional bigram rankings produce no new n-grams since all occurring byte pairs already have distinct priorities. + +## Maximum n-gram length + +Increasing the maximum n-gram length produces more unique longer grams, with diminishing returns: + +![Unique n-grams vs. max length](images/unique_ngrams_vs_max_length.png) + +| Max length | Unique n-grams | vs. len=8 | +|-----------|---------------|-----------| +| 2 | 1.4M | 18% | +| 3 | 4.6M | 59% | +| 4 | 5.8M | 74% | +| 6 | 7.1M | 90% | +| 8 | 7.8M | 100% | +| 12 | 8.7M | 111% | +| 16 | 9.2M | 118% | +| 24 | 9.8M | 124% | +| 32 | 10.0M | 128% | +| 48 | 10.3M | 131% | +| 64 | 10.4M | 132% | + +The default of 8 captures most of the discriminative power. 
Going to 16 adds ~18% more unique grams but doubles the scan window; going to 64 adds only ~32% total. diff --git a/crates/sparse-ngrams/benchmarks/performance.rs b/crates/sparse-ngrams/benchmarks/performance.rs new file mode 100644 index 0000000..4a24063 --- /dev/null +++ b/crates/sparse-ngrams/benchmarks/performance.rs @@ -0,0 +1,32 @@ +use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main}; +use sparse_ngrams::{NGram, collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams}; + +fn bench_collect(c: &mut Criterion) { + let inputs: Vec<(&str, Vec<u8>)> = vec![ + ("small_11B", b"hello world".to_vec()), + ( + "medium_900B", + "the quick brown fox jumps over the lazy dog. " + .repeat(20) + .into_bytes(), + ), + ("large_15KB", include_str!("../src/lib.rs").as_bytes().to_vec()), + ]; + + let mut group = c.benchmark_group("collect"); + for (name, input) in &inputs { + let mut buf = vec![NGram::from_bytes(b"xx"); max_sparse_grams(input.len())]; + group.throughput(Throughput::Bytes(input.len() as u64)); + + group.bench_with_input(BenchmarkId::new("deque", name), input, |b, input| { + b.iter(|| collect_sparse_grams_deque(black_box(input), &mut buf)) + }); + group.bench_with_input(BenchmarkId::new("scan", name), input, |b, input| { + b.iter(|| collect_sparse_grams_scan(black_box(input), &mut buf)) + }); + } + group.finish(); +} + +criterion_group!(benches, bench_collect); +criterion_main!(benches); diff --git a/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png b/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png new file mode 100644 index 0000000..d878c2e Binary files /dev/null and b/crates/sparse-ngrams/images/unique_ngrams_vs_max_length.png differ diff --git a/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png b/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png new file mode 100644 index 0000000..218143a Binary files /dev/null and 
b/crates/sparse-ngrams/images/unique_ngrams_vs_table_size.png differ diff --git a/crates/sparse-ngrams/src/bigrams.bin b/crates/sparse-ngrams/src/bigrams.bin new file mode 100644 index 0000000..d011c90 Binary files /dev/null and b/crates/sparse-ngrams/src/bigrams.bin differ diff --git a/crates/sparse-ngrams/src/deque.rs b/crates/sparse-ngrams/src/deque.rs new file mode 100644 index 0000000..3738ba0 --- /dev/null +++ b/crates/sparse-ngrams/src/deque.rs @@ -0,0 +1,71 @@ +//! Stack-allocated circular buffer (monotone deque). + +use std::mem::MaybeUninit; + +/// Deque element representing two neighboring bytes in the input. +#[derive(Debug, Clone, Copy)] +pub(crate) struct PosStateBytes { + /// Absolute index position between the two bigram characters. + /// I.e. 1 references the very first bigram. + pub index: u32, + pub value: u16, +} + +/// Stack-allocated circular buffer holding up to `CAP` elements. +/// Replaces `VecDeque` — avoids heap allocation and fits in a +/// single cache line for small CAP values. 
+pub(crate) struct FixedDeque<const CAP: usize> { + data: [MaybeUninit<PosStateBytes>; CAP], + start: u8, + len: u8, +} + +impl<const CAP: usize> FixedDeque<CAP> { + pub fn new() -> Self { + Self { + data: [MaybeUninit::uninit(); CAP], + start: 0, + len: 0, + } + } + + #[inline] + pub fn front(&self) -> Option<&PosStateBytes> { + if self.len == 0 { + None + } else { + Some(unsafe { self.data[self.start as usize].assume_init_ref() }) + } + } + + #[inline] + pub fn back(&self) -> Option<&PosStateBytes> { + if self.len == 0 { + None + } else { + let idx = (self.start + self.len - 1) as usize % CAP; + Some(unsafe { self.data[idx].assume_init_ref() }) + } + } + + #[inline] + pub fn pop_front(&mut self) { + debug_assert!(self.len > 0); + self.start = (self.start + 1) % CAP as u8; + self.len -= 1; + } + + #[inline] + pub fn pop_back(&mut self) { + debug_assert!(self.len > 0); + self.len -= 1; + } + + #[inline] + pub fn push_back(&mut self, val: PosStateBytes) { + debug_assert!((self.len as usize) < CAP); + let idx = (self.start + self.len) as usize % CAP; + self.data[idx] = MaybeUninit::new(val); + self.len += 1; + } +} diff --git a/crates/sparse-ngrams/src/extract.rs b/crates/sparse-ngrams/src/extract.rs new file mode 100644 index 0000000..0f845fa --- /dev/null +++ b/crates/sparse-ngrams/src/extract.rs @@ -0,0 +1,350 @@ +//! Core sparse n-gram extraction algorithm. + +use crate::deque::{FixedDeque, PosStateBytes}; +use crate::ngram::{NGram, POLY_HASH_PRIME, POLY_POWERS}; +use crate::table::get_bigram_table; +use crate::MAX_SPARSE_GRAM_SIZE; + +/// Returns the maximum number of sparse n-grams that can be produced from +/// `content_len` bytes of input. Use this to pre-allocate the output slice. +#[inline] +pub const fn max_sparse_grams(content_len: usize) -> usize { + if content_len < 2 { + 0 + } else { + (content_len - 1) * 3 + } +} + +/// Collect all sparse n-grams from the input byte slice into a new [`Vec`]. 
+pub fn collect_sparse_grams(content: &[u8]) -> Vec<NGram> { + let mut buf = vec![NGram::from_rolling_hash(0, 0); max_sparse_grams(content.len())]; + let count = collect_sparse_grams_deque(content, &mut buf); + buf.truncate(count); + buf +} + +/// Deque-based extraction. Writes n-grams into `out` (must have at least +/// [`max_sparse_grams`]`(content.len())` slots). Returns the count written. +/// +/// # Panics +/// +/// Panics if `out` is too small. +pub fn collect_sparse_grams_deque(content: &[u8], out: &mut [NGram]) -> usize { + let n = content.len(); + if n < 2 { + return 0; + } + assert!(out.len() >= max_sparse_grams(n)); + let table = get_bigram_table(); + let mut queue = FixedDeque::<{ MAX_SPARSE_GRAM_SIZE as usize }>::new(); + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE as usize]; + prefix_hashes[1] = content[0] as u32; + let mut w = 0usize; + + for idx in 1..n as u32 { + let mask = MAX_SPARSE_GRAM_SIZE as usize - 1; + let end_hash = prefix_hashes[idx as usize & mask] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx as usize] as u32); + + // Bigram + let bigram_hash = end_hash + .wrapping_sub(prefix_hashes[(idx as usize - 1) & mask].wrapping_mul(POLY_POWERS[2])); + out[w] = NGram::from_rolling_hash(bigram_hash, 2); + w += 1; + + let v1 = + table[content[idx as usize - 1] as usize * 256 + content[idx as usize] as usize]; + + if let Some(begin) = queue.front() { + if idx - begin.index + 1 >= MAX_SPARSE_GRAM_SIZE { + queue.pop_front(); + } + } + while let Some(begin) = queue.back() { + let start = begin.index as usize - 1; + let len = (idx - begin.index + 2) as usize; + let hash = end_hash.wrapping_sub(prefix_hashes[start & mask].wrapping_mul(POLY_POWERS[len])); + out[w] = NGram::from_rolling_hash(hash, len); + w += 1; + if begin.value == v1 { + queue.pop_back(); + break; + } else if begin.value <= v1 { + break; + } + queue.pop_back(); + } + queue.push_back(PosStateBytes { + index: idx, + value: v1, + }); + prefix_hashes[(idx as usize + 1) & mask] 
= end_hash; + } + w +} + +/// Queue-free scan-based extraction. Writes n-grams into `out` (must have at least +/// [`max_sparse_grams`]`(content.len())` slots). Returns the count written. +/// +/// Produces identical output (same order) as [`collect_sparse_grams_deque`]. +/// +/// # Panics +/// +/// Panics if `out` is too small. +pub fn collect_sparse_grams_scan(content: &[u8], out: &mut [NGram]) -> usize { + let n = content.len(); + if n < 2 { + return 0; + } + assert!(out.len() >= max_sparse_grams(n)); + + let table = get_bigram_table(); + const MASK: usize = MAX_SPARSE_GRAM_SIZE as usize - 1; + let mut w = 0usize; + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE as usize]; + prefix_hashes[1] = content[0] as u32; + let mut priorities = [u16::MAX; MAX_SPARSE_GRAM_SIZE as usize]; + for idx in 1..n as u32 { + let end_hash = prefix_hashes[idx as usize & MASK] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx as usize] as u32); + // Bigram + let bigram_hash = end_hash + .wrapping_sub(prefix_hashes[(idx as usize - 1) & MASK].wrapping_mul(POLY_POWERS[2])); + out[w] = NGram::from_rolling_hash(bigram_hash, 2); + w += 1; + let v1 = + table[content[idx as usize - 1] as usize * 256 + content[idx as usize] as usize]; + priorities[idx as usize & MASK] = v1; + let mut running_min = u16::MAX; + for d in 1..=(MAX_SPARSE_GRAM_SIZE - 2) { + if d >= idx { + break; + } + let p = idx.wrapping_sub(d) as usize & MASK; + let v_p = priorities[p]; + if v_p < running_min { + running_min = v_p; + let start = p.wrapping_sub(1) & MASK; + let len = d as usize + 2; + let hash = end_hash.wrapping_sub(prefix_hashes[start].wrapping_mul(POLY_POWERS[len])); + out[w] = NGram::from_rolling_hash(hash, len); + w += 1; + if v_p <= v1 { + break; + } + } + } + prefix_hashes[(idx as usize + 1) & MASK] = end_hash; + } + w +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::table::get_bigram_table; + use std::collections::HashSet; + + fn collect_to_vec(content: &[u8], f: fn(&[u8], &mut 
[NGram]) -> usize) -> Vec<NGram> { + let mut buf = vec![NGram::from_rolling_hash(0, 0); max_sparse_grams(content.len())]; + let count = f(content, &mut buf); + buf.truncate(count); + buf + } + + /// Brute-force reference implementation. + /// + /// Enumerates all substrings of length 2..=MAX_SPARSE_GRAM_SIZE and emits those + /// where every interior bigram priority is strictly greater than `max(left, right)` + /// boundary bigram priority. All bigrams (len=2) are always emitted. + fn brute_force_sparse_grams(content: &[u8]) -> HashSet<NGram> { + let table = get_bigram_table(); + let n = content.len(); + let mut result = HashSet::new(); + if n < 2 { + return result; + } + // All bigrams. + for i in 0..n - 1 { + result.insert(NGram::from_bytes(&content[i..i + 2])); + } + // Longer grams: length 3..=MAX_SPARSE_GRAM_SIZE. + for len in 3..=MAX_SPARSE_GRAM_SIZE as usize { + 'outer: for start in 0..=n.saturating_sub(len) { + if start + len > n { + break; + } + let left = table[content[start] as usize * 256 + content[start + 1] as usize]; + let right = + table[content[start + len - 2] as usize * 256 + content[start + len - 1] as usize]; + let boundary = left.max(right); + // Inner bigrams: bytes [start+1,start+2], ..., [start+len-3,start+len-2] + for k in 1..len - 2 { + let p = table[content[start + k] as usize * 256 + content[start + k + 1] as usize]; + if p <= boundary { + continue 'outer; + } + } + result.insert(NGram::from_bytes(&content[start..start + len])); + } + } + result + } + + #[test] + fn test_empty_input() { + assert!(collect_sparse_grams(b"").is_empty()); + } + + #[test] + fn test_single_byte() { + assert!(collect_sparse_grams(b"a").is_empty()); + } + + #[test] + fn test_two_bytes() { + let grams = collect_sparse_grams(b"ab"); + assert_eq!(grams.len(), 1); + assert_eq!(grams[0], NGram::from_bytes(b"ab")); + } + + #[test] + fn test_three_bytes() { + let grams = collect_sparse_grams(b"abc"); + assert!(grams.len() >= 2); + assert_eq!(grams[0], NGram::from_bytes(b"ab")); + 
assert_eq!(grams[1], NGram::from_bytes(b"bc")); + } + + #[test] + fn test_gram_lengths_bounded() { + let input = b"self.reset_states(the_quick_brown_fox_jumps"; + let grams = collect_sparse_grams(input); + for gram in &grams { + assert!(gram.len() >= 2, "gram too short: {gram:?}"); + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize, "gram too long: {gram:?}"); + } + } + + #[test] + fn test_produces_longer_grams() { + let grams = collect_sparse_grams(b"self.reset_states("); + assert!(grams.iter().any(|g| g.len() > 2)); + } + + #[test] + fn test_max_gram_size_boundary() { + let grams = collect_sparse_grams(b"abcdefgh"); + for gram in &grams { + assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); + } + } + + #[test] + fn test_repeated_bytes() { + let grams = collect_sparse_grams(b"aaaaaaaaaa"); + assert!(grams.iter().filter(|g| g.len() == 2).count() >= 9); + } + + #[test] + fn test_gram_count_scales_linearly() { + let input: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect(); + let grams = collect_sparse_grams(&input); + assert!(grams.len() >= input.len() - 1); + assert!(grams.len() <= input.len() * 3); + } + + // -- Equivalence: scan vs deque -- + + #[test] + fn test_scan_equivalence_small() { + for input in [b"" as &[u8], b"x", b"ab", b"abc", b"abcdefgh", b"abcdefghi"] { + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + "mismatch on {:?}", + std::str::from_utf8(input).unwrap_or("?") + ); + } + } + + #[test] + fn test_scan_equivalence_hello_world() { + let input = b"hello world"; + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + ); + } + + #[test] + fn test_scan_equivalence_large() { + let input: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect(); + assert_eq!( + collect_to_vec(&input, collect_sparse_grams_deque), + collect_to_vec(&input, collect_sparse_grams_scan), + ); + } + + #[test] + fn 
test_scan_equivalence_source_code() { + let input = include_bytes!("lib.rs"); + assert_eq!( + collect_to_vec(input, collect_sparse_grams_deque), + collect_to_vec(input, collect_sparse_grams_scan), + ); + } + + // -- Brute-force equivalence -- + + fn assert_matches_brute_force(input: &[u8]) { + let grams = collect_sparse_grams(input); + let actual: HashSet<NGram> = grams.into_iter().collect(); + let expected = brute_force_sparse_grams(input); + let only_actual: Vec<_> = actual.difference(&expected).collect(); + let only_expected: Vec<_> = expected.difference(&actual).collect(); + if !only_actual.is_empty() || !only_expected.is_empty() { + panic!( + "mismatch on input len={}\n  only in algorithm: {:?}\n  only in brute force: {:?}", + input.len(), only_actual, only_expected + ); + } + } + + #[test] + fn test_brute_force_small() { + for input in [b"" as &[u8], b"x", b"ab", b"abc", b"abcd", b"abcdefgh", b"abcdefghi"] { + assert_matches_brute_force(input); + } + } + + #[test] + fn test_brute_force_hello_world() { + assert_matches_brute_force(b"hello world"); + } + + #[test] + fn test_brute_force_repeated() { + assert_matches_brute_force(b"aaaaaaaaaa"); + } + + #[test] + fn test_brute_force_code_snippet() { + assert_matches_brute_force(b"self.reset_states(the_quick_brown_fox_jumps"); + } + + #[test] + fn test_brute_force_diverse() { + let input: Vec<u8> = (0..200).map(|i| (i % 256) as u8).collect(); + assert_matches_brute_force(&input); + } + + #[test] + fn test_brute_force_source_code() { + let input = include_bytes!("lib.rs"); + assert_matches_brute_force(input); + } +} diff --git a/crates/sparse-ngrams/src/lib.rs b/crates/sparse-ngrams/src/lib.rs new file mode 100644 index 0000000..ca2c064 --- /dev/null +++ b/crates/sparse-ngrams/src/lib.rs @@ -0,0 +1,45 @@ +//! Sparse n-gram extraction from byte slices. +//! +//! Sparse grams are a way of selecting variable-length n-grams (longer than 2 bytes) without +//! extracting all possible n-grams. 
The algorithm is deterministic: the same extraction logic +//! works for every substring, so that substring searches are supported. +//! +//! # How it works +//! +//! Each consecutive byte pair (bigram) is assigned a priority based on how frequently it occurs +//! in a large code corpus. A monotone deque tracks potential n-gram boundaries: an n-gram +//! boundary occurs wherever a bigram has lower priority than all bigrams between it and the +//! previous boundary. +//! +//! For a document of N bytes, this produces at most 3(N-1) n-grams: all bigrams plus algorithmically +//! selected longer n-grams (up to [`MAX_SPARSE_GRAM_SIZE`] bytes). +//! +//! # Example +//! +//! ``` +//! use sparse_ngrams::{NGram, collect_sparse_grams, MAX_SPARSE_GRAM_SIZE}; +//! +//! let input = b"hello world"; +//! let grams = collect_sparse_grams(input); +//! assert!(grams.len() > input.len() - 1); +//! for gram in &grams { +//! assert!(gram.len() >= 2); +//! assert!(gram.len() <= MAX_SPARSE_GRAM_SIZE as usize); +//! } +//! ``` + +mod deque; +mod extract; +mod ngram; +mod table; + +pub use ngram::NGram; + +/// Number of high-frequency bigrams used to build the priority table. +pub const NUM_FREQUENT_BIGRAMS: usize = 65534; + +/// Maximum length (in bytes) of a sparse n-gram. +pub const MAX_SPARSE_GRAM_SIZE: u32 = 8; + +pub use extract::{collect_sparse_grams, collect_sparse_grams_deque, collect_sparse_grams_scan, max_sparse_grams}; + diff --git a/crates/sparse-ngrams/src/murmur.rs b/crates/sparse-ngrams/src/murmur.rs new file mode 100644 index 0000000..b970224 --- /dev/null +++ b/crates/sparse-ngrams/src/murmur.rs @@ -0,0 +1,67 @@ +//! Murmur1 hash function (ported from blackbird_core). 
+ +use std::hash::Hasher; + +trait XorRsh { + fn xor_rsh(self, nbits: u32) -> Self; +} + +impl XorRsh for u32 { + #[inline] + fn xor_rsh(self, nbits: u32) -> u32 { + self ^ (self >> nbits) + } +} + +fn murmur1_hash(bytes: &[u8], seed: u32) -> u32 { + const M: u32 = 0xc6a4_a793; + const R: u32 = 16; + + let mut h = seed ^ (bytes.len() as u32).wrapping_mul(M); + + let chunks_len = bytes.len() / 4 * 4; + for chunk in bytes[..chunks_len].chunks_exact(4) { + let ptr = chunk.as_ptr() as *const u32; + let k = unsafe { ptr.read_unaligned() }; + h = h.wrapping_add(k).wrapping_mul(M).xor_rsh(R); + } + + let mut tail_bytes = [0u8; 4]; + tail_bytes[..bytes.len() - chunks_len].copy_from_slice(&bytes[chunks_len..]); + h = h + .wrapping_add(u32::from_le_bytes(tail_bytes)) + .wrapping_mul(M) + .xor_rsh(R); + + h.wrapping_mul(M).xor_rsh(10).wrapping_mul(M).xor_rsh(17) +} + +/// Hasher implementing the Murmur1 hash function. +struct Murmur1Hasher { + bytes: Vec<u8>, +} + +impl Default for Murmur1Hasher { + fn default() -> Self { + Self { + bytes: Vec::with_capacity(64), + } + } +} + +impl Hasher for Murmur1Hasher { + fn write(&mut self, bytes: &[u8]) { + self.bytes.extend_from_slice(bytes); + } + + fn finish(&self) -> u64 { + murmur1_hash(&self.bytes, 0).into() + } +} + +pub(crate) fn hash_bigram(gram: (u8, u8)) -> u32 { + use std::hash::Hash; + let mut h = Murmur1Hasher::default(); + gram.hash(&mut h); + h.finish() as u32 +} diff --git a/crates/sparse-ngrams/src/ngram.rs b/crates/sparse-ngrams/src/ngram.rs new file mode 100644 index 0000000..5475dcc --- /dev/null +++ b/crates/sparse-ngrams/src/ngram.rs @@ -0,0 +1,132 @@ +//! Compact n-gram representation using a polynomial rolling hash. +//! +//! An [`NGram`] packs both a hash and the byte length into a single `u32`: +//! the upper 24 bits hold the rolling hash and the lower 8 bits hold the length. +//! This makes it suitable as a cheap, fixed-size key for hash maps and sets. 
+ +use std::fmt; + +use crate::MAX_SPARSE_GRAM_SIZE; + +/// Prime for the polynomial rolling hash. +pub(crate) const POLY_HASH_PRIME: u32 = 2_654_435_761; + +/// Precomputed powers of [`POLY_HASH_PRIME`] for rolling-hash range queries. +/// `POLY_POWERS[i] = POLY_HASH_PRIME.pow(i)` (wrapping `u32`). +pub(crate) const POLY_POWERS: [u32; MAX_SPARSE_GRAM_SIZE as usize + 1] = { + let mut p = [0u32; MAX_SPARSE_GRAM_SIZE as usize + 1]; + p[0] = 1; + let mut i = 1; + while i < p.len() { + p[i] = (p[i - 1] as u64 * POLY_HASH_PRIME as u64) as u32; + i += 1; + } + p +}; + +/// A compact n-gram identifier: upper 24 bits are a polynomial rolling hash, +/// lower 8 bits are the byte length of the n-gram. +/// +/// Two `NGram` values are equal iff both their hash and length match, which +/// greatly reduces collision probability compared to a bare hash. +/// +/// # Construction +/// +/// Use [`NGram::from_bytes`] for one-off hashing, or the rolling-hash helpers +/// inside the extraction loop for amortised O(1) computation per n-gram. +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[repr(transparent)] +pub struct NGram(pub(crate) u32); + +impl NGram { + /// Build an `NGram` by hashing the given byte slice from scratch. + pub fn from_bytes(src: &[u8]) -> Self { + let mut hash = 0u32; + for &byte in src { + hash = hash.wrapping_mul(POLY_HASH_PRIME).wrapping_add(byte as u32); + } + Self((hash << 8) | src.len() as u32) + } + + /// Build an `NGram` from a precomputed rolling hash and a length. + #[inline] + pub(crate) fn from_rolling_hash(hash: u32, len: usize) -> Self { + Self((hash << 8) | len as u32) + } + + /// The byte length of the n-gram (stored in the lower 8 bits). + #[inline] + pub fn len(&self) -> usize { + (self.0 & 0xff) as usize + } + + /// Whether this represents an empty gram (should never happen in practice). + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// The raw packed `u32` (hash ≪ 8 | len). 
+ #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl fmt::Debug for NGram { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NGram({:#x}, len={})", self.0 >> 8, self.len()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_bytes_roundtrip() { + let ngram = NGram::from_bytes(b"hello"); + assert_eq!(ngram.len(), 5); + } + + #[test] + fn test_equal_content_equal_ngram() { + assert_eq!(NGram::from_bytes(b"abc"), NGram::from_bytes(b"abc")); + } + + #[test] + fn test_different_content_likely_different() { + assert_ne!(NGram::from_bytes(b"abc"), NGram::from_bytes(b"abd")); + } + + #[test] + fn test_same_hash_different_length() { + // Even if hashes collide, different lengths produce different NGrams. + let a = NGram::from_bytes(b"ab"); + let b = NGram::from_bytes(b"abc"); + assert_ne!(a, b); + } + + #[test] + fn test_rolling_hash_matches_from_bytes() { + let content = b"hello world"; + // Build prefix hashes the same way the extraction loop does. + let mut prefix_hashes = [0u32; MAX_SPARSE_GRAM_SIZE as usize]; + if !content.is_empty() { + prefix_hashes[1] = content[0] as u32; + } + for idx in 1..content.len() { + let end_hash = prefix_hashes[idx & (MAX_SPARSE_GRAM_SIZE as usize - 1)] + .wrapping_mul(POLY_HASH_PRIME) + .wrapping_add(content[idx] as u32); + // Check the bigram content[idx-1..idx+1] + let rolling_hash = end_hash + .wrapping_sub(prefix_hashes[(idx - 1) & (MAX_SPARSE_GRAM_SIZE as usize - 1)] + .wrapping_mul(POLY_POWERS[2])); + let rolling = NGram::from_rolling_hash(rolling_hash, 2); + let direct = NGram::from_bytes(&content[idx - 1..idx + 1]); + assert_eq!(rolling, direct, "mismatch at idx={idx}"); + prefix_hashes[(idx + 1) & (MAX_SPARSE_GRAM_SIZE as usize - 1)] = end_hash; + } + } +} diff --git a/crates/sparse-ngrams/src/table.rs b/crates/sparse-ngrams/src/table.rs new file mode 100644 index 0000000..70d55f5 --- /dev/null +++ b/crates/sparse-ngrams/src/table.rs @@ -0,0 +1,41 @@ +//! 
Bigram priority table. +//! +//! Assigns a frequency-based priority to each byte pair, used by the sparse n-gram +//! extraction algorithm to decide where n-gram boundaries fall. + +use std::sync::OnceLock; + +use crate::NUM_FREQUENT_BIGRAMS; + +/// The bigrams in this string are sorted by how frequently they occur in code (descending). +/// Bigrams are separated by null bytes. Only the first [`NUM_FREQUENT_BIGRAMS`] entries +/// receive nonzero priority; all other byte pairs default to 0. +static BIGRAMS_STR: &str = include_str!("bigrams.bin"); + +/// Flat 256×256 lookup table indexed by `a as usize * 256 + b`. +/// Entries default to 0 for bigrams not in the frequency table. +static BIGRAM_TABLE: OnceLock<Box<[u16; 256 * 256]>> = OnceLock::new(); + +/// Returns the bigram priority table. The first call initializes it (thread-safe). +pub(crate) fn get_bigram_table() -> &'static [u16; 256 * 256] { + BIGRAM_TABLE.get_or_init(|| { + let mut table = Box::new([0u16; 256 * 256]); + for (idx, s) in BIGRAMS_STR + .split('\0') + .take(NUM_FREQUENT_BIGRAMS) + .enumerate() + { + let mut chars = s.chars(); + let Some((a, b)) = chars.next().zip(chars.next()) else { + continue; + }; + let a = (a as u8).to_ascii_lowercase(); + let b = (b as u8).to_ascii_lowercase(); + // Higher-frequency bigrams get HIGHER values so they are more often + // encompassed by longer grams. + table[a as usize * 256 + b as usize] = + (NUM_FREQUENT_BIGRAMS - idx) as u16; + } + table + }) +}