diff --git a/AGENTS.md b/AGENTS.md index e5c3d0cc13b..30ddeeb9515 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -151,6 +151,47 @@ Notes: self-explanatory code. - Keep public APIs small and consistent with neighboring crates. +## Performance: avoid hidden-cost accessors in hot loops + +The most common performance trap in this codebase is calling a *per-element accessor that +hides non-trivial work* inside an `O(n)` loop, instead of doing the work once over the whole +chunk. The call site looks like a cheap getter, but each call re-pays a cost that is constant +(or amortizable) across the loop, making the loop `O(n · k)`. + +Watch for these accessors used inside `for i in 0..n { ... }`: + +| Per-element accessor (in a loop) | Hidden cost | Bulk replacement | +| --- | --- | --- | +| `Validity::is_valid(i)` / `is_null(i)` | for array-backed validity, **allocates an `ExecutionCtx` and runs a scalar lookup per call** | `validity.execute_mask(len, ctx)?` once, then `Mask::value(i)` | +| `array.scalar_at(i)` / `array.execute_scalar(i, ctx)` | per-element execution through the compute stack | canonicalize once (`execute::` / `as_slice`) then index | +| `BitBuffer::value(i)` / `Mask::value(i)` accumulated into a count | recomputes the byte address each call; defeats popcount | `true_count()`, `BitBuffer::count_range(start, end)`, `set_indices()` | +| `BitIterator::next()` to accumulate a running rank/prefix count | bit-at-a-time | `count_range` over each gap (SIMD popcount) | +| re-deriving a value inside the loop (e.g. `self.validity()?` each iteration) | re-runs the derivation `n` times | hoist it above the loop | + +Decide per site — bulk is not always the answer: + +- **Sequential / contiguous access** over an accessor that hides amortizable work → hoist and + go bulk (materialize once, then index or iterate the chunk). +- **Gather over arbitrary indices** → you cannot amortize a per-element *decode*, but you can + still materialize the backing buffer once (e.g. `execute_mask`) and then do cheap `O(1)` + random reads, avoiding per-call context/allocation. +- **The accessor is already genuinely `O(1)`** (e.g. reading an already-materialized `Mask`/ + slice, or a native bitmap) → leave it; bulk would not help. + +Even after materializing into a `Mask`/`BitBuffer`, **do not loop with `value(i)` per +element** to act on each set bit — the per-element branch dominates. Iterate a `u64` word +at a time with all-set / all-unset fast paths: use [`BitBuffer::for_each_set_index`]. It +beats `for i in 0..len { if buf.value(i) {..} }` by 2-45x (more at low density) and beats +collecting `set_indices()` by ~2x at mid/high density, while self-adapting from sparse to +dense — see `vortex-mask/benches/mask_iteration.rs`. Reach for the cached `indices()` / +`slices()` representations when you need them more than once; otherwise `for_each_set_index` +needs no materialization. + +When you touch such a loop, back the change with a benchmark (see +`vortex-array/benches/validity_is_valid.rs` for the `is_valid` case, +`vortex-mask/benches/valid_counts.rs` for the popcount case, and +`vortex-mask/benches/mask_iteration.rs` for the per-element-vs-word-vs-sparse comparison). + ## Tests - Strongly consider `rstest` cases when parameterizing repetitive test logic. @@ -179,6 +220,9 @@ Check new and modified lines against this list before finishing: - Updating expected test output to match buggy behavior without independently verifying the intended semantics. - Silently reducing the scope of an approved plan when implementation is harder than expected. +- Calling a hidden-cost per-element accessor (`Validity::is_valid`, `scalar_at`, `BitBuffer:: + value` accumulation) inside a hot loop instead of materializing once — see + "Performance: avoid hidden-cost accessors in hot loops". ## Summaries diff --git a/Cargo.lock b/Cargo.lock index 6d44819c720..eea41c22bc4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9571,6 +9571,7 @@ dependencies = [ "bitvec", "cbindgen", "cc", + "codspeed-divan-compat", "custom-labels", "futures", "itertools 0.14.0", diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index 01a5b8d7a3e..caeda1ce590 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -52,3 +52,7 @@ harness = false [[bench]] name = "run_end_decode" harness = false + +[[bench]] +name = "run_end_filter" +harness = false diff --git a/encodings/runend/benches/run_end_filter.rs b/encodings/runend/benches/run_end_filter.rs new file mode 100644 index 00000000000..3f07c1c5c3c --- /dev/null +++ b/encodings/runend/benches/run_end_filter.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks for the run-end filter inner loop (`filter_run_end_primitive`). +//! +//! This measures the kernel directly rather than going through the lazy +//! `ArrayRef::filter` (which only builds a `FilterArray` node and does not run +//! the kernel). The hot work is a per-run popcount of the predicate mask, which +//! now uses `BitBuffer::count_range` (SIMD) instead of a bit-by-bit walk. + +#![expect(clippy::cast_possible_truncation)] +#![expect(clippy::cast_precision_loss)] +#![expect(clippy::cast_sign_loss)] +#![expect(clippy::expect_used)] + +use std::fmt; + +use divan::Bencher; +use rand::SeedableRng; +use rand::rngs::StdRng; +use rand::seq::SliceRandom; +use vortex_buffer::BitBuffer; +use vortex_runend::_benchmarking::filter_run_end_primitive; + +fn main() { + divan::main(); +} + +#[derive(Clone, Copy)] +struct FilterBenchArgs { + /// Total logical length of the decoded array. + length: usize, + /// Average run length used when building the run-end array. + run_length: usize, + /// Fraction of mask bits that are set to `true`. + density: f64, +} + +impl fmt::Display for FilterBenchArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "len={}_run={}_density={:.1}", + self.length, self.run_length, self.density + ) + } +} + +const FILTER_ARGS: &[FilterBenchArgs] = &[ + FilterBenchArgs { + length: 16_384, + run_length: 16, + density: 0.1, + }, + FilterBenchArgs { + length: 16_384, + run_length: 16, + density: 0.5, + }, + FilterBenchArgs { + length: 16_384, + run_length: 16, + density: 0.9, + }, + FilterBenchArgs { + length: 262_144, + run_length: 16, + density: 0.1, + }, + FilterBenchArgs { + length: 262_144, + run_length: 16, + density: 0.5, + }, + FilterBenchArgs { + length: 262_144, + run_length: 16, + density: 0.9, + }, + FilterBenchArgs { + length: 1_048_576, + run_length: 16, + density: 0.1, + }, + FilterBenchArgs { + length: 1_048_576, + run_length: 16, + density: 0.5, + }, + FilterBenchArgs { + length: 1_048_576, + run_length: 16, + density: 0.9, + }, +]; + +/// Build the run-end boundaries (cumulative run lengths) for `length` rows. +fn build_run_ends(length: usize, run_length: usize) -> Vec { + let n_runs = length.div_ceil(run_length); + (0..n_runs) + .map(|r| (((r + 1) * run_length).min(length)) as u32) + .collect() +} + +/// Build a predicate mask of `length` bits with approximately `density` set bits, +/// shuffled so the set bits are spread across runs. +fn build_mask(length: usize, density: f64) -> BitBuffer { + let n_true = (length as f64 * density).round() as usize; + let mut bits = vec![false; length]; + for b in bits.iter_mut().take(n_true) { + *b = true; + } + let mut rng = StdRng::seed_from_u64(0x5eed); + bits.shuffle(&mut rng); + BitBuffer::from(bits) +} + +#[divan::bench(args = FILTER_ARGS)] +fn filter_run_end(bencher: Bencher, args: FilterBenchArgs) { + let run_ends = build_run_ends(args.length, args.run_length); + let mask = build_mask(args.length, args.density); + let length = args.length as u64; + bencher + .with_inputs(|| (run_ends.clone(), mask.clone())) + .bench_refs(|(run_ends, mask)| { + filter_run_end_primitive::(run_ends, 0, length, mask).expect("filter") + }); +} diff --git a/encodings/runend/src/compute/filter.rs b/encodings/runend/src/compute/filter.rs index 60644e2bced..d1f112cbee0 100644 --- a/encodings/runend/src/compute/filter.rs +++ b/encodings/runend/src/compute/filter.rs @@ -5,6 +5,7 @@ use std::cmp::min; use std::ops::AddAssign; use num_traits::AsPrimitive; +use num_traits::NumCast; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -74,7 +75,7 @@ impl FilterKernel for RunEnd { } // Code adapted from apache arrow-rs https://github.com/apache/arrow-rs/blob/b1f5c250ebb6c1252b4e7c51d15b8e77f4c361fa/arrow-select/src/filter.rs#L425 -fn filter_run_end_primitive + AsPrimitive>( +pub fn filter_run_end_primitive + AsPrimitive>( run_ends: &[R], offset: u64, length: u64, @@ -87,16 +88,21 @@ fn filter_run_end_primitive + AsPrimitiv let mut count = R::zero(); let new_mask: Mask = BitBuffer::collect_bool(run_ends.len(), |i| { - let mut keep = false; let end = min(run_ends[i].as_() - offset, length); - // Safety: predicate must be the same length as the array the ends have been taken from - for pred in (start..end).map(|i| unsafe { - mask.value_unchecked(i.try_into().vortex_expect("index must fit in usize")) - }) { - count += >::from(pred); - keep |= pred - } + // SIMD popcount of the predicate bits in this run. The range matches the + // bit-by-bit `value_unchecked` read it replaces, so `end <= mask.len()`. + let start_usize = start + .try_into() + .vortex_expect("run start index must fit in usize"); + let end_usize = end + .try_into() + .vortex_expect("run end index must fit in usize"); + let run_trues = mask.count_range(start_usize, end_usize); + count += ::from(run_trues) + .vortex_expect("run popcount must fit in run-end native type"); + let keep = run_trues > 0; + // this is to avoid branching new_run_ends[j] = count; j += keep as usize; diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index 8770dbbf58e..9059f9ed673 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -21,6 +21,7 @@ mod rules; #[doc(hidden)] pub mod _benchmarking { + pub use compute::filter::filter_run_end_primitive; pub use compute::take::take_indices_unchecked; use super::*; diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index d5abd7bef44..44789f3da61 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -180,6 +180,10 @@ required-features = ["_test-harness"] name = "dict_mask" harness = false +[[bench]] +name = "validity_is_valid" +harness = false + [[bench]] name = "dict_unreferenced_mask" harness = false diff --git a/vortex-array/benches/validity_is_valid.rs b/vortex-array/benches/validity_is_valid.rs new file mode 100644 index 00000000000..d37f06cd7c9 --- /dev/null +++ b/vortex-array/benches/validity_is_valid.rs @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks the cost of reading array-backed [`Validity`] per element. +//! +//! `Validity::is_valid(i)` for the `Validity::Array` variant spins up a fresh +//! execution context and executes a scalar lookup on *every* call, so calling it +//! in a `for i in 0..n` loop is pathologically slow. The fix used by callers is to +//! materialize the validity into a `Mask` once (`execute_mask`) and then do cheap +//! O(1) bit reads via `Mask::value`. This bench contrasts the two. +//! +//! Sizes are kept small because the per-element variant is intentionally the slow +//! one we are measuring. + +#![allow(clippy::unwrap_used)] + +use divan::Bencher; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; + +fn main() { + divan::main(); +} + +const SIZES: &[usize] = &[256, 1024, 4096]; + +/// Build an array-backed validity (~10% nulls so it is `Validity::Array`). +fn array_validity(len: usize) -> Validity { + Validity::from(BitBuffer::from_iter( + (0..len).map(|i| !i.is_multiple_of(10)), + )) +} + +/// Per-element `Validity::is_valid` over array-backed validity (the antipattern). +#[divan::bench(args = SIZES)] +fn is_valid_per_element(bencher: Bencher, len: usize) { + let validity = array_validity(len); + bencher.with_inputs(|| &validity).bench_refs(|validity| { + let mut count = 0usize; + for i in 0..len { + count += validity.is_valid(i).unwrap() as usize; + } + count + }); +} + +/// Materialize the validity into a `Mask` once, then read bits (the fix). +#[divan::bench(args = SIZES)] +fn execute_mask_then_value(bencher: Bencher, len: usize) { + let validity = array_validity(len); + bencher.with_inputs(|| &validity).bench_refs(|validity| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mask = validity.execute_mask(len, &mut ctx).unwrap(); + let mut count = 0usize; + for i in 0..len { + count += mask.value(i) as usize; + } + count + }); +} diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 2bcb4ef0c1f..3aecff85bb7 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -209,9 +209,16 @@ impl ListViewArray { let mut new_sizes = BufferMut::::with_capacity(len); let mut take_indices = BufferMut::::with_capacity(self.elements().len()); + // Materialize the validity into a mask once. `self.validity()` re-derives the + // listview validity, and `Validity::is_valid` allocates a fresh execution context + // per call for array-backed validity, so checking it per row would be quadratic-ish. + let validity_mask = self + .validity()? + .execute_mask(len, &mut LEGACY_SESSION.create_execution_ctx())?; + let mut n_elements = NewOffset::zero(); for index in 0..len { - if !self.validity()?.is_valid(index)? { + if !validity_mask.value(index) { new_offsets.push(n_elements); new_sizes.push(S::zero()); continue; @@ -292,9 +299,15 @@ impl ListViewArray { let mut new_elements_builder = builder_with_capacity(element_dtype.as_ref(), self.elements().len()); + // Materialize the validity into a mask once (see `rebuild_by_offset`): avoids + // re-deriving the listview validity and allocating an execution context per row. + let validity_mask = self + .validity()? + .execute_mask(len, &mut LEGACY_SESSION.create_execution_ctx())?; + let mut n_elements = NewOffset::zero(); for index in 0..len { - if !self.validity()?.is_valid(index)? { + if !validity_mask.value(index) { // For NULL lists, place them after the previous item's data to maintain the // no-overlap invariant for zero-copy to `ListArray` arrays. new_offsets.push(n_elements); diff --git a/vortex-array/src/arrays/varbin/compute/filter.rs b/vortex-array/src/arrays/varbin/compute/filter.rs index b3e1aa87a4f..75ec3fa14af 100644 --- a/vortex-array/src/arrays/varbin/compute/filter.rs +++ b/vortex-array/src/arrays/varbin/compute/filter.rs @@ -24,7 +24,6 @@ use crate::arrays::varbin::builder::VarBinBuilder; use crate::dtype::DType; use crate::dtype::IntegerPType; use crate::match_each_integer_ptype; -use crate::validity::Validity; impl FilterKernel for VarBin { fn filter( @@ -164,13 +163,17 @@ fn filter_select_var_bin_by_index( ctx: &mut ExecutionCtx, ) -> VortexResult { let offsets = values.offsets().clone().execute::(ctx)?; + // Materialize validity into a mask once, rather than calling `Validity::is_valid` + // per selected index (which allocates an execution context per call for + // array-backed validity). + let validity_mask = values.validity()?.execute_mask(values.len(), ctx)?; match_each_integer_ptype!(offsets.ptype(), |O| { filter_select_var_bin_by_index_primitive_offset( values.dtype().clone(), offsets.as_slice::(), values.bytes().as_slice(), mask_indices, - values.validity()?, + &validity_mask, selection_count, ) }) @@ -181,13 +184,12 @@ fn filter_select_var_bin_by_index_primitive_offset( offsets: &[O], data: &[u8], mask_indices: &[usize], - // TODO(ngates): pass LogicalValidity instead - validity: Validity, + validity_mask: &Mask, selection_count: usize, ) -> VortexResult { let mut builder = VarBinBuilder::::with_capacity(selection_count); for idx in mask_indices.iter().copied() { - if validity.is_valid(idx)? { + if validity_mask.value(idx) { let (start, end) = ( offsets[idx].to_usize().ok_or_else(|| { vortex_err!("Failed to convert offset to usize: {}", offsets[idx]) diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs index 204205d1f51..b0b9bcf1bc8 100644 --- a/vortex-array/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -129,6 +129,13 @@ impl Validity { } /// Returns whether the `index` item is valid. + /// + /// # Performance + /// + /// For the [`Validity::Array`] variant this allocates an execution context and runs a + /// scalar lookup on every call, so it must not be called in a loop. To check many + /// positions, materialize the validity once with [`Validity::execute_mask`] and read it + /// with `Mask::value`. #[inline] pub fn is_valid(&self, index: usize) -> VortexResult { Ok(match self { @@ -143,6 +150,11 @@ impl Validity { }) } + /// Returns whether the `index` item is null. + /// + /// # Performance + /// + /// See [`Validity::is_valid`]: do not call this in a loop for array-backed validity. #[inline] pub fn is_null(&self, index: usize) -> VortexResult { Ok(!self.is_valid(index)?) diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs index 83b28e24b4a..12b38bf4e1b 100644 --- a/vortex-buffer/src/bit/buf.rs +++ b/vortex-buffer/src/bit/buf.rs @@ -319,6 +319,20 @@ impl BitBuffer { count_ones(self.buffer.as_slice(), self.offset, self.len) } + /// Get the number of set bits in the bit range `[start, end)`. + /// + /// Unlike `self.slice(start..end).true_count()`, this counts directly over the + /// existing backing buffer without allocating or cloning a new [`BitBuffer`], + /// making it cheap to call repeatedly over many small ranges. + /// + /// Panics if `start > end` or `end > len`. + #[inline] + pub fn count_range(&self, start: usize, end: usize) -> usize { + assert!(start <= end, "start {start} exceeds end {end}"); + assert!(end <= self.len, "end {end} exceeds len {}", self.len); + count_ones(self.buffer.as_slice(), self.offset + start, end - start) + } + /// Returns the position of the `nth` set bit (0-indexed). /// /// This is the "select" operation on a bitmap: given a rank `nth`, find @@ -349,6 +363,38 @@ impl BitBuffer { BitSliceIterator::new(self.buffer.as_slice(), self.offset, self.len) } + /// Invoke `f(index)` for every set bit, in ascending order, processing a `u64` + /// word at a time. + /// + /// This is the fast way to "do something for each set bit": it skips all-zero + /// words, fast-paths all-one words, and walks the remaining bits with + /// `trailing_zeros`. Prefer it over `for i in 0..len { if buf.value(i) { f(i) } }` + /// (which pays a branch per element) and over collecting [`Self::set_indices`] + /// (whose per-`next` iterator state does not inline as well). + #[inline] + pub fn for_each_set_index(&self, mut f: F) { + let chunks = self.chunks(); + // `remainder_bits` zero-pads above the logical length, so every set bit it + // reports is in-bounds and it is never all-ones (so the all-set fast path + // below is only ever taken for genuine full words). + let remainder = chunks.remainder_bits(); + let mut base = 0usize; + for word in chunks.iter().chain(std::iter::once(remainder)) { + if word == u64::MAX { + for k in 0..64 { + f(base + k); + } + } else { + let mut w = word; + while w != 0 { + f(base + w.trailing_zeros() as usize); + w &= w - 1; + } + } + base += 64; + } + } + /// Created a new BitBuffer with offset reset to 0 pub fn sliced(&self) -> Self { if self.offset.is_multiple_of(8) { @@ -795,6 +841,75 @@ mod tests { } } + #[rstest] + #[case(0, 0)] + #[case(0, 64)] + #[case(5, 70)] + #[case(64, 130)] + #[case(0, 200)] + fn test_count_range(#[case] start: usize, #[case] end: usize) { + let len = 200; + let buf = BitBuffer::collect_bool(len, |i| i % 3 == 0); + let expected = (start..end).filter(|i| i % 3 == 0).count(); + assert_eq!(buf.count_range(start, end), expected); + // Must agree with slicing then counting. + assert_eq!( + buf.count_range(start, end), + buf.slice(start..end).true_count() + ); + } + + #[rstest] + #[case(3)] + #[case(7)] + fn test_count_range_with_offset(#[case] offset: usize) { + let len = 150; + let buf = BitBuffer::collect_bool(offset + len, |i| i % 2 == 0); + let view = BitBuffer::new_with_offset(buf.inner().clone(), len, offset); + for (start, end) in [(0, len), (10, 100), (1, 2), (63, 129)] { + let expected = (offset + start..offset + end) + .filter(|i| i % 2 == 0) + .count(); + assert_eq!(view.count_range(start, end), expected, "[{start}, {end})"); + } + } + + #[rstest] + #[case(0)] + #[case(1)] + #[case(63)] + #[case(64)] + #[case(65)] + #[case(200)] + #[case(1000)] + fn test_for_each_set_index_matches_set_indices(#[case] len: usize) { + let buf = BitBuffer::collect_bool(len, |i| i % 5 == 0 || i % 7 == 0); + let expected: Vec = buf.set_indices().collect(); + let mut got = Vec::new(); + buf.for_each_set_index(|i| got.push(i)); + assert_eq!(got, expected); + } + + #[rstest] + #[case(3, 200)] + #[case(7, 130)] + fn test_for_each_set_index_with_offset(#[case] offset: usize, #[case] len: usize) { + let base = BitBuffer::collect_bool(offset + len, |i| i % 3 == 0); + let view = BitBuffer::new_with_offset(base.inner().clone(), len, offset); + let expected: Vec = view.set_indices().collect(); + let mut got = Vec::new(); + view.for_each_set_index(|i| got.push(i)); + assert_eq!(got, expected); + } + + #[test] + fn test_for_each_set_index_all_set() { + let buf = BitBuffer::new_set(130); + let mut got = Vec::new(); + buf.for_each_set_index(|i| got.push(i)); + assert_eq!(got, (0..130).collect::>()); + } + #[test] fn test_map_cmp_conditional() { // map_cmp with conditional logic based on index and bit value diff --git a/vortex-duckdb/Cargo.toml b/vortex-duckdb/Cargo.toml index 0f5e6b4797e..09cf40a6228 100644 --- a/vortex-duckdb/Cargo.toml +++ b/vortex-duckdb/Cargo.toml @@ -44,6 +44,7 @@ vortex-utils = { workspace = true, features = ["dashmap"] } [dev-dependencies] anyhow = { workspace = true } +divan = { workspace = true } jiff = { workspace = true } rstest = { workspace = true } tempfile = { workspace = true } @@ -51,6 +52,10 @@ vortex-array = { workspace = true, features = ["_test-harness"] } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } +[[bench]] +name = "bool_export" +harness = false + [lints] workspace = true diff --git a/vortex-duckdb/benches/bool_export.rs b/vortex-duckdb/benches/bool_export.rs new file mode 100644 index 00000000000..38d2fc7c562 --- /dev/null +++ b/vortex-duckdb/benches/bool_export.rs @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Microbenchmark for the bit -> byte-bool unpack performed by `BoolExporter::export`. +//! +//! DuckDB stores booleans as one byte per value (`&mut [bool]`), while Vortex stores them +//! bit-packed in a [`BitBuffer`]. On every boolean column export we unpack a slice of the +//! bit buffer into the destination byte slice. +//! +//! This bench isolates that pure unpacking logic so it can run without a live DuckDB +//! vector: a reused `Vec` stands in for the DuckDB byte-bool destination. It compares +//! the previous implementation (`.iter().collect::>()` followed by +//! `copy_from_slice`) against the new allocation-free direct zip-write. + +use divan::Bencher; +use vortex::buffer::BitBuffer; + +fn main() { + divan::main(); +} + +const SIZES: &[usize] = &[1024, 16_384, 262_144]; + +/// Bit densities (true ratio), expressed as `(numerator, denominator)`. +const DENSITIES: &[(usize, usize)] = &[(1, 2), (1, 10), (9, 10)]; + +fn make_buffer(len: usize, density: (usize, usize)) -> BitBuffer { + let (num, den) = density; + // `+ 8` so we can slice off a non-byte-aligned offset, matching real exports. + BitBuffer::from_iter((0..len + 8).map(|i| (i % den) < num)) +} + +/// Previous implementation: allocate a throwaway `Vec` then copy into the destination. +#[divan::bench(args = SIZES, consts = [0usize, 1usize, 2usize])] +fn old_collect_copy(bencher: Bencher, len: usize) { + let buffer = make_buffer(len, DENSITIES[DENSITY_IDX]); + bencher + .with_inputs(|| vec![false; len]) + .bench_refs(|dst: &mut Vec| { + // Offset by 1 to exercise a non-byte-aligned slice like the export hot path. + dst.copy_from_slice(&buffer.slice(1..(1 + len)).iter().collect::>()); + divan::black_box(&dst); + }); +} + +/// New implementation: zip the sliced bit iterator directly into the destination slice. +#[divan::bench(args = SIZES, consts = [0usize, 1usize, 2usize])] +fn new_zip_write(bencher: Bencher, len: usize) { + let buffer = make_buffer(len, DENSITIES[DENSITY_IDX]); + bencher + .with_inputs(|| vec![false; len]) + .bench_refs(|dst: &mut Vec| { + for (slot, bit) in dst.iter_mut().zip(buffer.slice(1..(1 + len)).iter()) { + *slot = bit; + } + divan::black_box(&dst); + }); +} diff --git a/vortex-duckdb/src/exporter/bool.rs b/vortex-duckdb/src/exporter/bool.rs index 84fd17f0789..1202d0cfe1b 100644 --- a/vortex-duckdb/src/exporter/bool.rs +++ b/vortex-duckdb/src/exporter/bool.rs @@ -47,13 +47,16 @@ impl ColumnExporter for BoolExporter { ) -> VortexResult<()> { // DuckDB uses byte bools, not bit bools. // maybe we can convert into these from a compressed array sometimes?. - unsafe { vector.as_slice_mut(len) }.copy_from_slice( - &self - .bit_buffer - .slice(offset..(offset + len)) - .iter() - .collect::>(), - ); + // Unpack the bits directly into the destination byte slice, avoiding the + // throwaway `Vec` allocation and the extra copy pass. The sliced + // iterator already accounts for the buffer's bit offset. + let dst = unsafe { vector.as_slice_mut(len) }; + for (slot, bit) in dst + .iter_mut() + .zip(self.bit_buffer.slice(offset..(offset + len)).iter()) + { + *slot = bit; + } Ok(()) } diff --git a/vortex-mask/Cargo.toml b/vortex-mask/Cargo.toml index a2af3d27cb0..24aaa7110b5 100644 --- a/vortex-mask/Cargo.toml +++ b/vortex-mask/Cargo.toml @@ -39,5 +39,13 @@ harness = false name = "rank" harness = false +[[bench]] +name = "valid_counts" +harness = false + +[[bench]] +name = "mask_iteration" +harness = false + [lints] workspace = true diff --git a/vortex-mask/benches/mask_iteration.rs b/vortex-mask/benches/mask_iteration.rs new file mode 100644 index 00000000000..40caee28c3f --- /dev/null +++ b/vortex-mask/benches/mask_iteration.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Strategy comparison for "process the valid elements of a mask". +//! +//! The kernel is `sum of values[i] for every set bit i` — a stand-in for any +//! validity-gated loop. We compare: +//! +//! * `per_element_value` — `for i in 0..n { if buf.value(i) {..} }` (what callers +//! typically write after materializing validity into a mask) +//! * `bit_iterator` — arrow's `BitIterator` (tracks a word internally) +//! * `word_trailing_zeros`— iterate `u64` words, fast-path all-set/all-unset, and +//! walk set bits with `trailing_zeros` / `w &= w - 1` +//! * `set_slices` — iterate contiguous true runs (`set_slices`) +//! * `set_indices` — iterate set positions (`set_indices`) +//! +//! Run across densities to expose the dense-vs-sparse crossover. + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use divan::Bencher; +use vortex_buffer::BitBuffer; + +fn main() { + divan::main(); +} + +const ARGS: &[(usize, f64)] = &[ + (65_536, 0.01), + (65_536, 0.10), + (65_536, 0.50), + (65_536, 0.90), + (65_536, 0.99), +]; + +fn make(len: usize, density: f64) -> (BitBuffer, Vec) { + let threshold = (density * 1000.0) as usize; + let buf = BitBuffer::from_iter((0..len).map(|i| (i * 7 + 13) % 1000 < threshold)); + let values = (0..len as u64).collect(); + (buf, values) +} + +#[divan::bench(args = ARGS)] +fn per_element_value(bencher: Bencher, (len, density): (usize, f64)) { + let (buf, values) = make(len, density); + bencher + .with_inputs(|| (&buf, &values)) + .bench_refs(|(buf, values)| { + let mut acc = 0u64; + for i in 0..len { + if buf.value(i) { + acc = acc.wrapping_add(values[i]); + } + } + acc + }); +} + +#[divan::bench(args = ARGS)] +fn bit_iterator(bencher: Bencher, (len, density): (usize, f64)) { + let (buf, values) = make(len, density); + bencher + .with_inputs(|| (&buf, &values)) + .bench_refs(|(buf, values)| { + let mut acc = 0u64; + for (i, set) in buf.iter().enumerate() { + if set { + acc = acc.wrapping_add(values[i]); + } + } + acc + }); +} + +#[divan::bench(args = ARGS)] +fn word_trailing_zeros(bencher: Bencher, (len, density): (usize, f64)) { + let (buf, values) = make(len, density); + bencher + .with_inputs(|| (&buf, &values)) + .bench_refs(|(buf, values)| { + let mut acc = 0u64; + let chunks = buf.chunks(); + let mut base = 0usize; + for word in chunks.iter() { + if word == u64::MAX { + for k in 0..64 { + acc = acc.wrapping_add(values[base + k]); + } + } else { + let mut w = word; + while w != 0 { + let b = w.trailing_zeros() as usize; + acc = acc.wrapping_add(values[base + b]); + w &= w - 1; + } + } + base += 64; + } + let mut w = chunks.remainder_bits(); + let rem = buf.len() - base; + while w != 0 { + let b = w.trailing_zeros() as usize; + if b >= rem { + break; + } + acc = acc.wrapping_add(values[base + b]); + w &= w - 1; + } + acc + }); +} + +#[divan::bench(args = ARGS)] +fn for_each_set_index(bencher: Bencher, (len, density): (usize, f64)) { + let (buf, values) = make(len, density); + bencher + .with_inputs(|| (&buf, &values)) + .bench_refs(|(buf, values)| { + let mut acc = 0u64; + buf.for_each_set_index(|i| acc = acc.wrapping_add(values[i])); + acc + }); +} + +#[divan::bench(args = ARGS)] +fn set_slices(bencher: Bencher, (len, density): (usize, f64)) { + let (buf, values) = make(len, density); + bencher + .with_inputs(|| (&buf, &values)) + .bench_refs(|(buf, values)| { + let mut acc = 0u64; + for (start, end) in buf.set_slices() { + for i in start..end { + acc = acc.wrapping_add(values[i]); + } + } + acc + }); +} + +#[divan::bench(args = ARGS)] +fn set_indices(bencher: Bencher, (len, density): (usize, f64)) { + let (buf, values) = make(len, density); + bencher + .with_inputs(|| (&buf, &values)) + .bench_refs(|(buf, values)| { + let mut acc = 0u64; + for i in buf.set_indices() { + acc = acc.wrapping_add(values[i]); + } + acc + }); +} diff --git a/vortex-mask/benches/valid_counts.rs b/vortex-mask/benches/valid_counts.rs new file mode 100644 index 00000000000..359eeb2ea30 --- /dev/null +++ b/vortex-mask/benches/valid_counts.rs @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks for `Mask::valid_counts_for_indices`. +//! +//! This mirrors the hot path in the pco/zstd slice decoders, which call +//! `valid_counts_for_indices(&[slice_start, slice_stop])` to translate a row +//! range into a value range. The cost is dominated by counting set bits in the +//! prefix `[0, slice_stop)`, so a SIMD popcount over the bit buffer should beat +//! a bit-by-bit walk handily for large `slice_stop`. + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use divan::Bencher; +use vortex_buffer::BitBuffer; +use vortex_mask::Mask; + +fn main() { + divan::main(); +} + +const BENCH_SIZES: &[(usize, f64)] = &[ + (16_384, 0.1), + (16_384, 0.9), + (262_144, 0.1), + (262_144, 0.9), + (1_048_576, 0.1), + (1_048_576, 0.9), +]; + +fn create_mask(len: usize, density: f64) -> Mask { + Mask::from_buffer(BitBuffer::from_iter((0..len).map(|i| { + let threshold = (density * 1000.0) as usize; + (i * 7 + 13) % 1000 < threshold + }))) +} + +/// The pco/zstd slice pattern: two indices bracketing a sub-range. The prefix +/// up to `slice_stop` must be counted, so `slice_stop` near the end is worst case. +#[divan::bench(args = BENCH_SIZES)] +fn slice_bounds(bencher: Bencher, (len, density): (usize, f64)) { + let mask = create_mask(len, density); + let indices = [len / 4, len - len / 8]; + bencher + .with_inputs(|| (&mask, indices)) + .bench_refs(|(mask, indices)| mask.valid_counts_for_indices(indices)); +} + +/// Many monotonically increasing indices spread across the whole mask. +#[divan::bench(args = BENCH_SIZES)] +fn many_indices(bencher: Bencher, (len, density): (usize, f64)) { + let mask = create_mask(len, density); + let stride = (len / 256).max(1); + let indices: Vec = (0..len).step_by(stride).collect(); + bencher + .with_inputs(|| (&mask, indices.as_slice())) + .bench_refs(|(mask, indices)| mask.valid_counts_for_indices(indices)); +} diff --git a/vortex-mask/src/lib.rs b/vortex-mask/src/lib.rs index 41901ba2ba3..e4b9f0d8940 100644 --- a/vortex-mask/src/lib.rs +++ b/vortex-mask/src/lib.rs @@ -607,23 +607,24 @@ impl Mask { /// Given monotonically increasing `indices` in [0, n_rows], returns the /// count of valid elements up to each index. /// - /// This is O(n_rows). + /// This is O(n_rows), but the per-gap counts are computed with a SIMD + /// popcount over the underlying bit buffer rather than walking bit-by-bit. pub fn valid_counts_for_indices(&self, indices: &[usize]) -> Vec { match self { Self::AllTrue(_) => indices.to_vec(), Self::AllFalse(_) => vec![0; indices.len()], Self::Values(values) => { - let mut bool_iter = values.bit_buffer().iter(); + let buffer = values.bit_buffer(); let mut valid_counts = Vec::with_capacity(indices.len()); let mut valid_count = 0; - let mut idx = 0; + let mut prev = 0; for &next_idx in indices { - while idx < next_idx { - idx += 1; - valid_count += bool_iter - .next() - .unwrap_or_else(|| vortex_panic!("Row indices exceed array length")) - as usize; + assert!(next_idx <= buffer.len(), "Row indices exceed array length"); + // `indices` is monotonically increasing, so each gap is counted once; + // the total work across all gaps scans the prefix `[0, last_idx)` once. + if next_idx > prev { + valid_count += buffer.count_range(prev, next_idx); + prev = next_idx; } valid_counts.push(valid_count); } @@ -758,7 +759,10 @@ impl MaskValues { } let mut indices = Vec::with_capacity(self.true_count); - indices.extend(self.buffer.set_indices()); + // Word-at-a-time set-bit walk; faster than collecting `set_indices()`, + // whose per-`next` iterator state inlines less well (see + // `vortex-mask/benches/mask_iteration.rs`). + self.buffer.for_each_set_index(|i| indices.push(i)); debug_assert!(indices.is_sorted()); assert_eq!(indices.len(), self.true_count); indices diff --git a/vortex-tensor/src/scalar_fns/l2_denorm.rs b/vortex-tensor/src/scalar_fns/l2_denorm.rs index c91a614ddff..87d9c3d4892 100644 --- a/vortex-tensor/src/scalar_fns/l2_denorm.rs +++ b/vortex-tensor/src/scalar_fns/l2_denorm.rs @@ -441,6 +441,11 @@ pub fn normalize_as_l2_denorm( let normalized_dtype = input.dtype().as_nonnullable(); let flat = extract_flat_elements(input.storage_array(), tensor_flat_size, ctx)?; + // Materialize the validity into a mask once, rather than calling + // `Validity::is_valid(i)` per row (which, for array-backed validity, spins up a + // fresh execution context and executes a scalar lookup on every call). + let norms_mask = norms_validity.execute_mask(row_count, ctx)?; + // Normalize all of the vectors. let normalized = match_each_float_ptype!(flat.ptype(), |T| { let norm_values = primitive_norms.as_slice::(); @@ -448,7 +453,7 @@ pub fn normalize_as_l2_denorm( let total_elements = row_count * tensor_flat_size; let mut elements = BufferMut::::with_capacity(total_elements); for i in 0..row_count { - let is_valid = norms_validity.is_valid(i)?; + let is_valid = norms_mask.value(i); let norm = norm_values[i]; // SAFETY: We allocated `row_count * tensor_flat_size` capacity and push exactly @@ -637,12 +642,16 @@ pub fn validate_l2_normalized_rows_against_norms( Some(norms) => normalized_validity.and(norms.validity()?)?, None => normalized_validity, }; + // Materialize the validity into a mask once, rather than calling + // `Validity::is_valid(i)` per row (which, for array-backed validity, spins up a + // fresh execution context and executes a scalar lookup on every call). + let combined_mask = combined_validity.execute_mask(row_count, ctx)?; match_each_float_ptype!(element_ptype, |T| { let stored_norms = norms.as_ref().map(|norms| norms.as_slice::()); for i in 0..row_count { - if !combined_validity.is_valid(i)? { + if !combined_mask.value(i) { continue; }