diff --git a/AGENTS.md b/AGENTS.md
index e5c3d0cc13b..30ddeeb9515 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -151,6 +151,47 @@ Notes:
   self-explanatory code.
 - Keep public APIs small and consistent with neighboring crates.
 
+## Performance: avoid hidden-cost accessors in hot loops
+
+The most common performance trap in this codebase is calling a *per-element accessor that
+hides non-trivial work* inside an `O(n)` loop, instead of doing the work once over the whole
+chunk. The call site looks like a cheap getter, but each call re-pays a cost that is constant
+(or amortizable) across the loop, making the loop `O(n · k)`.
+
+Watch for these accessors used inside `for i in 0..n { ... }`:
+
+| Per-element accessor (in a loop) | Hidden cost | Bulk replacement |
+| --- | --- | --- |
+| `Validity::is_valid(i)` / `is_null(i)` | for array-backed validity, **allocates an `ExecutionCtx` and runs a scalar lookup per call** | `validity.execute_mask(len, ctx)?` once, then `Mask::value(i)` |
+| `array.scalar_at(i)` / `array.execute_scalar(i, ctx)` | per-element execution through the compute stack | canonicalize once (`execute::<PrimitiveArray>` / `as_slice`) then index |
+| `BitBuffer::value(i)` / `Mask::value(i)` accumulated into a count | recomputes the byte address each call; defeats popcount | `true_count()`, `BitBuffer::count_range(start, end)`, `set_indices()` |
+| `BitIterator::next()` to accumulate a running rank/prefix count | bit-at-a-time | `count_range` over each gap (SIMD popcount) |
+| re-deriving a value inside the loop (e.g. `self.validity()?` each iteration) | re-runs the derivation `n` times | hoist it above the loop |
+
+Decide per site — bulk is not always the answer:
+
+- **Sequential / contiguous access** over an accessor that hides amortizable work → hoist and
+  go bulk (materialize once, then index or iterate the chunk).
+- **Gather over arbitrary indices** → you cannot amortize a per-element *decode*, but you can
+  still materialize the backing buffer once (e.g. `execute_mask`) and then do cheap `O(1)`
+  random reads, avoiding per-call context/allocation.
+- **The accessor is already genuinely `O(1)`** (e.g. reading an already-materialized `Mask`/
+  slice, or a native bitmap) → leave it; bulk would not help.
+
+Even after materializing into a `Mask`/`BitBuffer`, **do not loop with `value(i)` per
+element** to act on each set bit — the per-element branch dominates. Iterate a `u64` word
+at a time with all-set / all-unset fast paths: use [`BitBuffer::for_each_set_index`]. It
+beats `for i in 0..len { if buf.value(i) {..} }` by 2-45x (more at low density) and beats
+collecting `set_indices()` by ~2x at mid/high density, while self-adapting from sparse to
+dense — see `vortex-mask/benches/mask_iteration.rs`. Reach for the cached `indices()` /
+`slices()` representations when you need them more than once; otherwise `for_each_set_index`
+needs no materialization.
+
+When you touch such a loop, back the change with a benchmark (see
+`vortex-array/benches/validity_is_valid.rs` for the `is_valid` case,
+`vortex-mask/benches/valid_counts.rs` for the popcount case, and
+`vortex-mask/benches/mask_iteration.rs` for the per-element-vs-word-vs-sparse comparison).
+
 ## Tests
 
 - Strongly consider `rstest` cases when parameterizing repetitive test logic.
@@ -179,6 +220,9 @@ Check new and modified lines against this list before finishing:
 - Updating expected test output to match buggy behavior without independently verifying the
   intended semantics.
 - Silently reducing the scope of an approved plan when implementation is harder than expected.
+- Calling a hidden-cost per-element accessor (`Validity::is_valid`, `scalar_at`, `BitBuffer::
+  value` accumulation) inside a hot loop instead of materializing once — see
+  "Performance: avoid hidden-cost accessors in hot loops".
 
 ## Summaries
 
diff --git a/Cargo.lock b/Cargo.lock
index 6d44819c720..eea41c22bc4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9571,6 +9571,7 @@ dependencies = [
  "bitvec",
  "cbindgen",
  "cc",
+ "codspeed-divan-compat",
  "custom-labels",
  "futures",
  "itertools 0.14.0",
diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
index 01a5b8d7a3e..caeda1ce590 100644
--- a/encodings/runend/Cargo.toml
+++ b/encodings/runend/Cargo.toml
@@ -52,3 +52,7 @@ harness = false
 [[bench]]
 name = "run_end_decode"
 harness = false
+
+[[bench]]
+name = "run_end_filter"
+harness = false
diff --git a/encodings/runend/benches/run_end_filter.rs b/encodings/runend/benches/run_end_filter.rs
new file mode 100644
index 00000000000..3f07c1c5c3c
--- /dev/null
+++ b/encodings/runend/benches/run_end_filter.rs
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Benchmarks for the run-end filter inner loop (`filter_run_end_primitive`).
+//!
+//! This measures the kernel directly rather than going through the lazy
+//! `ArrayRef::filter` (which only builds a `FilterArray` node and does not run
+//! the kernel). The hot work is a per-run popcount of the predicate mask, which
+//! now uses `BitBuffer::count_range` (SIMD) instead of a bit-by-bit walk.
+
+#![expect(clippy::cast_possible_truncation)]
+#![expect(clippy::cast_precision_loss)]
+#![expect(clippy::cast_sign_loss)]
+#![expect(clippy::expect_used)]
+
+use std::fmt;
+
+use divan::Bencher;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use rand::seq::SliceRandom;
+use vortex_buffer::BitBuffer;
+use vortex_runend::_benchmarking::filter_run_end_primitive;
+
+fn main() {
+    divan::main();
+}
+
+#[derive(Clone, Copy)]
+struct FilterBenchArgs {
+    /// Total logical length of the decoded array.
+    length: usize,
+    /// Average run length used when building the run-end array.
+    run_length: usize,
+    /// Fraction of mask bits that are set to `true`.
+    density: f64,
+}
+
+impl fmt::Display for FilterBenchArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "len={}_run={}_density={:.1}",
+            self.length, self.run_length, self.density
+        )
+    }
+}
+
+const FILTER_ARGS: &[FilterBenchArgs] = &[
+    FilterBenchArgs {
+        length: 16_384,
+        run_length: 16,
+        density: 0.1,
+    },
+    FilterBenchArgs {
+        length: 16_384,
+        run_length: 16,
+        density: 0.5,
+    },
+    FilterBenchArgs {
+        length: 16_384,
+        run_length: 16,
+        density: 0.9,
+    },
+    FilterBenchArgs {
+        length: 262_144,
+        run_length: 16,
+        density: 0.1,
+    },
+    FilterBenchArgs {
+        length: 262_144,
+        run_length: 16,
+        density: 0.5,
+    },
+    FilterBenchArgs {
+        length: 262_144,
+        run_length: 16,
+        density: 0.9,
+    },
+    FilterBenchArgs {
+        length: 1_048_576,
+        run_length: 16,
+        density: 0.1,
+    },
+    FilterBenchArgs {
+        length: 1_048_576,
+        run_length: 16,
+        density: 0.5,
+    },
+    FilterBenchArgs {
+        length: 1_048_576,
+        run_length: 16,
+        density: 0.9,
+    },
+];
+
+/// Build the run-end boundaries (cumulative run lengths) for `length` rows.
+fn build_run_ends(length: usize, run_length: usize) -> Vec<u32> {
+    let n_runs = length.div_ceil(run_length);
+    (0..n_runs)
+        .map(|r| (((r + 1) * run_length).min(length)) as u32)
+        .collect()
+}
+
+/// Build a predicate mask of `length` bits with approximately `density` set bits,
+/// shuffled so the set bits are spread across runs.
+fn build_mask(length: usize, density: f64) -> BitBuffer {
+    let n_true = (length as f64 * density).round() as usize;
+    let mut bits = vec![false; length];
+    for b in bits.iter_mut().take(n_true) {
+        *b = true;
+    }
+    let mut rng = StdRng::seed_from_u64(0x5eed);
+    bits.shuffle(&mut rng);
+    BitBuffer::from(bits)
+}
+
+#[divan::bench(args = FILTER_ARGS)]
+fn filter_run_end(bencher: Bencher, args: FilterBenchArgs) {
+    let run_ends = build_run_ends(args.length, args.run_length);
+    let mask = build_mask(args.length, args.density);
+    let length = args.length as u64;
+    bencher
+        .with_inputs(|| (run_ends.clone(), mask.clone()))
+        .bench_refs(|(run_ends, mask)| {
+            filter_run_end_primitive::<u32>(run_ends, 0, length, mask).expect("filter")
+        });
+}
diff --git a/encodings/runend/src/compute/filter.rs b/encodings/runend/src/compute/filter.rs
index 60644e2bced..d1f112cbee0 100644
--- a/encodings/runend/src/compute/filter.rs
+++ b/encodings/runend/src/compute/filter.rs
@@ -5,6 +5,7 @@ use std::cmp::min;
 use std::ops::AddAssign;
 
 use num_traits::AsPrimitive;
+use num_traits::NumCast;
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
@@ -74,7 +75,7 @@ impl FilterKernel for RunEnd {
 }
 
 // Code adapted from apache arrow-rs https://github.com/apache/arrow-rs/blob/b1f5c250ebb6c1252b4e7c51d15b8e77f4c361fa/arrow-select/src/filter.rs#L425
-fn filter_run_end_primitive<R: NativePType + AddAssign + From<bool> + AsPrimitive<u64>>(
+pub fn filter_run_end_primitive<R: NativePType + AddAssign + From<bool> + AsPrimitive<u64>>(
     run_ends: &[R],
     offset: u64,
     length: u64,
@@ -87,16 +88,21 @@ fn filter_run_end_primitive<R: NativePType + AddAssign + From<bool> + AsPrimitiv
     let mut count = R::zero();
 
     let new_mask: Mask = BitBuffer::collect_bool(run_ends.len(), |i| {
-        let mut keep = false;
         let end = min(run_ends[i].as_() - offset, length);
 
-        // Safety: predicate must be the same length as the array the ends have been taken from
-        for pred in (start..end).map(|i| unsafe {
-            mask.value_unchecked(i.try_into().vortex_expect("index must fit in usize"))
-        }) {
-            count += <R as From<bool>>::from(pred);
-            keep |= pred
-        }
+        // SIMD popcount of the predicate bits in this run. The range matches the
+        // bit-by-bit `value_unchecked` read it replaces, so `end <= mask.len()`.
+        let start_usize = start
+            .try_into()
+            .vortex_expect("run start index must fit in usize");
+        let end_usize = end
+            .try_into()
+            .vortex_expect("run end index must fit in usize");
+        let run_trues = mask.count_range(start_usize, end_usize);
+        count += <R as NumCast>::from(run_trues)
+            .vortex_expect("run popcount must fit in run-end native type");
+        let keep = run_trues > 0;
+
         // this is to avoid branching
         new_run_ends[j] = count;
         j += keep as usize;
diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs
index 8770dbbf58e..9059f9ed673 100644
--- a/encodings/runend/src/lib.rs
+++ b/encodings/runend/src/lib.rs
@@ -21,6 +21,7 @@ mod rules;
 
 #[doc(hidden)]
 pub mod _benchmarking {
+    pub use compute::filter::filter_run_end_primitive;
     pub use compute::take::take_indices_unchecked;
 
     use super::*;
diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index d5abd7bef44..44789f3da61 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -180,6 +180,10 @@ required-features = ["_test-harness"]
 name = "dict_mask"
 harness = false
 
+[[bench]]
+name = "validity_is_valid"
+harness = false
+
 [[bench]]
 name = "dict_unreferenced_mask"
 harness = false
diff --git a/vortex-array/benches/validity_is_valid.rs b/vortex-array/benches/validity_is_valid.rs
new file mode 100644
index 00000000000..d37f06cd7c9
--- /dev/null
+++ b/vortex-array/benches/validity_is_valid.rs
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Benchmarks the cost of reading array-backed [`Validity`] per element.
+//!
+//! `Validity::is_valid(i)` for the `Validity::Array` variant spins up a fresh
+//! execution context and executes a scalar lookup on *every* call, so calling it
+//! in a `for i in 0..n` loop is pathologically slow. The fix used by callers is to
+//! materialize the validity into a `Mask` once (`execute_mask`) and then do cheap
+//! O(1) bit reads via `Mask::value`. This bench contrasts the two.
+//!
+//! Sizes are kept small because the per-element variant is intentionally the slow
+//! one we are measuring.
+
+#![allow(clippy::unwrap_used)]
+
+use divan::Bencher;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::validity::Validity;
+use vortex_buffer::BitBuffer;
+
+fn main() {
+    divan::main();
+}
+
+const SIZES: &[usize] = &[256, 1024, 4096];
+
+/// Build an array-backed validity (~10% nulls so it is `Validity::Array`).
+fn array_validity(len: usize) -> Validity {
+    Validity::from(BitBuffer::from_iter(
+        (0..len).map(|i| !i.is_multiple_of(10)),
+    ))
+}
+
+/// Per-element `Validity::is_valid` over array-backed validity (the antipattern).
+#[divan::bench(args = SIZES)]
+fn is_valid_per_element(bencher: Bencher, len: usize) {
+    let validity = array_validity(len);
+    bencher.with_inputs(|| &validity).bench_refs(|validity| {
+        let mut count = 0usize;
+        for i in 0..len {
+            count += validity.is_valid(i).unwrap() as usize;
+        }
+        count
+    });
+}
+
+/// Materialize the validity into a `Mask` once, then read bits (the fix).
+#[divan::bench(args = SIZES)]
+fn execute_mask_then_value(bencher: Bencher, len: usize) {
+    let validity = array_validity(len);
+    bencher.with_inputs(|| &validity).bench_refs(|validity| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mask = validity.execute_mask(len, &mut ctx).unwrap();
+        let mut count = 0usize;
+        for i in 0..len {
+            count += mask.value(i) as usize;
+        }
+        count
+    });
+}
diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs
index 2bcb4ef0c1f..3aecff85bb7 100644
--- a/vortex-array/src/arrays/listview/rebuild.rs
+++ b/vortex-array/src/arrays/listview/rebuild.rs
@@ -209,9 +209,16 @@ impl ListViewArray {
         let mut new_sizes = BufferMut::<S>::with_capacity(len);
         let mut take_indices = BufferMut::<u64>::with_capacity(self.elements().len());
 
+        // Materialize the validity into a mask once. `self.validity()` re-derives the
+        // listview validity, and `Validity::is_valid` allocates a fresh execution context
+        // per call for array-backed validity, so checking it per row would be quadratic-ish.
+        let validity_mask = self
+            .validity()?
+            .execute_mask(len, &mut LEGACY_SESSION.create_execution_ctx())?;
+
         let mut n_elements = NewOffset::zero();
         for index in 0..len {
-            if !self.validity()?.is_valid(index)? {
+            if !validity_mask.value(index) {
                 new_offsets.push(n_elements);
                 new_sizes.push(S::zero());
                 continue;
@@ -292,9 +299,15 @@ impl ListViewArray {
         let mut new_elements_builder =
             builder_with_capacity(element_dtype.as_ref(), self.elements().len());
 
+        // Materialize the validity into a mask once (see `rebuild_by_offset`): avoids
+        // re-deriving the listview validity and allocating an execution context per row.
+        let validity_mask = self
+            .validity()?
+            .execute_mask(len, &mut LEGACY_SESSION.create_execution_ctx())?;
+
         let mut n_elements = NewOffset::zero();
         for index in 0..len {
-            if !self.validity()?.is_valid(index)? {
+            if !validity_mask.value(index) {
                 // For NULL lists, place them after the previous item's data to maintain the
                 // no-overlap invariant for zero-copy to `ListArray` arrays.
                 new_offsets.push(n_elements);
diff --git a/vortex-array/src/arrays/varbin/compute/filter.rs b/vortex-array/src/arrays/varbin/compute/filter.rs
index b3e1aa87a4f..75ec3fa14af 100644
--- a/vortex-array/src/arrays/varbin/compute/filter.rs
+++ b/vortex-array/src/arrays/varbin/compute/filter.rs
@@ -24,7 +24,6 @@ use crate::arrays::varbin::builder::VarBinBuilder;
 use crate::dtype::DType;
 use crate::dtype::IntegerPType;
 use crate::match_each_integer_ptype;
-use crate::validity::Validity;
 
 impl FilterKernel for VarBin {
     fn filter(
@@ -164,13 +163,17 @@ fn filter_select_var_bin_by_index(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<VarBinArray> {
     let offsets = values.offsets().clone().execute::<PrimitiveArray>(ctx)?;
+    // Materialize validity into a mask once, rather than calling `Validity::is_valid`
+    // per selected index (which allocates an execution context per call for
+    // array-backed validity).
+    let validity_mask = values.validity()?.execute_mask(values.len(), ctx)?;
     match_each_integer_ptype!(offsets.ptype(), |O| {
         filter_select_var_bin_by_index_primitive_offset(
             values.dtype().clone(),
             offsets.as_slice::<O>(),
             values.bytes().as_slice(),
             mask_indices,
-            values.validity()?,
+            &validity_mask,
             selection_count,
         )
     })
@@ -181,13 +184,12 @@ fn filter_select_var_bin_by_index_primitive_offset<O: IntegerPType>(
     offsets: &[O],
     data: &[u8],
     mask_indices: &[usize],
-    // TODO(ngates): pass LogicalValidity instead
-    validity: Validity,
+    validity_mask: &Mask,
     selection_count: usize,
 ) -> VortexResult<VarBinArray> {
     let mut builder = VarBinBuilder::<O>::with_capacity(selection_count);
     for idx in mask_indices.iter().copied() {
-        if validity.is_valid(idx)? {
+        if validity_mask.value(idx) {
             let (start, end) = (
                 offsets[idx].to_usize().ok_or_else(|| {
                     vortex_err!("Failed to convert offset to usize: {}", offsets[idx])
diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs
index 204205d1f51..b0b9bcf1bc8 100644
--- a/vortex-array/src/validity.rs
+++ b/vortex-array/src/validity.rs
@@ -129,6 +129,13 @@ impl Validity {
     }
 
     /// Returns whether the `index` item is valid.
+    ///
+    /// # Performance
+    ///
+    /// For the [`Validity::Array`] variant this allocates an execution context and runs a
+    /// scalar lookup on every call, so it must not be called in a loop. To check many
+    /// positions, materialize the validity once with [`Validity::execute_mask`] and read it
+    /// with `Mask::value`.
     #[inline]
     pub fn is_valid(&self, index: usize) -> VortexResult<bool> {
         Ok(match self {
@@ -143,6 +150,11 @@ impl Validity {
         })
     }
 
+    /// Returns whether the `index` item is null.
+    ///
+    /// # Performance
+    ///
+    /// See [`Validity::is_valid`]: do not call this in a loop for array-backed validity.
     #[inline]
     pub fn is_null(&self, index: usize) -> VortexResult<bool> {
         Ok(!self.is_valid(index)?)
diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs
index 83b28e24b4a..12b38bf4e1b 100644
--- a/vortex-buffer/src/bit/buf.rs
+++ b/vortex-buffer/src/bit/buf.rs
@@ -319,6 +319,20 @@ impl BitBuffer {
         count_ones(self.buffer.as_slice(), self.offset, self.len)
     }
 
+    /// Get the number of set bits in the bit range `[start, end)`.
+    ///
+    /// Unlike `self.slice(start..end).true_count()`, this counts directly over the
+    /// existing backing buffer without allocating or cloning a new [`BitBuffer`],
+    /// making it cheap to call repeatedly over many small ranges.
+    ///
+    /// Panics if `start > end` or `end > len`.
+    #[inline]
+    pub fn count_range(&self, start: usize, end: usize) -> usize {
+        assert!(start <= end, "start {start} exceeds end {end}");
+        assert!(end <= self.len, "end {end} exceeds len {}", self.len);
+        count_ones(self.buffer.as_slice(), self.offset + start, end - start)
+    }
+
     /// Returns the position of the `nth` set bit (0-indexed).
     ///
     /// This is the "select" operation on a bitmap: given a rank `nth`, find
@@ -349,6 +363,38 @@ impl BitBuffer {
         BitSliceIterator::new(self.buffer.as_slice(), self.offset, self.len)
     }
 
+    /// Invoke `f(index)` for every set bit, in ascending order, processing a `u64`
+    /// word at a time.
+    ///
+    /// This is the fast way to "do something for each set bit": it skips all-zero
+    /// words, fast-paths all-one words, and walks the remaining bits with
+    /// `trailing_zeros`. Prefer it over `for i in 0..len { if buf.value(i) { f(i) } }`
+    /// (which pays a branch per element) and over collecting [`Self::set_indices`]
+    /// (whose per-`next` iterator state does not inline as well).
+    #[inline]
+    pub fn for_each_set_index<F: FnMut(usize)>(&self, mut f: F) {
+        let chunks = self.chunks();
+        // `remainder_bits` zero-pads above the logical length, so every set bit it
+        // reports is in-bounds and it is never all-ones (so the all-set fast path
+        // below is only ever taken for genuine full words).
+        let remainder = chunks.remainder_bits();
+        let mut base = 0usize;
+        for word in chunks.iter().chain(std::iter::once(remainder)) {
+            if word == u64::MAX {
+                for k in 0..64 {
+                    f(base + k);
+                }
+            } else {
+                let mut w = word;
+                while w != 0 {
+                    f(base + w.trailing_zeros() as usize);
+                    w &= w - 1;
+                }
+            }
+            base += 64;
+        }
+    }
+
     /// Created a new BitBuffer with offset reset to 0
     pub fn sliced(&self) -> Self {
         if self.offset.is_multiple_of(8) {
@@ -795,6 +841,75 @@ mod tests {
         }
     }
 
+    #[rstest]
+    #[case(0, 0)]
+    #[case(0, 64)]
+    #[case(5, 70)]
+    #[case(64, 130)]
+    #[case(0, 200)]
+    fn test_count_range(#[case] start: usize, #[case] end: usize) {
+        let len = 200;
+        let buf = BitBuffer::collect_bool(len, |i| i % 3 == 0);
+        let expected = (start..end).filter(|i| i % 3 == 0).count();
+        assert_eq!(buf.count_range(start, end), expected);
+        // Must agree with slicing then counting.
+        assert_eq!(
+            buf.count_range(start, end),
+            buf.slice(start..end).true_count()
+        );
+    }
+
+    #[rstest]
+    #[case(3)]
+    #[case(7)]
+    fn test_count_range_with_offset(#[case] offset: usize) {
+        let len = 150;
+        let buf = BitBuffer::collect_bool(offset + len, |i| i % 2 == 0);
+        let view = BitBuffer::new_with_offset(buf.inner().clone(), len, offset);
+        for (start, end) in [(0, len), (10, 100), (1, 2), (63, 129)] {
+            let expected = (offset + start..offset + end)
+                .filter(|i| i % 2 == 0)
+                .count();
+            assert_eq!(view.count_range(start, end), expected, "[{start}, {end})");
+        }
+    }
+
+    #[rstest]
+    #[case(0)]
+    #[case(1)]
+    #[case(63)]
+    #[case(64)]
+    #[case(65)]
+    #[case(200)]
+    #[case(1000)]
+    fn test_for_each_set_index_matches_set_indices(#[case] len: usize) {
+        let buf = BitBuffer::collect_bool(len, |i| i % 5 == 0 || i % 7 == 0);
+        let expected: Vec<usize> = buf.set_indices().collect();
+        let mut got = Vec::new();
+        buf.for_each_set_index(|i| got.push(i));
+        assert_eq!(got, expected);
+    }
+
+    #[rstest]
+    #[case(3, 200)]
+    #[case(7, 130)]
+    fn test_for_each_set_index_with_offset(#[case] offset: usize, #[case] len: usize) {
+        let base = BitBuffer::collect_bool(offset + len, |i| i % 3 == 0);
+        let view = BitBuffer::new_with_offset(base.inner().clone(), len, offset);
+        let expected: Vec<usize> = view.set_indices().collect();
+        let mut got = Vec::new();
+        view.for_each_set_index(|i| got.push(i));
+        assert_eq!(got, expected);
+    }
+
+    #[test]
+    fn test_for_each_set_index_all_set() {
+        let buf = BitBuffer::new_set(130);
+        let mut got = Vec::new();
+        buf.for_each_set_index(|i| got.push(i));
+        assert_eq!(got, (0..130).collect::<Vec<_>>());
+    }
+
     #[test]
     fn test_map_cmp_conditional() {
         // map_cmp with conditional logic based on index and bit value
diff --git a/vortex-duckdb/Cargo.toml b/vortex-duckdb/Cargo.toml
index 0f5e6b4797e..09cf40a6228 100644
--- a/vortex-duckdb/Cargo.toml
+++ b/vortex-duckdb/Cargo.toml
@@ -44,6 +44,7 @@ vortex-utils = { workspace = true, features = ["dashmap"] }
 
 [dev-dependencies]
 anyhow = { workspace = true }
+divan = { workspace = true }
 jiff = { workspace = true }
 rstest = { workspace = true }
 tempfile = { workspace = true }
@@ -51,6 +52,10 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
 vortex-runend = { workspace = true }
 vortex-sequence = { workspace = true }
 
+[[bench]]
+name = "bool_export"
+harness = false
+
 [lints]
 workspace = true
 
diff --git a/vortex-duckdb/benches/bool_export.rs b/vortex-duckdb/benches/bool_export.rs
new file mode 100644
index 00000000000..38d2fc7c562
--- /dev/null
+++ b/vortex-duckdb/benches/bool_export.rs
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Microbenchmark for the bit -> byte-bool unpack performed by `BoolExporter::export`.
+//!
+//! DuckDB stores booleans as one byte per value (`&mut [bool]`), while Vortex stores them
+//! bit-packed in a [`BitBuffer`]. On every boolean column export we unpack a slice of the
+//! bit buffer into the destination byte slice.
+//!
+//! This bench isolates that pure unpacking logic so it can run without a live DuckDB
+//! vector: a reused `Vec<bool>` stands in for the DuckDB byte-bool destination. It compares
+//! the previous implementation (`.iter().collect::<Vec<bool>>()` followed by
+//! `copy_from_slice`) against the new allocation-free direct zip-write.
+
+use divan::Bencher;
+use vortex::buffer::BitBuffer;
+
+fn main() {
+    divan::main();
+}
+
+const SIZES: &[usize] = &[1024, 16_384, 262_144];
+
+/// Bit densities (true ratio), expressed as `(numerator, denominator)`.
+const DENSITIES: &[(usize, usize)] = &[(1, 2), (1, 10), (9, 10)];
+
+fn make_buffer(len: usize, density: (usize, usize)) -> BitBuffer {
+    let (num, den) = density;
+    // `+ 8` so we can slice off a non-byte-aligned offset, matching real exports.
+    BitBuffer::from_iter((0..len + 8).map(|i| (i % den) < num))
+}
+
+/// Previous implementation: allocate a throwaway `Vec<bool>` then copy into the destination.
+#[divan::bench(args = SIZES, consts = [0usize, 1usize, 2usize])]
+fn old_collect_copy<const DENSITY_IDX: usize>(bencher: Bencher, len: usize) {
+    let buffer = make_buffer(len, DENSITIES[DENSITY_IDX]);
+    bencher
+        .with_inputs(|| vec![false; len])
+        .bench_refs(|dst: &mut Vec<bool>| {
+            // Offset by 1 to exercise a non-byte-aligned slice like the export hot path.
+            dst.copy_from_slice(&buffer.slice(1..(1 + len)).iter().collect::<Vec<bool>>());
+            divan::black_box(&dst);
+        });
+}
+
+/// New implementation: zip the sliced bit iterator directly into the destination slice.
+#[divan::bench(args = SIZES, consts = [0usize, 1usize, 2usize])]
+fn new_zip_write<const DENSITY_IDX: usize>(bencher: Bencher, len: usize) {
+    let buffer = make_buffer(len, DENSITIES[DENSITY_IDX]);
+    bencher
+        .with_inputs(|| vec![false; len])
+        .bench_refs(|dst: &mut Vec<bool>| {
+            for (slot, bit) in dst.iter_mut().zip(buffer.slice(1..(1 + len)).iter()) {
+                *slot = bit;
+            }
+            divan::black_box(&dst);
+        });
+}
diff --git a/vortex-duckdb/src/exporter/bool.rs b/vortex-duckdb/src/exporter/bool.rs
index 84fd17f0789..1202d0cfe1b 100644
--- a/vortex-duckdb/src/exporter/bool.rs
+++ b/vortex-duckdb/src/exporter/bool.rs
@@ -47,13 +47,16 @@ impl ColumnExporter for BoolExporter {
     ) -> VortexResult<()> {
         // DuckDB uses byte bools, not bit bools.
         // maybe we can convert into these from a compressed array sometimes?.
-        unsafe { vector.as_slice_mut(len) }.copy_from_slice(
-            &self
-                .bit_buffer
-                .slice(offset..(offset + len))
-                .iter()
-                .collect::<Vec<bool>>(),
-        );
+        // Unpack the bits directly into the destination byte slice, avoiding the
+        // throwaway `Vec<bool>` allocation and the extra copy pass. The sliced
+        // iterator already accounts for the buffer's bit offset.
+        let dst = unsafe { vector.as_slice_mut(len) };
+        for (slot, bit) in dst
+            .iter_mut()
+            .zip(self.bit_buffer.slice(offset..(offset + len)).iter())
+        {
+            *slot = bit;
+        }
 
         Ok(())
     }
diff --git a/vortex-mask/Cargo.toml b/vortex-mask/Cargo.toml
index a2af3d27cb0..24aaa7110b5 100644
--- a/vortex-mask/Cargo.toml
+++ b/vortex-mask/Cargo.toml
@@ -39,5 +39,13 @@ harness = false
 name = "rank"
 harness = false
 
+[[bench]]
+name = "valid_counts"
+harness = false
+
+[[bench]]
+name = "mask_iteration"
+harness = false
+
 [lints]
 workspace = true
diff --git a/vortex-mask/benches/mask_iteration.rs b/vortex-mask/benches/mask_iteration.rs
new file mode 100644
index 00000000000..40caee28c3f
--- /dev/null
+++ b/vortex-mask/benches/mask_iteration.rs
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Strategy comparison for "process the valid elements of a mask".
+//!
+//! The kernel is `sum of values[i] for every set bit i` — a stand-in for any
+//! validity-gated loop. We compare:
+//!
+//! * `per_element_value`  — `for i in 0..n { if buf.value(i) {..} }` (what callers
+//!   typically write after materializing validity into a mask)
+//! * `bit_iterator`       — arrow's `BitIterator` (tracks a word internally)
+//! * `word_trailing_zeros`— iterate `u64` words, fast-path all-set/all-unset, and
+//!   walk set bits with `trailing_zeros` / `w &= w - 1`
+//! * `set_slices`         — iterate contiguous true runs (`set_slices`)
+//! * `set_indices`        — iterate set positions (`set_indices`)
+//!
+//! Run across densities to expose the dense-vs-sparse crossover.
+
+#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)]
+
+use divan::Bencher;
+use vortex_buffer::BitBuffer;
+
+fn main() {
+    divan::main();
+}
+
+const ARGS: &[(usize, f64)] = &[
+    (65_536, 0.01),
+    (65_536, 0.10),
+    (65_536, 0.50),
+    (65_536, 0.90),
+    (65_536, 0.99),
+];
+
+fn make(len: usize, density: f64) -> (BitBuffer, Vec<u64>) {
+    let threshold = (density * 1000.0) as usize;
+    let buf = BitBuffer::from_iter((0..len).map(|i| (i * 7 + 13) % 1000 < threshold));
+    let values = (0..len as u64).collect();
+    (buf, values)
+}
+
+#[divan::bench(args = ARGS)]
+fn per_element_value(bencher: Bencher, (len, density): (usize, f64)) {
+    let (buf, values) = make(len, density);
+    bencher
+        .with_inputs(|| (&buf, &values))
+        .bench_refs(|(buf, values)| {
+            let mut acc = 0u64;
+            for i in 0..len {
+                if buf.value(i) {
+                    acc = acc.wrapping_add(values[i]);
+                }
+            }
+            acc
+        });
+}
+
+#[divan::bench(args = ARGS)]
+fn bit_iterator(bencher: Bencher, (len, density): (usize, f64)) {
+    let (buf, values) = make(len, density);
+    bencher
+        .with_inputs(|| (&buf, &values))
+        .bench_refs(|(buf, values)| {
+            let mut acc = 0u64;
+            for (i, set) in buf.iter().enumerate() {
+                if set {
+                    acc = acc.wrapping_add(values[i]);
+                }
+            }
+            acc
+        });
+}
+
+#[divan::bench(args = ARGS)]
+fn word_trailing_zeros(bencher: Bencher, (len, density): (usize, f64)) {
+    let (buf, values) = make(len, density);
+    bencher
+        .with_inputs(|| (&buf, &values))
+        .bench_refs(|(buf, values)| {
+            let mut acc = 0u64;
+            let chunks = buf.chunks();
+            let mut base = 0usize;
+            for word in chunks.iter() {
+                if word == u64::MAX {
+                    for k in 0..64 {
+                        acc = acc.wrapping_add(values[base + k]);
+                    }
+                } else {
+                    let mut w = word;
+                    while w != 0 {
+                        let b = w.trailing_zeros() as usize;
+                        acc = acc.wrapping_add(values[base + b]);
+                        w &= w - 1;
+                    }
+                }
+                base += 64;
+            }
+            let mut w = chunks.remainder_bits();
+            let rem = buf.len() - base;
+            while w != 0 {
+                let b = w.trailing_zeros() as usize;
+                if b >= rem {
+                    break;
+                }
+                acc = acc.wrapping_add(values[base + b]);
+                w &= w - 1;
+            }
+            acc
+        });
+}
+
+#[divan::bench(args = ARGS)]
+fn for_each_set_index(bencher: Bencher, (len, density): (usize, f64)) {
+    let (buf, values) = make(len, density);
+    bencher
+        .with_inputs(|| (&buf, &values))
+        .bench_refs(|(buf, values)| {
+            let mut acc = 0u64;
+            buf.for_each_set_index(|i| acc = acc.wrapping_add(values[i]));
+            acc
+        });
+}
+
+#[divan::bench(args = ARGS)]
+fn set_slices(bencher: Bencher, (len, density): (usize, f64)) {
+    let (buf, values) = make(len, density);
+    bencher
+        .with_inputs(|| (&buf, &values))
+        .bench_refs(|(buf, values)| {
+            let mut acc = 0u64;
+            for (start, end) in buf.set_slices() {
+                for i in start..end {
+                    acc = acc.wrapping_add(values[i]);
+                }
+            }
+            acc
+        });
+}
+
+#[divan::bench(args = ARGS)]
+fn set_indices(bencher: Bencher, (len, density): (usize, f64)) {
+    let (buf, values) = make(len, density);
+    bencher
+        .with_inputs(|| (&buf, &values))
+        .bench_refs(|(buf, values)| {
+            let mut acc = 0u64;
+            for i in buf.set_indices() {
+                acc = acc.wrapping_add(values[i]);
+            }
+            acc
+        });
+}
diff --git a/vortex-mask/benches/valid_counts.rs b/vortex-mask/benches/valid_counts.rs
new file mode 100644
index 00000000000..359eeb2ea30
--- /dev/null
+++ b/vortex-mask/benches/valid_counts.rs
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Benchmarks for `Mask::valid_counts_for_indices`.
+//!
+//! This mirrors the hot path in the pco/zstd slice decoders, which call
+//! `valid_counts_for_indices(&[slice_start, slice_stop])` to translate a row
+//! range into a value range. The cost is dominated by counting set bits in the
+//! prefix `[0, slice_stop)`, so a SIMD popcount over the bit buffer should beat
+//! a bit-by-bit walk handily for large `slice_stop`.
+
+#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)]
+
+use divan::Bencher;
+use vortex_buffer::BitBuffer;
+use vortex_mask::Mask;
+
+fn main() {
+    divan::main();
+}
+
+const BENCH_SIZES: &[(usize, f64)] = &[
+    (16_384, 0.1),
+    (16_384, 0.9),
+    (262_144, 0.1),
+    (262_144, 0.9),
+    (1_048_576, 0.1),
+    (1_048_576, 0.9),
+];
+
+fn create_mask(len: usize, density: f64) -> Mask {
+    Mask::from_buffer(BitBuffer::from_iter((0..len).map(|i| {
+        let threshold = (density * 1000.0) as usize;
+        (i * 7 + 13) % 1000 < threshold
+    })))
+}
+
+/// The pco/zstd slice pattern: two indices bracketing a sub-range. The prefix
+/// up to `slice_stop` must be counted, so `slice_stop` near the end is worst case.
+#[divan::bench(args = BENCH_SIZES)]
+fn slice_bounds(bencher: Bencher, (len, density): (usize, f64)) {
+    let mask = create_mask(len, density);
+    let indices = [len / 4, len - len / 8];
+    bencher
+        .with_inputs(|| (&mask, indices))
+        .bench_refs(|(mask, indices)| mask.valid_counts_for_indices(indices));
+}
+
+/// Many monotonically increasing indices spread across the whole mask.
+#[divan::bench(args = BENCH_SIZES)]
+fn many_indices(bencher: Bencher, (len, density): (usize, f64)) {
+    let mask = create_mask(len, density);
+    let stride = (len / 256).max(1);
+    let indices: Vec<usize> = (0..len).step_by(stride).collect();
+    bencher
+        .with_inputs(|| (&mask, indices.as_slice()))
+        .bench_refs(|(mask, indices)| mask.valid_counts_for_indices(indices));
+}
diff --git a/vortex-mask/src/lib.rs b/vortex-mask/src/lib.rs
index 41901ba2ba3..e4b9f0d8940 100644
--- a/vortex-mask/src/lib.rs
+++ b/vortex-mask/src/lib.rs
@@ -607,23 +607,24 @@ impl Mask {
     /// Given monotonically increasing `indices` in [0, n_rows], returns the
     /// count of valid elements up to each index.
     ///
-    /// This is O(n_rows).
+    /// This is O(n_rows), but the per-gap counts are computed with a SIMD
+    /// popcount over the underlying bit buffer rather than walking bit-by-bit.
     pub fn valid_counts_for_indices(&self, indices: &[usize]) -> Vec<usize> {
         match self {
             Self::AllTrue(_) => indices.to_vec(),
             Self::AllFalse(_) => vec![0; indices.len()],
             Self::Values(values) => {
-                let mut bool_iter = values.bit_buffer().iter();
+                let buffer = values.bit_buffer();
                 let mut valid_counts = Vec::with_capacity(indices.len());
                 let mut valid_count = 0;
-                let mut idx = 0;
+                let mut prev = 0;
                 for &next_idx in indices {
-                    while idx < next_idx {
-                        idx += 1;
-                        valid_count += bool_iter
-                            .next()
-                            .unwrap_or_else(|| vortex_panic!("Row indices exceed array length"))
-                            as usize;
+                    assert!(next_idx <= buffer.len(), "Row indices exceed array length");
+                    // `indices` is monotonically increasing, so each gap is counted once;
+                    // the total work across all gaps scans the prefix `[0, last_idx)` once.
+                    if next_idx > prev {
+                        valid_count += buffer.count_range(prev, next_idx);
+                        prev = next_idx;
                     }
                     valid_counts.push(valid_count);
                 }
@@ -758,7 +759,10 @@ impl MaskValues {
             }
 
             let mut indices = Vec::with_capacity(self.true_count);
-            indices.extend(self.buffer.set_indices());
+            // Word-at-a-time set-bit walk; faster than collecting `set_indices()`,
+            // whose per-`next` iterator state inlines less well (see
+            // `vortex-mask/benches/mask_iteration.rs`).
+            self.buffer.for_each_set_index(|i| indices.push(i));
             debug_assert!(indices.is_sorted());
             assert_eq!(indices.len(), self.true_count);
             indices
diff --git a/vortex-tensor/src/scalar_fns/l2_denorm.rs b/vortex-tensor/src/scalar_fns/l2_denorm.rs
index c91a614ddff..87d9c3d4892 100644
--- a/vortex-tensor/src/scalar_fns/l2_denorm.rs
+++ b/vortex-tensor/src/scalar_fns/l2_denorm.rs
@@ -441,6 +441,11 @@ pub fn normalize_as_l2_denorm(
     let normalized_dtype = input.dtype().as_nonnullable();
     let flat = extract_flat_elements(input.storage_array(), tensor_flat_size, ctx)?;
 
+    // Materialize the validity into a mask once, rather than calling
+    // `Validity::is_valid(i)` per row (which, for array-backed validity, spins up a
+    // fresh execution context and executes a scalar lookup on every call).
+    let norms_mask = norms_validity.execute_mask(row_count, ctx)?;
+
     // Normalize all of the vectors.
     let normalized = match_each_float_ptype!(flat.ptype(), |T| {
         let norm_values = primitive_norms.as_slice::<T>();
@@ -448,7 +453,7 @@ pub fn normalize_as_l2_denorm(
         let total_elements = row_count * tensor_flat_size;
         let mut elements = BufferMut::<T>::with_capacity(total_elements);
         for i in 0..row_count {
-            let is_valid = norms_validity.is_valid(i)?;
+            let is_valid = norms_mask.value(i);
             let norm = norm_values[i];
 
             // SAFETY: We allocated `row_count * tensor_flat_size` capacity and push exactly
@@ -637,12 +642,16 @@ pub fn validate_l2_normalized_rows_against_norms(
         Some(norms) => normalized_validity.and(norms.validity()?)?,
         None => normalized_validity,
     };
+    // Materialize the validity into a mask once, rather than calling
+    // `Validity::is_valid(i)` per row (which, for array-backed validity, spins up a
+    // fresh execution context and executes a scalar lookup on every call).
+    let combined_mask = combined_validity.execute_mask(row_count, ctx)?;
 
     match_each_float_ptype!(element_ptype, |T| {
         let stored_norms = norms.as_ref().map(|norms| norms.as_slice::<T>());
 
         for i in 0..row_count {
-            if !combined_validity.is_valid(i)? {
+            if !combined_mask.value(i) {
                 continue;
             }