From 4a30a9ce20113d75f3b138ca535640684417e3d0 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 14:28:16 +0000 Subject: [PATCH 01/19] feat(vortex-row): add row-oriented byte encoder crate Adds `vortex-row`, which encodes N columnar arrays into a single byte-comparable `ListView` (the Vortex analogue of arrow-row) for use as sort/row keys. Encoding runs as two scalar functions behind the `RowEncoder` API: a `RowSize` sizing/classification pass and a `RowEncode` pass that allocates one contiguous buffer and writes each column left-to-right into its per-row slot. Per-column ordering (`RowSortField`) controls ascending/ descending and null placement. Includes the byte codec for fixed-width, varlen, and nested canonical types, the `convert_columns`/`compute_row_sizes` helpers, round-trip + invariant tests, and arrow-row-baselined throughput benches. The crate is marked `publish = false` for now, so no public-api.lock is tracked. Signed-off-by: Joe Isaacs --- Cargo.lock | 20 + Cargo.toml | 3 + vortex-row/Cargo.toml | 41 ++ vortex-row/benches/row_encode.rs | 176 ++++++ vortex-row/src/codec.rs | 997 +++++++++++++++++++++++++++++++ vortex-row/src/encode.rs | 193 ++++++ vortex-row/src/encoder.rs | 138 +++++ vortex-row/src/lib.rs | 60 ++ vortex-row/src/options.rs | 193 ++++++ vortex-row/src/size.rs | 216 +++++++ vortex-row/src/tests.rs | 575 ++++++++++++++++++ 11 files changed, 2612 insertions(+) create mode 100644 vortex-row/Cargo.toml create mode 100644 vortex-row/benches/row_encode.rs create mode 100644 vortex-row/src/codec.rs create mode 100644 vortex-row/src/encode.rs create mode 100644 vortex-row/src/encoder.rs create mode 100644 vortex-row/src/lib.rs create mode 100644 vortex-row/src/options.rs create mode 100644 vortex-row/src/size.rs create mode 100644 vortex-row/src/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 9189591e620..967f0a18a09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9981,6 +9981,26 @@ dependencies = [ "vortex-tui", ] +[[package]] 
+name = "vortex-row" +version = "0.1.0" +dependencies = [ + "arrow-array", + "arrow-row", + "arrow-schema", + "bytes", + "codspeed-divan-compat", + "mimalloc", + "rand 0.10.1", + "rstest", + "smallvec", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-session", +] + [[package]] name = "vortex-runend" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index e3c3cbae67e..dd3aabd9f4a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "vortex-io", "vortex-proto", "vortex-array", + "vortex-row", "vortex-tensor", "vortex-turboquant", "vortex-compressor", @@ -104,6 +105,7 @@ arrow-cast = "58" arrow-data = "58" arrow-ipc = "58" arrow-ord = "58" +arrow-row = "58" arrow-schema = "58" arrow-select = "58" arrow-string = "58" @@ -295,6 +297,7 @@ vortex-onpair = { version = "0.1.0", path = "./encodings/experimental/onpair", d vortex-parquet-variant = { version = "0.1.0", path = "./encodings/parquet-variant" } vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false } vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false } +vortex-row = { version = "0.1.0", path = "./vortex-row", default-features = false } vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false } vortex-scan = { version = "0.1.0", path = "./vortex-scan", default-features = false } vortex-sequence = { version = "0.1.0", path = "encodings/sequence", default-features = false } diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml new file mode 100644 index 00000000000..9222c7d6a43 --- /dev/null +++ b/vortex-row/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "vortex-row" +authors = { workspace = true } +categories = { workspace = true } +description = "Row-oriented byte encoder for Vortex arrays, analogous to arrow-row." 
+edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +publish = false +readme = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +bytes = { workspace = true } +smallvec = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } +vortex-mask = { workspace = true } +vortex-session = { workspace = true } + +[dev-dependencies] +arrow-array = { workspace = true } +arrow-row = { workspace = true } +arrow-schema = { workspace = true } +divan = { workspace = true } +mimalloc = { workspace = true } +rand = { workspace = true } +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[[bench]] +name = "row_encode" +harness = false diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs new file mode 100644 index 00000000000..07493d6ad48 --- /dev/null +++ b/vortex-row/benches/row_encode.rs @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![expect( + clippy::unwrap_used, + clippy::clone_on_ref_ptr, + clippy::cloned_ref_to_slice_refs, + clippy::redundant_clone +)] + +//! Row-encode throughput benchmarks comparing `arrow-row` against Vortex's [`RowEncoder`] +//! for the core canonical scenarios: a primitive i64 column, a Utf8 column, and a +//! mixed-field struct. 
+ +use std::sync::Arc; + +use arrow_array::Int64Array; +use arrow_array::StringArray; +use arrow_array::StructArray as ArrowStructArray; +use arrow_row::RowConverter; +use arrow_row::SortField as ArrowSortField; +use arrow_schema::DataType; +use arrow_schema::Field; +use divan::counter::BytesCount; +use mimalloc::MiMalloc; +use rand::RngExt; +use rand::SeedableRng; +use rand::distr::Alphanumeric; +use rand::rngs::StdRng; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_row::RowEncoder; + +#[global_allocator] +static GLOBAL: MiMalloc = MiMalloc; + +const N: usize = 100_000; + +fn main() { + divan::main(); +} + +fn gen_i64(n: usize, seed: u64) -> Vec { + let mut rng = StdRng::seed_from_u64(seed); + (0..n) + .map(|_| rng.random_range(i64::MIN..i64::MAX)) + .collect() +} + +fn gen_words(n: usize, mean_len: usize, seed: u64) -> Vec { + let rng = &mut StdRng::seed_from_u64(seed); + (0..n) + .map(|_| { + let len = rng.random_range(mean_len.saturating_sub(4)..=mean_len + 4); + rng.sample_iter(&Alphanumeric) + .take(len) + .map(char::from) + .collect::() + }) + .collect() +} + +// ---------- primitive_i64 ---------- + +/// Baseline: `arrow-row` conversion of one `Int64` column built from `N` seeded random values. +/// +/// The throughput counter is `N * (1 + 8)` bytes, i.e. nine encoded bytes per row. +#[divan::bench] +fn primitive_i64_arrow_row(bencher: divan::Bencher) { + let v = gen_i64(N, 0); + let arr = Arc::new(Int64Array::from(v.clone())) as arrow_array::ArrayRef; + let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap(); + let bytes = (N * (1 + 8)) as u64; + bencher + .counter(BytesCount::new(bytes)) + .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap()) +} + +/// Vortex counterpart of [`primitive_i64_arrow_row`]: same seed and byte counter, encoded +/// through a default [`RowEncoder`]. A new execution context is created inside the timed +/// closure on every iteration. +#[divan::bench] +fn primitive_i64_vortex(bencher: divan::Bencher) { + let v = gen_i64(N, 0); + let col = PrimitiveArray::from_iter(v.clone()).into_array(); + let bytes = (N * (1 + 8)) as u64; + let encoder = RowEncoder::default(); 
+ bencher.counter(BytesCount::new(bytes)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + encoder.encode(&[col.clone()], &mut ctx).unwrap() + }) +} + +// ---------- utf8 ---------- + +/// Baseline: `arrow-row` conversion of one `Utf8` column of `N` seeded alphanumeric words +/// whose lengths are drawn from `16 - 4 ..= 16 + 4`. +/// +/// The throughput counter sums `1 + ceil(len / 32) * 33` bytes over all words. +#[divan::bench] +fn utf8_arrow_row(bencher: divan::Bencher) { + let words = gen_words(N, 16, 7); + let total: u64 = words + .iter() + .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64) + .sum(); + let arr = Arc::new(StringArray::from(words.clone())) as arrow_array::ArrayRef; + let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap(); + bencher + .counter(BytesCount::new(total)) + .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap()) +} + +/// Vortex counterpart of [`utf8_arrow_row`]: the same seeded words and byte counter, held in a +/// `VarBinViewArray` and encoded through a default [`RowEncoder`]. +#[divan::bench] +fn utf8_vortex(bencher: divan::Bencher) { + let words = gen_words(N, 16, 7); + let total: u64 = words + .iter() + .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64) + .sum(); + let col = VarBinViewArray::from_iter_str(words.iter().map(String::as_str)).into_array(); + let encoder = RowEncoder::default(); + bencher.counter(BytesCount::new(total)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + encoder.encode(&[col.clone()], &mut ctx).unwrap() + }) +} + +// ---------- struct_mixed ---------- + +/// Builds the inputs shared by both struct benchmarks: `N` ids (seed 1), `N` names (seed 2), +/// and the total byte count used as the throughput counter. +fn struct_mixed_inputs() -> (Vec, Vec, u64) { + let ids = gen_i64(N, 1); + let names = gen_words(N, 16, 2); + // sentinel (1) + i64 (1+8=9) + utf8-name (1 + ceil(len/32)*33) + let total: u64 = (0..N) + .map(|i| { + let name_bytes = 1 + (names[i].len().div_ceil(32) * 33) as u64; + 1u64 + 9u64 + name_bytes + }) + .sum(); + (ids, names, total) +} + +/// Baseline: `arrow-row` conversion of a single struct column with an `id: Int64` field and a +/// `name: Utf8` field, both declared with `nullable = false`. +#[divan::bench] +fn struct_mixed_arrow_row(bencher: divan::Bencher) { + let (ids, names, total) = struct_mixed_inputs(); + let id_arr = Arc::new(Int64Array::from(ids)) as arrow_array::ArrayRef; + let name_arr = Arc::new(StringArray::from(names)) as arrow_array::ArrayRef; + let arrow_struct = Arc::new(ArrowStructArray::from(vec![ + (Arc::new(Field::new("id", DataType::Int64, false)), id_arr), + ( + Arc::new(Field::new("name", 
+ DataType::Utf8, false)), + name_arr, + ), + ])) as arrow_array::ArrayRef; + let struct_fields = vec![ + Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Field::new("name", DataType::Utf8, false)), + ]; + let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Struct( + struct_fields.into(), + ))]) + .unwrap(); + bencher + .counter(BytesCount::new(total)) + .bench_local(|| conv.convert_columns(&[arrow_struct.clone()]).unwrap()) +} + +#[divan::bench] +fn struct_mixed_vortex(bencher: divan::Bencher) { + let (ids, names, total) = struct_mixed_inputs(); + let id_arr = PrimitiveArray::from_iter(ids).into_array(); + let name_arr = VarBinViewArray::from_iter_str(names.iter().map(String::as_str)).into_array(); + let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)]) + .unwrap() + .into_array(); + let encoder = RowEncoder::default(); + bencher.counter(BytesCount::new(total)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + encoder.encode(&[struct_arr.clone()], &mut ctx).unwrap() + }) +} diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs new file mode 100644 index 00000000000..2818db62aba --- /dev/null +++ b/vortex-row/src/codec.rs @@ -0,0 +1,997 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants. +//! +//! The encoded byte format produces a lexicographically byte-comparable representation: +//! comparing the byte slices of two encoded rows yields the same ordering as the +//! original logical (tuple) comparison of their values, modulo nulls placement and +//! descending-ness as configured by [`RowSortField`]. +//! +//! Conventions: +//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to +//! non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), +//! not the sentinel. +//! 
- Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each +//! for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the +//! value and column-byte boundaries stay aligned across rows. See +//! [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`]. +//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types. +//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top +//! bit; negative flips all bits. +//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null +//! body** so two null parent rows produce byte-equal encodings: fixed-width children +//! contribute their fixed null encoding, and variable-width children collapse to a single +//! null sentinel byte. + +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::FixedSizeListArray; +use vortex_array::arrays::NullArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::DecimalType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::half::f16; +use vortex_array::match_each_native_ptype; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::options::RowSortField; + +/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte). +pub(crate) const BOOL_ENCODED_SIZE: u32 = 2; + +/// Block size used in the variable-length encoding. 
+pub(crate) const VARLEN_BLOCK_SIZE: usize = 32; +/// Total bytes per varlen block including the trailing continuation marker. +pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; +/// `u32` copy of [`VARLEN_BLOCK_TOTAL`] for per-row size arithmetic; the two must stay equal. +const VARLEN_BLOCK_TOTAL_U32: u32 = 33; + +/// Size in bytes of an encoded null varlen value (just the sentinel byte). +pub(crate) const VARLEN_NULL_SIZE: u32 = 1; +/// Size in bytes of an encoded empty varlen value (just the sentinel byte). +pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1; + +/// Returns the size in bytes of the encoded form of a non-empty variable-length value. +/// +/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1 +/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and +/// [`VARLEN_EMPTY_SIZE`] for empty values. +/// +/// # Panics +/// +/// Panics if the block count or the total does not fit in a `u32`. The block count alone is +/// not a sufficient bound: at `len == u32::MAX` there are `2^27` blocks and `2^27 * 33` +/// exceeds `u32::MAX`, so the multiplication and the sentinel addition are both checked +/// instead of being left to wrap in release builds. +#[inline] +fn encoded_size_for_non_empty_varlen(len: usize) -> u32 { + debug_assert!(len > 0); + let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)) + .vortex_expect("varlen block count must fit in u32"); + blocks + .checked_mul(VARLEN_BLOCK_TOTAL_U32) + .and_then(|body| body.checked_add(1)) + .vortex_expect("encoded varlen size must fit in u32") +} + +/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel). +#[inline] +const fn encoded_size_for_fixed(value_bytes: u32) -> u32 { + 1 + value_bytes +} + +/// Converts a native byte width to `u32`, panicking via `vortex_expect` if it does not fit. +fn byte_width_u32(width: usize) -> u32 { + u32::try_from(width).vortex_expect("native byte width must fit in u32") +} + +/// Returns the sentinel byte for a null varlen value. +/// +/// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and +/// independent of `descending`, matching the convention used by `arrow-row`. +#[inline] +fn varlen_null_sentinel(field: RowSortField) -> u8 { + if field.nulls_first { 0x00 } else { 0xFF } +} + +/// Returns the sentinel byte for an empty varlen value. +/// +/// Equal to `0x01` in ascending mode and `!0x01 = 0xFE` in descending mode. 
+#[inline] +fn varlen_empty_sentinel(field: RowSortField) -> u8 { + if field.descending { !0x01u8 } else { 0x01u8 } +} + +/// Returns the sentinel byte for a non-empty varlen value. +/// +/// Equal to `0x02` in ascending mode and `!0x02 = 0xFD` in descending mode. +#[inline] +fn varlen_non_empty_sentinel(field: RowSortField) -> u8 { + if field.descending { !0x02u8 } else { 0x02u8 } +} + +/// Returns the single-byte null sentinel used when a child contributes its canonical null +/// encoding inside a null parent struct/FSL row. +/// +/// For varlen children that is the varlen null sentinel; for everything else (including +/// nested struct/FSL when used as a variable-width child) it is the fixed-width null sentinel. +fn child_canonical_null_byte(child_dtype: &DType, field: RowSortField) -> u8 { + match child_dtype { + DType::Utf8(_) | DType::Binary(_) => varlen_null_sentinel(field), + // NOTE(review): a `DType::Extension` whose storage is Utf8/Binary also lands here and + // gets the fixed-width sentinel - confirm that matches what its storage encodes. + _ => field.null_sentinel(), + } +} + +/// Per-row width classification for a column. +/// +/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless +/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary, +/// or any composite that recurses through a variable-width field). `List` is never +/// classified: [`row_width_for_dtype`] rejects it with an error. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum RowWidth { + /// Per-row width is the same constant for every row in the column. + Fixed(u32), + /// Per-row width is data-dependent. + Variable, +} + +/// Classify a column's per-row encoded width by inspecting only its [`DType`]. +/// +/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value), +/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the +/// data. +/// +/// Classification does not depend on the [`RowSortField`]: null-vs-non-null encoding width is +/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls). 
+/// +/// # Errors +/// +/// Returns an error for dtypes that the row encoder does not support. Width arithmetic that +/// would overflow `u32` is also reported as an error rather than silently saturating. +pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { + match dtype { + DType::Null => Ok(RowWidth::Fixed(1)), + DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)), + DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( + ptype.byte_width(), + )))), + DType::Decimal(dt, _) => { + // NOTE(review): the width here comes from the smallest value type that fits `dt`, + // while `add_size_decimal` / `encode_decimal` use `arr.values_type()`. Confirm the + // two always agree; otherwise struct/FSL accounting via `Fixed(w)` diverges from + // the bytes actually written. + let vt = DecimalType::smallest_decimal_value_type(dt); + if matches!(vt, DecimalType::I256) { + vortex_bail!("row encoding for Decimal256 is not yet implemented"); + } + Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( + vt.byte_width(), + )))) + } + DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable), + DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? { + // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL + // itself, then `n` copies of the element width. + RowWidth::Fixed(w) => { + let body = w + .checked_mul(*n) + .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?; + let total = body + .checked_add(1) + .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?; + Ok(RowWidth::Fixed(total)) + } + RowWidth::Variable => Ok(RowWidth::Variable), + }, + DType::Struct(fields, _) => { + // Struct is fixed iff all its fields are fixed; sum their widths plus a sentinel. + let mut total: u32 = 1; // outer sentinel + for field_dtype in fields.fields() { + match row_width_for_dtype(&field_dtype)? { + RowWidth::Fixed(w) => { + total = total.checked_add(w).ok_or_else(|| { + vortex_error::vortex_err!("Struct row width overflows u32") + })?; + } + RowWidth::Variable => return Ok(RowWidth::Variable), + } + } + Ok(RowWidth::Fixed(total)) + } + DType::List(..) 
=> { + vortex_bail!( + "row encoding does not support variable-size List arrays (no well-defined ordering)" + ) + } + DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()), + DType::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"), + } +} + +/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`. +/// +/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the +/// per-row size to each entry so multiple columns can accumulate into the same buffer. +/// `field` is only forwarded to the struct, fixed-size-list and extension helpers. +/// +/// # Errors +/// +/// Returns an error for unsupported canonical variants. +pub(crate) fn field_size( + canonical: &Canonical, + field: RowSortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match canonical { + Canonical::Null(arr) => add_size_null(arr, sizes), + // Use the shared constant so sizing agrees with `row_width_for_dtype` and `encode_bool`. + Canonical::Bool(_) => add_size_const(sizes, BOOL_ENCODED_SIZE), + Canonical::Primitive(arr) => add_size_primitive(arr, sizes), + Canonical::Decimal(arr) => add_size_decimal(arr, sizes), + Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?, + Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?, + Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?, + Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?, + Canonical::List(_) => vortex_bail!( + "row encoding does not support canonical List arrays: {:?}", + canonical.dtype() + ), + Canonical::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + } + Ok(()) +} + +/// Encode each row's bytes for the given canonical view into `out`, writing starting at +/// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of +/// bytes written. 
+/// +/// After this call returns successfully, `cursors[i]` will have advanced by exactly the +/// per-row contribution previously computed by [`field_size`] for the same column. +pub(crate) fn field_encode( + canonical: &Canonical, + field: RowSortField, + offsets: &[u32], + cursors: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match canonical { + Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out), + Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?, + Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?, + Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?, + Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?, + Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?, + Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?, + Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?, + Canonical::List(_) => vortex_bail!( + "row encoding does not support canonical List arrays: {:?}", + canonical.dtype() + ), + Canonical::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + } + Ok(()) +} + +fn add_size_const(sizes: &mut [u32], add: u32) { + for s in sizes.iter_mut() { + *s += add; + } +} + +fn add_size_null(arr: &NullArray, sizes: &mut [u32]) { + debug_assert_eq!(arr.len(), sizes.len()); + // Just a sentinel byte per row. 
+ for s in sizes.iter_mut() { + *s += 1; + } +} + +fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) { + let width = byte_width_u32(arr.ptype().byte_width()); + add_size_const(sizes, encoded_size_for_fixed(width)); +} + +fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) { + let width = byte_width_u32(arr.values_type().byte_width()); + add_size_const(sizes, encoded_size_for_fixed(width)); +} + +fn add_size_varbinview( + arr: &VarBinViewArray, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let views = arr.views(); + for (i, view) in views.iter().enumerate() { + let contribution = if !mask.value(i) { + VARLEN_NULL_SIZE + } else if view.is_empty() { + VARLEN_EMPTY_SIZE + } else { + encoded_size_for_non_empty_varlen(view.len() as usize) + }; + sizes[i] = sizes[i] + .checked_add(contribution) + .vortex_expect("per-row size overflow"); + } + Ok(()) +} + +fn add_size_struct( + arr: &StructArray, + field: RowSortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + // Outer sentinel: 1 byte per row. + for s in sizes.iter_mut() { + *s = s.checked_add(1).vortex_expect("per-row size overflow"); + } + // Each child contributes its per-row size when the parent is non-null, and a canonical + // null contribution when the parent is null. For fixed-width children both are equal, + // so we can simply add the fixed width to every row. For variable-width children the + // null contribution collapses to 1 byte, ensuring null parent rows have a constant body. + for child in arr.iter_unmasked_fields() { + match row_width_for_dtype(child.dtype())? 
{ + RowWidth::Fixed(w) => add_size_const(sizes, w), + RowWidth::Variable => { + let canonical = child.clone().execute::(ctx)?; + let mut child_sizes = vec![0u32; n]; + field_size(&canonical, field, &mut child_sizes, ctx)?; + for i in 0..n { + let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 }; + sizes[i] = sizes[i] + .checked_add(contribution) + .vortex_expect("per-row size overflow"); + } + } + } + } + Ok(()) +} + +fn add_size_fsl( + arr: &FixedSizeListArray, + field: RowSortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + debug_assert_eq!(n, sizes.len()); + let list_size = arr.list_size() as usize; + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let elem_dtype = arr.elements().dtype(); + // Outer sentinel: 1 byte per row. + for s in sizes.iter_mut() { + *s = s.checked_add(1).vortex_expect("per-row size overflow"); + } + match row_width_for_dtype(elem_dtype)? { + RowWidth::Fixed(w) => { + // Each row has `list_size` fixed-width elements regardless of null parent mask. + let body = w + .checked_mul(u32::try_from(list_size).vortex_expect("list_size fits u32")) + .vortex_expect("FSL body width overflow"); + add_size_const(sizes, body); + } + RowWidth::Variable => { + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), n * list_size); + let mut elem_sizes = vec![0u32; n * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + for i in 0..n { + let body: u32 = if mask.value(i) { + let base = i * list_size; + let mut sum: u32 = 0; + for j in 0..list_size { + sum = sum + .checked_add(elem_sizes[base + j]) + .vortex_expect("FSL row body overflow"); + } + sum + } else { + // Canonical null body for FSL with variable element: one null sentinel + // per element. (Each element contributes `child_null_width = 1`.) 
+ u32::try_from(list_size).vortex_expect("list_size fits u32") + }; + sizes[i] = sizes[i] + .checked_add(body) + .vortex_expect("FSL per-row size overflow"); + } + } + } + Ok(()) +} + +fn add_size_extension( + arr: &ExtensionArray, + field: RowSortField, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let storage = arr.storage_array().clone().execute::(ctx)?; + field_size(&storage, field, sizes, ctx) +} + +fn encode_null( + arr: &NullArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], +) { + let sentinel = field.null_sentinel(); + for i in 0..arr.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = sentinel; + col_offset[i] += 1; + } +} + +fn encode_bool( + arr: &BoolArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let bits = arr.clone().into_bit_buffer(); + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let xor = if field.descending { 0xFF } else { 0x00 }; + for i in 0..bits.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + // false=0x01, true=0x02 so false < true; XOR for descending + let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; + out[pos + 1] = raw ^ xor; + } else { + out[pos] = null; + out[pos + 1] = 0; + } + col_offset[i] += BOOL_ENCODED_SIZE; + } + Ok(()) +} + +fn encode_primitive( + arr: &PrimitiveArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match_each_native_ptype!(arr.ptype(), |T| { + encode_primitive_typed::(arr, field, row_offsets, col_offset, out, ctx)?; + }); + Ok(()) +} + +fn encode_primitive_typed( + arr: &PrimitiveArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut 
[u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let slice: &[T] = arr.as_slice(); + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let value_bytes = size_of::(); + for (i, &v) in slice.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + // Zero-fill the value bytes. + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += encoded_size_for_fixed(byte_width_u32(value_bytes)); + } + Ok(()) +} + +fn encode_decimal( + arr: &DecimalArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + match arr.values_type() { + DecimalType::I8 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I16 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I32 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I64 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I128 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I256 => { + vortex_bail!("row encoding for Decimal256 is not yet implemented") + } + } + Ok(()) +} + +fn encode_decimal_typed( + arr: &DecimalArray, + mask: &vortex_mask::Mask, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], +) where + T: vortex_array::dtype::NativeDecimalType + RowEncode, +{ + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let value_bytes = size_of::(); + let 
total = encoded_size_for_fixed(byte_width_u32(value_bytes)); + let slice = arr.buffer::(); + for i in 0..slice.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += total; + } +} + +fn encode_varbinview( + arr: &VarBinViewArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + _ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let null_byte = varlen_null_sentinel(field); + let empty_byte = varlen_empty_sentinel(field); + let non_empty_byte = varlen_non_empty_sentinel(field); + + // `with_iterator` yields `Some(bytes)` for non-null rows and `None` for null rows, + // so the iterator alone fully describes validity — no separate mask lookup needed. + arr.with_iterator(|iter| { + for (i, maybe) in iter.enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + match maybe { + None => { + out[pos] = null_byte; + col_offset[i] += VARLEN_NULL_SIZE; + } + Some([]) => { + out[pos] = empty_byte; + col_offset[i] += VARLEN_EMPTY_SIZE; + } + Some(bytes) => { + out[pos] = non_empty_byte; + let written = + encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], field.descending); + col_offset[i] += 1 + written; + } + } + } + }); + Ok(()) +} + +fn encode_struct( + arr: &StructArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + + // Write the outer sentinel for each row. 
+ for i in 0..n { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = if mask.value(i) { non_null } else { null }; + col_offset[i] += 1; + } + + // Encode each child. For non-null parent rows the child contributes its actual encoding; + // for null parent rows the child contributes its canonical null encoding so that two null + // parent rows produce byte-equal output regardless of underlying child values. + for child in arr.iter_unmasked_fields() { + match row_width_for_dtype(child.dtype())? { + RowWidth::Fixed(w) => { + let canonical = child.clone().execute::(ctx)?; + field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?; + // Replace null parent rows with the canonical null encoding (the same as a + // child-level null: null sentinel followed by zero-padded value bytes). + let null_byte = child_canonical_null_byte(child.dtype(), field); + for i in 0..n { + if !mask.value(i) { + let end = (row_offsets[i] + col_offset[i]) as usize; + let start = end - w as usize; + out[start] = null_byte; + for b in &mut out[start + 1..end] { + *b = 0; + } + } + } + } + RowWidth::Variable => { + encode_variable_child(child, field, &mask, row_offsets, col_offset, out, ctx)?; + } + } + } + + Ok(()) +} + +fn encode_fsl( + arr: &FixedSizeListArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let nrows = arr.len(); + let list_size = arr.list_size() as usize; + let mask = arr.as_ref().validity()?.execute_mask(nrows, ctx)?; + let non_null = field.non_null_sentinel(); + let null = field.null_sentinel(); + let elem_dtype = arr.elements().dtype().clone(); + + // Outer sentinel. + for i in 0..nrows { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = if mask.value(i) { non_null } else { null }; + col_offset[i] += 1; + } + + match row_width_for_dtype(&elem_dtype)? 
{ + RowWidth::Fixed(w) => { + // Fixed-width elements: encode the elements array directly (its length is + // nrows * list_size) using a derived (offsets, cursors) pair. Then overwrite + // the body of null parent rows with the canonical null encoding per element. + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), nrows * list_size); + let list_size_u32 = arr.list_size(); + let row_body_bytes = w + .checked_mul(list_size_u32) + .vortex_expect("FSL body width overflow"); + let mut elem_offsets = vec![0u32; nrows * list_size]; + for i in 0..nrows { + let base = row_offsets[i] + col_offset[i]; + for j in 0u32..list_size_u32 { + elem_offsets[i * list_size + j as usize] = base + j * w; + } + } + let mut elem_cursors = vec![0u32; nrows * list_size]; + field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?; + for i in 0..nrows { + col_offset[i] = col_offset[i] + .checked_add(row_body_bytes) + .vortex_expect("FSL row body overflow"); + } + // Canonical null body for null parent rows: one null encoding per element. + let null_byte = child_canonical_null_byte(&elem_dtype, field); + let elem_width = w as usize; + for i in 0..nrows { + if !mask.value(i) { + let end = (row_offsets[i] + col_offset[i]) as usize; + let start = end - row_body_bytes as usize; + let mut pos = start; + for _ in 0..list_size { + out[pos] = null_byte; + for b in &mut out[pos + 1..pos + elem_width] { + *b = 0; + } + pos += elem_width; + } + } + } + } + RowWidth::Variable => { + // Variable-width elements: for null parent rows the canonical body is exactly + // `list_size` null sentinel bytes (one per element). For non-null parent rows, + // encode each element via a scratch buffer and copy into out. 
+ let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), nrows * list_size); + let mut elem_sizes = vec![0u32; nrows * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum(); + let total_usize = + usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize"); + let mut scratch = vec![0u8; total_usize]; + let mut scratch_offsets = Vec::with_capacity(nrows * list_size); + let mut acc: u32 = 0; + for &s in &elem_sizes { + scratch_offsets.push(acc); + acc = acc + .checked_add(s) + .vortex_expect("FSL scratch offset overflow"); + } + let mut scratch_cursors = vec![0u32; nrows * list_size]; + field_encode( + &elements, + field, + &scratch_offsets, + &mut scratch_cursors, + &mut scratch, + ctx, + )?; + let null_byte = child_canonical_null_byte(&elem_dtype, field); + for i in 0..nrows { + let dst = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + let mut body_bytes: u32 = 0; + for j in 0..list_size { + let k = i * list_size + j; + let src = scratch_offsets[k] as usize; + let sz = elem_sizes[k] as usize; + out[dst + body_bytes as usize..dst + body_bytes as usize + sz] + .copy_from_slice(&scratch[src..src + sz]); + body_bytes = body_bytes + .checked_add(elem_sizes[k]) + .vortex_expect("FSL body bytes overflow"); + } + col_offset[i] = col_offset[i] + .checked_add(body_bytes) + .vortex_expect("FSL row offset overflow"); + } else { + for offset in 0..list_size { + out[dst + offset] = null_byte; + } + col_offset[i] = col_offset[i] + .checked_add(u32::try_from(list_size).vortex_expect("list_size fits u32")) + .vortex_expect("FSL row offset overflow"); + } + } + } + } + + Ok(()) +} + +/// Encode one variable-width child of a struct: for non-null parent rows, copy the child's +/// natural encoding from a scratch buffer; for null parent rows, write a single +/// `child_canonical_null_byte`. 
+fn encode_variable_child( + child: &vortex_array::ArrayRef, + field: RowSortField, + parent_mask: &vortex_mask::Mask, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = child.len(); + let canonical = child.clone().execute::(ctx)?; + + // Size and encode the child into a sequential scratch buffer. + let mut child_sizes = vec![0u32; n]; + field_size(&canonical, field, &mut child_sizes, ctx)?; + let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum(); + let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize"); + let mut scratch = vec![0u8; total_usize]; + let mut scratch_offsets = Vec::with_capacity(n); + let mut acc: u32 = 0; + for &s in &child_sizes { + scratch_offsets.push(acc); + acc = acc + .checked_add(s) + .vortex_expect("child scratch offset overflow"); + } + let mut scratch_cursors = vec![0u32; n]; + field_encode( + &canonical, + field, + &scratch_offsets, + &mut scratch_cursors, + &mut scratch, + ctx, + )?; + + let null_byte = child_canonical_null_byte(child.dtype(), field); + for i in 0..n { + let dst = (row_offsets[i] + col_offset[i]) as usize; + if parent_mask.value(i) { + let src = scratch_offsets[i] as usize; + let sz = child_sizes[i] as usize; + out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]); + col_offset[i] = col_offset[i] + .checked_add(child_sizes[i]) + .vortex_expect("col_offset overflow"); + } else { + out[dst] = null_byte; + col_offset[i] = col_offset[i] + .checked_add(1) + .vortex_expect("col_offset overflow"); + } + } + Ok(()) +} + +fn encode_extension( + arr: &ExtensionArray, + field: RowSortField, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let storage = arr.storage_array().clone().execute::(ctx)?; + field_encode(&storage, field, row_offsets, col_offset, out, ctx) +} + +/// Encode a non-empty variable-length byte slice into `out` in 
/// Internal trait for encoding a fixed-width native value into byte slots.
///
/// Implementations must produce a sequence of `size_of::<Self>()` bytes that is
/// lexicographically byte-comparable according to the natural ordering of the type.
pub(crate) trait RowEncode: Copy {
    /// Encode this value into `out`, inverting the bytes for descending order.
    ///
    /// `out` must be exactly as long as the encoded value; the implementations below
    /// panic otherwise.
    fn encode_to(self, out: &mut [u8], descending: bool);
}

/// Implements [`RowEncode`] for an unsigned integer: big-endian bytes already compare in
/// numeric order.
macro_rules! impl_row_encode_unsigned {
    ($t:ty) => {
        impl RowEncode for $t {
            #[inline]
            fn encode_to(self, out: &mut [u8], descending: bool) {
                let mut bytes = self.to_be_bytes();
                if descending {
                    for b in bytes.iter_mut() {
                        *b ^= 0xFF;
                    }
                }
                out.copy_from_slice(&bytes);
            }
        }
    };
}

/// Implements [`RowEncode`] for a signed integer: big-endian bytes with the sign bit
/// flipped.
macro_rules! impl_row_encode_signed {
    ($t:ty) => {
        impl RowEncode for $t {
            #[inline]
            fn encode_to(self, out: &mut [u8], descending: bool) {
                let mut bytes = self.to_be_bytes();
                // Flip sign bit so negatives < non-negatives lexicographically.
                bytes[0] ^= 0x80;
                if descending {
                    for b in bytes.iter_mut() {
                        *b ^= 0xFF;
                    }
                }
                out.copy_from_slice(&bytes);
            }
        }
    };
}

impl_row_encode_unsigned!(u8);
impl_row_encode_unsigned!(u16);
impl_row_encode_unsigned!(u32);
impl_row_encode_unsigned!(u64);
impl_row_encode_signed!(i8);
impl_row_encode_signed!(i16);
impl_row_encode_signed!(i32);
impl_row_encode_signed!(i64);
impl_row_encode_signed!(i128);

impl RowEncode for f32 {
    #[inline]
    fn encode_to(self, out: &mut [u8], descending: bool) {
        let bits = self.to_bits();
        // Non-negative values only get their sign bit set; negative values have every bit
        // inverted so that larger magnitudes sort first.
        let mask: u32 = if (bits >> 31) == 0 {
            0x8000_0000
        } else {
            0xFFFF_FFFF
        };
        let mut bytes = (bits ^ mask).to_be_bytes();
        if descending {
            for b in bytes.iter_mut() {
                *b ^= 0xFF;
            }
        }
        out.copy_from_slice(&bytes);
    }
}

impl RowEncode for f64 {
    #[inline]
    fn encode_to(self, out: &mut [u8], descending: bool) {
        let bits = self.to_bits();
        // Same transform as `f32`, on 64 bits.
        let mask: u64 = if (bits >> 63) == 0 {
            0x8000_0000_0000_0000
        } else {
            0xFFFF_FFFF_FFFF_FFFF
        };
        let mut bytes = (bits ^ mask).to_be_bytes();
        if descending {
            for b in bytes.iter_mut() {
                *b ^= 0xFF;
            }
        }
        out.copy_from_slice(&bytes);
    }
}
descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs new file mode 100644 index 00000000000..d3721e49a6e --- /dev/null +++ b/vortex-row/src/encode.rs @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `RowEncode` variadic scalar function: encode N input columns into a single `ListView`. +//! +//! The output's `(elements, offsets, sizes)` triple is built up in a single left-to-right +//! pass over the input columns. The `sizes` array doubles as the per-row write cursor, so +//! when the last column finishes encoding, the accumulator is the final array - no separate +//! conversion step is needed. + +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar_fn::Arity; +use vortex_array::scalar_fn::ChildName; +use vortex_array::scalar_fn::ExecutionArgs; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_session::VortexSession; + +use crate::codec; +use crate::options::RowEncodingOptions; +use crate::options::deserialize_row_encoding_options; +use crate::options::serialize_row_encoding_options; +use crate::size::compute_sizes; + +/// Variadic scalar function that encodes N input columns into a single `List` +/// [`ListViewArray`] where row `i` contains the row-encoded bytes for column values +/// `cols[0][i], cols[1][i], ...` concatenated left-to-right. 
+/// +/// This scalar function is public for session registration and encoding extension work. +/// Most callers should use [`RowEncoder`](crate::RowEncoder) rather than invoking the scalar +/// function directly. +#[derive(Clone, Debug)] +pub struct RowEncode; + +impl ScalarFnVTable for RowEncode { + type Options = RowEncodingOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::from("vortex.row_encode") + } + + fn serialize(&self, options: &Self::Options) -> VortexResult>> { + Ok(Some(serialize_row_encoding_options(options))) + } + + fn deserialize( + &self, + metadata: &[u8], + _session: &VortexSession, + ) -> VortexResult { + deserialize_row_encoding_options(metadata) + } + + fn arity(&self, _options: &Self::Options) -> Arity { + Arity::Variadic { min: 1, max: None } + } + + fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName { + ChildName::from(Arc::from(format!("col_{}", child_idx))) + } + + fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult { + Ok(DType::List( + Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), + Nullability::NonNullable, + )) + } + + fn execute( + &self, + options: &Self::Options, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + execute_row_encode(options, args, ctx) + } + + fn is_null_sensitive(&self, _options: &Self::Options) -> bool { + true + } + + fn is_fallible(&self, _options: &Self::Options) -> bool { + false + } +} + +fn execute_row_encode( + options: &RowEncodingOptions, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let nrows = args.row_count(); + if u32::try_from(nrows).is_err() { + vortex_bail!("row-encoded input has {} rows, exceeds u32::MAX", nrows); + } + + // ===== Phase 1: classify + size pass ===== + let crate::size::SizePassResult { + fixed_per_row, + var_lengths, + columns, + } = compute_sizes(options, args, ctx)?; + + // ===== Phase 2: totals + buffer ===== + let var_total: u64 = 
var_lengths + .as_ref() + .map_or(0, |v| v.iter().map(|&x| u64::from(x)).sum()); + let total: u64 = (nrows as u64) + .checked_mul(u64::from(fixed_per_row)) + .and_then(|t| t.checked_add(var_total)) + .ok_or_else(|| { + vortex_error::vortex_err!("row-encoded total bytes overflow u64 (nrows * fixed + var)") + })?; + if total > u32::MAX as u64 { + vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total); + } + let total_len = + usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize"); + + // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder + // assume previously untouched bytes are zero, simplifying the null-row fill paths. + let mut out_buf: BufferMut = BufferMut::with_capacity(total_len); + out_buf.push_n(0u8, total_len); + + // ===== Phase 3: per-row offsets ===== + // listview_offsets[i] is the absolute byte offset where row `i` begins. + // For pure-fixed: i * fixed_per_row. + // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths. + // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end. + let nrows_u32 = + u32::try_from(nrows).vortex_expect("nrows fits u32 (validated earlier in this function)"); + let mut listview_offsets: BufferMut = BufferMut::with_capacity(nrows); + match var_lengths.as_ref() { + None => { + for row_idx in 0..nrows_u32 { + // Total bytes already fit in u32, so row_idx * fixed_per_row also does. + listview_offsets.push(row_idx * fixed_per_row); + } + } + Some(v) => { + let mut acc: u32 = 0; + for (row_idx, &l) in (0..nrows_u32).zip(v.iter()) { + // The arithmetic below cannot overflow because we already verified the + // total fits in u32. + listview_offsets.push(row_idx * fixed_per_row + acc); + acc += l; + } + } + } + let listview_offsets_slice: &[u32] = listview_offsets.as_slice(); + + // Per-row write cursor (also doubles as the ListView `sizes` slot when done). 
We build + // it as a BufferMut so we can hand it directly to the output PrimitiveArray. + let mut row_cursors: BufferMut = BufferMut::with_capacity(nrows); + row_cursors.push_n(0u32, nrows); + + // ===== Phase 4: encode columns via the cursor path ===== + // Each column was canonicalized once during the size pass; reuse that canonical form. + for (i, canonical) in columns.iter().enumerate() { + codec::field_encode( + canonical, + options.fields[i], + listview_offsets_slice, + row_cursors.as_mut_slice(), + &mut out_buf, + ctx, + )?; + } + + // ===== Phase 5: build ListView output ===== + let elements = PrimitiveArray::new(out_buf.freeze(), Validity::NonNullable).into_array(); + let offsets_arr = + PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array(); + let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array(); + Ok( + ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)? + .into_array(), + ) +} diff --git a/vortex-row/src/encoder.rs b/vortex-row/src/encoder.rs new file mode 100644 index 00000000000..15eeda6d2f1 --- /dev/null +++ b/vortex-row/src/encoder.rs @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! User-facing entry point: turn N columnar arrays into one row-encoded `ListView`. 
+ +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::ListViewArray; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::scalar_fn::VecExecutionArgs; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::encode::RowEncode; +use crate::options::RowEncodingOptions; +use crate::options::RowSortField; +use crate::size::RowSize; + +/// Encodes N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose row +/// byte slices compare lexicographically in the same order as a tuple comparison of the input +/// values under the configured [`RowSortField`]s. +/// +/// Construct with [`RowEncoder::new`] or [`RowEncoder::with_options`] to pin the per-column +/// sort options, or use [`RowEncoder::default`] to apply ascending, nulls-first ordering to +/// every column. The same encoder can be reused across calls. +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub struct RowEncoder { + options: Option, +} + +impl RowEncoder { + /// Construct a `RowEncoder` from one [`RowSortField`] per input column. + pub fn new(fields: impl IntoIterator) -> Self { + Self { + options: Some(RowEncodingOptions::new(fields)), + } + } + + /// Construct a `RowEncoder` from an explicit [`RowEncodingOptions`]. + pub fn with_options(options: RowEncodingOptions) -> Self { + Self { + options: Some(options), + } + } + + /// Borrow the configured options, or `None` when the encoder applies default + /// (ascending, nulls-first) ordering inferred from the column count at encode time. + pub fn options(&self) -> Option<&RowEncodingOptions> { + self.options.as_ref() + } + + /// Encode `cols` into a single row-oriented [`ListViewArray`] of `u8`. + pub fn encode(&self, cols: &[ArrayRef], ctx: &mut ExecutionCtx) -> VortexResult { + let (options, args) = self.prepare(cols)?; + RowEncode + .execute(&options, &args, ctx)? 
+ .execute::(ctx) + } + + /// Compute only the per-row sizes (the `Struct { fixed: u32, var: u32 }` produced by + /// [`RowSize`]) without materializing the encoded rows. + pub fn row_sizes(&self, cols: &[ArrayRef], ctx: &mut ExecutionCtx) -> VortexResult { + let (options, args) = self.prepare(cols)?; + RowSize.execute(&options, &args, ctx) + } + + /// Validate the input columns and resolve the options + execution args shared by + /// [`encode`](Self::encode) and [`row_sizes`](Self::row_sizes). + fn prepare(&self, cols: &[ArrayRef]) -> VortexResult<(RowEncodingOptions, VecExecutionArgs)> { + if cols.is_empty() { + vortex_bail!("RowEncoder: at least one column is required"); + } + let options = match &self.options { + Some(options) => { + if options.len() != cols.len() { + vortex_bail!( + "RowEncoder: options describe {} columns but {} were provided", + options.len(), + cols.len() + ); + } + options.clone() + } + None => RowEncodingOptions::default_for_columns(cols.len()), + }; + let nrows = cols[0].len(); + for (i, col) in cols.iter().enumerate() { + if col.len() != nrows { + vortex_bail!( + "RowEncoder: column {} has length {} but expected {}", + i, + col.len(), + nrows + ); + } + } + Ok((options, VecExecutionArgs::new(cols.to_vec(), nrows))) + } +} + +/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes +/// are lexicographically comparable in the same order as a tuple comparison of the input +/// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`]. +pub fn convert_columns( + cols: &[ArrayRef], + fields: &[RowSortField], + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::new(fields.iter().copied()).encode(cols, ctx) +} + +/// Like [`convert_columns`] but takes a prebuilt [`RowEncodingOptions`]. 
+pub fn convert_columns_with_options( + cols: &[ArrayRef], + options: &RowEncodingOptions, + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::with_options(options.clone()).encode(cols, ctx) +} + +/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns. +/// Convenience wrapper over [`RowEncoder::row_sizes`]. +pub fn compute_row_sizes( + cols: &[ArrayRef], + fields: &[RowSortField], + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::new(fields.iter().copied()).row_sizes(cols, ctx) +} + +/// Like [`compute_row_sizes`] but takes a prebuilt [`RowEncodingOptions`]. +pub fn compute_row_sizes_with_options( + cols: &[ArrayRef], + options: &RowEncodingOptions, + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::with_options(options.clone()).row_sizes(cols, ctx) +} diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs new file mode 100644 index 00000000000..d921e2998e3 --- /dev/null +++ b/vortex-row/src/lib.rs @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Row-oriented byte encoding for Vortex arrays. +//! +//! This crate converts one or more columnar arrays into a single `ListView` array whose +//! row byte slices can be compared lexicographically. The byte ordering matches tuple +//! ordering of the input values under the requested [`RowSortField`] settings, making the +//! representation useful for sort keys and other row-key operations. +//! +//! The public entry points are: +//! - [`RowEncoder`], the primary API for encoding columns into row bytes. +//! - [`RowEncoder::row_sizes`], which computes the fixed and variable byte contributions +//! without materializing the encoded rows. +//! - [`convert_columns`] and [`compute_row_sizes`], compatibility helpers around +//! [`RowEncoder`]. +//! - [`initialize`], which registers the [`RowSize`] and [`RowEncode`] scalar functions on a +//! [`VortexSession`]. +//! +//! 
Internally, encoding is split into two scalar functions. [`RowSize`] performs the sizing +//! pass and classifies fixed-width versus variable-width input columns. [`RowEncode`] uses +//! those sizes to allocate one contiguous elements buffer, then writes each column's bytes +//! into the per-row slots from left to right. +//! +//! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to +//! 128 bits, UTF-8 and binary values, structs, fixed-size lists, and extensions whose storage +//! type is supported. Variant, union, and variable-size list arrays are rejected because this +//! crate does not define an ordering for them. + +mod codec; +mod encode; +mod encoder; +mod options; +mod size; + +#[cfg(test)] +mod tests; + +pub use encode::RowEncode; +pub use encoder::RowEncoder; +pub use encoder::compute_row_sizes; +pub use encoder::compute_row_sizes_with_options; +pub use encoder::convert_columns; +pub use encoder::convert_columns_with_options; +pub use options::RowEncodingOptions; +pub use options::RowSortField; +pub use size::RowSize; +use vortex_array::scalar_fn::session::ScalarFnSessionExt; +use vortex_session::VortexSession; + +/// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given +/// session. +/// +/// Call this during session construction when row encoding must be available through the +/// expression layer. The direct [`RowEncoder`] API constructs the scalar-function calls +/// itself and does not require global registration. 
/// Per-column ordering options for row-oriented encoding.
///
/// A `RowSortField` describes how one input column contributes to a row key. Descending order
/// reverses the encoded value bytes for that column. Null placement is controlled separately,
/// so nulls keep the requested position relative to non-null values in either direction.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct RowSortField {
    /// If true, this column sorts in descending order.
    pub descending: bool,
    /// If true, nulls sort before non-null values.
    pub nulls_first: bool,
}

impl Default for RowSortField {
    /// Ascending order with nulls first.
    fn default() -> Self {
        Self::ascending()
    }
}

impl Display for RowSortField {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "descending={}, nulls_first={}",
            self.descending, self.nulls_first
        )
    }
}

impl RowSortField {
    /// Construct a new `RowSortField` with explicit options.
    pub const fn new(descending: bool, nulls_first: bool) -> Self {
        Self {
            descending,
            nulls_first,
        }
    }

    /// Construct an ascending field with nulls first.
    pub const fn ascending() -> Self {
        Self::new(false, true)
    }

    /// Construct a descending field with nulls first.
    pub const fn descending() -> Self {
        Self::new(true, true)
    }

    /// Return this field with nulls ordered before non-null values.
    ///
    /// The receiver is taken by value and not modified in place, so the result must be
    /// used.
    #[must_use]
    pub const fn nulls_first(self) -> Self {
        Self::new(self.descending, true)
    }

    /// Return this field with nulls ordered after non-null values.
    ///
    /// The receiver is taken by value and not modified in place, so the result must be
    /// used.
    #[must_use]
    pub const fn nulls_last(self) -> Self {
        Self::new(self.descending, false)
    }

    /// Returns the sentinel byte to write for a non-null value.
    #[inline]
    pub(crate) const fn non_null_sentinel(&self) -> u8 {
        // Non-null is always 0x01. Null choices are < or > 0x01.
        0x01
    }

    /// Returns the sentinel byte to write for a null value.
    #[inline]
    pub(crate) const fn null_sentinel(&self) -> u8 {
        if self.nulls_first {
            // Nulls before non-nulls (smaller byte sorts first).
            0x00
        } else {
            // Nulls after non-nulls (larger byte sorts later).
            0x02
        }
    }
}
+ pub fn is_empty(&self) -> bool { + self.fields.is_empty() + } +} + +impl FromIterator for RowEncodingOptions { + fn from_iter>(iter: T) -> Self { + Self::new(iter) + } +} + +impl Display for RowEncodingOptions { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for (i, field) in self.fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}", field)?; + } + write!(f, "]") + } +} + +/// Serialize a [`RowEncodingOptions`] to a compact byte vector: 4-byte LE length followed by +/// `2 * len` bytes (descending + nulls_first booleans for each field). +pub(crate) fn serialize_row_encoding_options(opts: &RowEncodingOptions) -> Vec { + use vortex_error::VortexExpect; + let n = + u32::try_from(opts.fields.len()).vortex_expect("RowEncodingOptions length must fit in u32"); + let mut out = Vec::with_capacity(4 + 2 * opts.fields.len()); + out.extend_from_slice(&n.to_le_bytes()); + for f in &opts.fields { + out.push(u8::from(f.descending)); + out.push(u8::from(f.nulls_first)); + } + out +} + +/// Deserialize a [`RowEncodingOptions`] produced by [`serialize_row_encoding_options`]. 
+pub(crate) fn deserialize_row_encoding_options( + bytes: &[u8], +) -> vortex_error::VortexResult { + if bytes.len() < 4 { + vortex_error::vortex_bail!( + "RowEncodingOptions metadata must contain a 4-byte length prefix" + ); + } + let n = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize; + let expected = 4 + 2 * n; + if bytes.len() != expected { + vortex_error::vortex_bail!( + "RowEncodingOptions metadata wrong size: got {}, expected {}", + bytes.len(), + expected + ); + } + let mut fields: SmallVec<[RowSortField; FIELDS_INLINE]> = SmallVec::with_capacity(n); + let mut i = 4; + for _ in 0..n { + fields.push(RowSortField { + descending: bytes[i] != 0, + nulls_first: bytes[i + 1] != 0, + }); + i += 2; + } + Ok(RowEncodingOptions { fields }) +} diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs new file mode 100644 index 00000000000..26269081ce7 --- /dev/null +++ b/vortex-row/src/size.rs @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns. 
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::FieldName;
+use vortex_array::dtype::FieldNames;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::StructFields;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::codec::RowWidth;
+use crate::options::RowEncodingOptions;
+use crate::options::deserialize_row_encoding_options;
+use crate::options::serialize_row_encoding_options;
+
+/// Result of the size pass: enough information for both [`RowSize::execute`] and the
+/// downstream [`RowEncode`](super::encode::RowEncode) pipeline.
+///
+/// `columns` holds the canonicalized form of each input so the encode pass can write bytes
+/// without re-decoding — a single canonicalization per column is shared between size and
+/// encode.
+pub(crate) struct SizePassResult {
+    pub fixed_per_row: u32,
+    pub var_lengths: Option<Vec<u32>>,
+    pub columns: Vec<Canonical>,
+}
+
+/// Walk N input columns once, classifying each as fixed-width or variable-length and
+/// accumulating per-row size contributions.
+///
+/// Fixed-width columns contribute a single scalar increment to `fixed_per_row`; they do
+/// not touch `var_lengths`. Variable-length columns add per-row contributions into the
+/// lazily-allocated `var_lengths` vec via [`codec::field_size`].
+///
+/// This is shared by [`RowSize::execute`] (which wraps the result into a
+/// `Struct { fixed, var }`) and the [`RowEncode`](super::encode::RowEncode) pipeline
+/// (which reuses the canonicalized columns for the encode pass).
+pub(crate) fn compute_sizes(
+    options: &RowEncodingOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<SizePassResult> {
+    let n_inputs = args.num_inputs();
+    if n_inputs == 0 {
+        vortex_bail!("at least one input column is required");
+    }
+    if options.len() != n_inputs {
+        vortex_bail!(
+            "options len ({}) does not match num_inputs ({})",
+            options.len(),
+            n_inputs
+        );
+    }
+    let nrows = args.row_count();
+
+    let mut columns: Vec<Canonical> = Vec::with_capacity(n_inputs);
+    let mut fixed_per_row: u32 = 0;
+    let mut var_lengths: Option<Vec<u32>> = None;
+
+    for i in 0..n_inputs {
+        let col = args.get(i)?;
+        if col.len() != nrows {
+            vortex_bail!(
+                "column {} has length {} but expected {}",
+                i,
+                col.len(),
+                nrows
+            );
+        }
+        let width = codec::row_width_for_dtype(col.dtype())?;
+        // Canonicalize once and reuse for both sizing (variable columns) and encoding.
+        let canonical = col.execute::<Canonical>(ctx)?;
+        match width {
+            RowWidth::Fixed(w) => {
+                fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(|| {
+                    vortex_error::vortex_err!("per-row fixed width overflows u32 at column {}", i)
+                })?;
+            }
+            RowWidth::Variable => {
+                let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]);
+                codec::field_size(&canonical, options.fields[i], v, ctx)?;
+            }
+        }
+        columns.push(canonical);
+    }
+
+    Ok(SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        columns,
+    })
+}
+
+/// Variadic scalar function that, given N input columns and per-column
+/// [`RowSortField`](crate::RowSortField)s,
+/// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the
+/// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode).
+///
+/// The `fixed` field is always a [`ConstantArray`] holding the sum of the per-column
+/// constant widths of fixed-width inputs (sentinel + value bytes). The `var` field is a
+/// `ConstantArray(0)` when there are no variable-length input columns, and a
+/// [`PrimitiveArray`] of per-row varlen-byte sums otherwise.
+///
+/// The total per-row byte size is `fixed + var`.
+///
+/// This scalar function is public for session registration and encoding extension work.
+/// Most callers should use [`RowEncoder::row_sizes`](crate::RowEncoder::row_sizes) rather
+/// than invoking the scalar function directly.
+#[derive(Clone, Debug)]
+pub struct RowSize;
+
+/// Returns the [`FieldNames`] used by the [`RowSize`] output struct.
+pub(crate) fn row_size_field_names() -> FieldNames {
+    FieldNames::from([FieldName::from("fixed"), FieldName::from("var")])
+}
+
+/// Returns the output [`DType`] of [`RowSize`].
+pub(crate) fn row_size_struct_dtype() -> DType {
+    DType::Struct(
+        StructFields::new(
+            row_size_field_names(),
+            vec![
+                DType::Primitive(PType::U32, Nullability::NonNullable),
+                DType::Primitive(PType::U32, Nullability::NonNullable),
+            ],
+        ),
+        Nullability::NonNullable,
+    )
+}
+
+impl ScalarFnVTable for RowSize {
+    type Options = RowEncodingOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_size")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encoding_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encoding_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{}", child_idx)))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        Ok(row_size_struct_dtype())
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        let nrows = args.row_count();
+        let result = compute_sizes(options, args, ctx)?;
+        let fixed_array =
+            ConstantArray::new(Scalar::from(result.fixed_per_row), nrows).into_array();
+        let var_array = match result.var_lengths {
+            Some(v) => PrimitiveArray::new(Buffer::<u32>::copy_from(&v), Validity::NonNullable)
+                .into_array(),
+            None => ConstantArray::new(Scalar::from(0u32), nrows).into_array(),
+        };
+        Ok(StructArray::try_new(
+            row_size_field_names(),
+            vec![fixed_array, var_array],
+            nrows,
+            Validity::NonNullable,
+        )?
+        .into_array())
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        false
+    }
+}
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
new file mode 100644
index 00000000000..62e0e4cfb98
--- /dev/null
+++ b/vortex-row/src/tests.rs
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Tests for the row encoder.
+
+use std::f64::consts::PI;
+
+use rstest::rstest;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_error::VortexResult;
+
+use crate::RowEncoder;
+use crate::RowEncodingOptions;
+use crate::RowSortField;
+use crate::compute_row_sizes_with_options;
+use crate::convert_columns;
+use crate::convert_columns_with_options;
+
+fn collect_row_bytes(array: &ListViewArray) -> Vec> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let nrows = array.len();
+    (0..nrows)
+        .map(|i| {
+            let slice = array.list_elements_at(i).unwrap();
+            let p = slice.execute::(&mut ctx).unwrap();
+            p.as_slice::().to_vec()
+        })
+        .collect()
+}
+
+/// Encode `values` as a single column, sort the resulting row bytes, and check the permutation
+/// matches the natural sort order of `values`.
+fn assert_sort_order_i64(values: Vec, descending: bool) -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let field = RowSortField::new(descending, true);
+    let encoded = convert_columns(&[col], &[field], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Build expected permutation: sort values naturally then compare to bytes-sorted order.
+    let mut idx: Vec = (0..values.len()).collect();
+    if descending {
+        idx.sort_by(|a, b| values[*b].cmp(&values[*a]));
+    } else {
+        idx.sort_by(|a, b| values[*a].cmp(&values[*b]));
+    }
+    let expected_order: Vec> = idx.iter().map(|&i| rows[i].clone()).collect();
+
+    let mut sorted = rows;
+    sorted.sort();
+    assert_eq!(
+        sorted, expected_order,
+        "Row-encoded bytes do not match natural sort order"
+    );
+    Ok(())
+}
+
+#[rstest]
+#[case::ascending(false)]
+#[case::descending(true)]
+fn primitive_i64_roundtrip(#[case] descending: bool) -> VortexResult<()> {
+    let values: Vec = vec![-5, 0, 5, i64::MIN, i64::MAX, 7, -7, 1];
+    assert_sort_order_i64(values, descending)
+}
+
+#[test]
+fn primitive_u32_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec = vec![0, 1, 100, u32::MAX, 42, 17];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+
+    let mut sorted_idx: Vec = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].cmp(&values[*b]));
+    let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn primitive_f64_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // We use IEEE total-ordering semantics: -0.0 < +0.0 in the byte encoding (matches
+    // `arrow-row`). Avoid -0.0 in the natural-order baseline since partial_cmp says
+    // -0.0 == 0.0.
+    let values: Vec = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, PI];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+
+    let mut sorted_idx: Vec = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap());
+    let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn bool_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = BoolArray::from_iter([true, false, true, false]).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // false rows come first (2x), true rows after (2x)
+    assert_eq!(sorted[0], rows[1]);
+    assert_eq!(sorted[1], rows[3]);
+    assert_eq!(sorted[2], rows[0]);
+    assert_eq!(sorted[3], rows[2]);
+    Ok(())
+}
+
+#[test]
+fn utf8_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values = vec![
+        "banana",
+        "apple",
+        "",
+        "cherry",
+        "ban",
+        "banana_loaf_for_test",
+    ];
+    let col = VarBinViewArray::from_iter_str(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+
+    let mut sorted_idx: Vec = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].cmp(values[*b]));
+    let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted, expected);
+    Ok(())
+}
+
+#[test]
+fn multi_column_sort() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec = vec![1, 2, 1, 2, 1, 3];
+    let strs = vec!["b", "a", "a", "b", "c", "z"];
+    let col0 = PrimitiveArray::from_iter(ints.clone()).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array();
+    let encoded = convert_columns(
+        &[col0, col1],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    let mut idx: Vec = (0..ints.len()).collect();
+    idx.sort_by(|a, b| ints[*a].cmp(&ints[*b]).then_with(|| strs[*a].cmp(strs[*b])));
+    let expected: Vec> = idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted, expected);
+    Ok(())
+}
+
+#[test]
+fn nulls_first_and_last() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec> = vec![Some(5), None, Some(1), None, Some(3)];
+    let col = PrimitiveArray::from_option_iter(values.clone()).into_array();
+
+    // nulls_first=true
+    let encoded = convert_columns(
+        std::slice::from_ref(&col),
+        &[RowSortField::ascending()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted = rows;
+    sorted.sort();
+    // The first two sorted entries should be nulls
+    let null_count = values.iter().filter(|v| v.is_none()).count();
+    for i in 0..null_count {
+        // a null encoded row begins with 0x00
+        assert_eq!(sorted[i][0], 0x00);
+    }
+    // nulls_first=false (nulls sort last)
+    let encoded = convert_columns(&[col], &[RowSortField::ascending().nulls_last()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted = rows;
+    sorted.sort();
+    // The last two sorted entries should be nulls
+    for i in 0..null_count {
+        let pos = sorted.len() - 1 - i;
+        assert_eq!(sorted[pos][0], 0x02);
+    }
+    Ok(())
+}
+
+#[test]
+fn reusable_options_helpers() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let options = RowEncodingOptions::new([RowSortField::descending().nulls_last()]);
+    assert_eq!(options.len(), 1);
+    assert!(!options.is_empty());
+    assert_eq!(
+        options.fields(),
+        &[RowSortField {
+            descending: true,
+            nulls_first: false
+        }]
+    );
+
+    let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
+    let encoder = RowEncoder::with_options(options.clone());
+    assert_eq!(encoder.options(), Some(&options));
+
+    let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?;
+    assert_eq!(encoded.len(), 3);
+
+    let sizes = encoder.row_sizes(std::slice::from_ref(&col), &mut ctx)?;
+    assert_eq!(sizes.len(), 3);
+
+    let encoded = convert_columns_with_options(std::slice::from_ref(&col), &options, &mut ctx)?;
+    assert_eq!(encoded.len(), 3);
+
+    let sizes = compute_row_sizes_with_options(std::slice::from_ref(&col), &options, &mut ctx)?;
+    assert_eq!(sizes.len(), 3);
+    Ok(())
+}
+
+#[test]
+fn row_encoder_new_accepts_sort_fields() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let encoder = RowEncoder::new([RowSortField::ascending()]);
+    let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
+
+    let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?;
+    assert_eq!(encoded.len(), 3);
+    Ok(())
+}
+
+#[test]
+fn default_row_encoder_uses_default_fields() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col0 = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
+    let col1 = PrimitiveArray::from_iter([4i32, 5, 6]).into_array();
+
+    let encoded = RowEncoder::default().encode(&[col0, col1], &mut ctx)?;
+    assert_eq!(encoded.len(), 3);
+    Ok(())
+}
+
+#[test]
+fn struct_sort_order() -> VortexResult<()> {
+    use vortex_array::arrays::StructArray;
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ids: Vec = vec![3, 1, 3, 1, 2];
+    let names = vec!["b", "a", "a", "b", "z"];
+    let id_arr = PrimitiveArray::from_iter(ids.clone()).into_array();
+    let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array();
+    let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])?.into_array();
+
+    let encoded = convert_columns(&[struct_arr], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    let mut idx: Vec = (0..ids.len()).collect();
+    idx.sort_by(|a, b| ids[*a].cmp(&ids[*b]).then_with(|| names[*a].cmp(names[*b])));
+    let expected: Vec> = idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted, expected);
+    Ok(())
+}
+
+#[test]
+fn row_size_struct_shape() -> VortexResult<()> {
+    use vortex_array::arrays::Constant;
+    use vortex_array::arrays::StructArray;
+    use vortex_array::arrays::struct_::StructArrayExt;
+
+    use crate::compute_row_sizes;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec = vec![1, 2, 3, 4, 5];
+    let strs = vec!["a", "bb", "ccc", "", "eeeee"];
+    let col0 = PrimitiveArray::from_iter(ints).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strs).into_array();
+
+    let sizes = compute_row_sizes(
+        &[col0, col1],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+    // Shape must be Struct { fixed, var }
+    let struct_arr = sizes.execute::(&mut ctx)?;
+    assert_eq!(struct_arr.struct_fields().nfields(), 2);
+    let fixed = struct_arr.unmasked_field(0);
+    let var = struct_arr.unmasked_field(1);
+
+    // `fixed` must be ConstantArray with value = encoded i32 width = 1 + 4 = 5.
+    let fixed_const = fixed
+        .as_opt::()
+        .expect("fixed field should be a ConstantArray");
+    assert_eq!(
+        fixed_const.scalar(),
+        &vortex_array::scalar::Scalar::from(5u32),
+        "fixed scalar should be encoded primitive i32 width"
+    );
+
+    // `var` must be a PrimitiveArray, since we have a varlen column.
+    let var_prim = var.clone().execute::(&mut ctx)?;
+    let v: &[u32] = var_prim.as_slice();
+    assert_eq!(v.len(), 5);
+    // empty string: just the empty sentinel (1 byte); null or non-empty:
+    // sentinel(1) + 33 bytes (single block).
+    let expected: Vec = vec![34, 34, 34, 1, 34];
+    assert_eq!(v, expected.as_slice());
+    Ok(())
+}
+
+#[test]
+fn single_buffer_invariant() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // Encoded rows here are all > 12 bytes, forcing the Ref-view path that points back into
+    // the shared data buffer.
+    let nrows = 64usize;
+    let primitives: Vec = (0..nrows as i64).collect();
+    let strings: Vec = (0..nrows)
+        .map(|i| format!("row_{}_with_padding", i))
+        .collect();
+    let col0 = PrimitiveArray::from_iter(primitives).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let encoded = convert_columns(
+        &[col0, col1],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+
+    let rows = collect_row_bytes(&encoded);
+    let expected_total: usize = rows.iter().map(|r| r.len()).sum();
+
+    // The shared data buffer holds the contiguous concatenation of every row's encoded bytes;
+    // per-row allocations would produce many small buffers instead of one shared buffer.
+    // ListView's elements array is a single contiguous primitive (u8) array; its length
+    // equals the sum of all per-row sizes. A per-row allocation strategy would instead
+    // produce N separate elements arrays or a sparse one.
+    let elements_len = encoded.elements().len();
+    assert_eq!(
+        elements_len, expected_total,
+        "elements buffer size mismatch"
+    );
+    Ok(())
+}
+
+/// Regression: with the previous 2-sentinel varlen scheme, an empty col1 followed by a
+/// non-empty col1 that happened to start with `\0` would corrupt multi-column lex order
+/// because col2's first byte aligned against col1's pad in the longer row. With the
+/// 3-sentinel scheme byte position 0 alone distinguishes empty from non-empty, so column
+/// boundaries always align.
+#[test]
+fn multi_column_varlen_empty_vs_nul_byte_string() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // col1: empty vs single 0-byte. col2: same int for all rows.
+    let col1 = VarBinViewArray::from_iter_str(["", "\0", "a", "ab"]).into_array();
+    let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1]).into_array();
+    let encoded = convert_columns(
+        &[col1, col2],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Logical natural order of col1: "" < "\0" < "a" < "ab".
+    // Byte sort of the encoded rows must put them in that same order.
+    let sorted_indices_by_bytes = {
+        let mut indices: Vec = (0..rows.len()).collect();
+        indices.sort_by(|a, b| rows[*a].cmp(&rows[*b]));
+        indices
+    };
+    assert_eq!(
+        sorted_indices_by_bytes,
+        vec![0, 1, 2, 3],
+        "byte sort must match natural col1 order; sorted indices were {:?}",
+        sorted_indices_by_bytes
+    );
+    Ok(())
+}
+
+/// Regression: null col1 must sort distinct from empty col1 even when col2 follows. With
+/// the 3-sentinel scheme null=0x00, empty=0x01 differ at byte 0.
+#[test]
+fn multi_column_varlen_null_vs_empty() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col1 = VarBinViewArray::from_iter_nullable_str([
+        None::<&str>,
+        Some(""),
+        Some("a"),
+        None,
+        Some(""),
+    ])
+    .into_array();
+    let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1, 1]).into_array();
+    let encoded = convert_columns(
+        &[col1, col2],
+        &[RowSortField::ascending(), RowSortField::ascending()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Nulls first, then empties, then non-empties — and all the col2 values are identical
+    // so col1 fully determines the order.
+    // Categorise each row by the leading byte of col1's encoding.
+    let mut buckets: [Vec; 3] = [Vec::new(), Vec::new(), Vec::new()];
+    for (i, row) in rows.iter().enumerate() {
+        let bucket = match row[0] {
+            0x00 => 0, // null
+            0x01 => 1, // empty
+            0x02 => 2, // non-empty
+            other => panic!("unexpected varlen sentinel: {:#x}", other),
+        };
+        buckets[bucket].push(i);
+    }
+    assert_eq!(buckets[0].len(), 2, "two null col1 rows");
+    assert_eq!(buckets[1].len(), 2, "two empty col1 rows");
+    assert_eq!(buckets[2].len(), 1, "one non-empty col1 row");
+
+    // All null rows must be byte-equal (same col2 value, both col1 null, single sentinel).
+    let null_rows: Vec<&Vec> = buckets[0].iter().map(|&i| &rows[i]).collect();
+    assert_eq!(
+        null_rows[0], null_rows[1],
+        "null col1 rows must be byte-equal"
+    );
+    // Same for empty.
+    let empty_rows: Vec<&Vec> = buckets[1].iter().map(|&i| &rows[i]).collect();
+    assert_eq!(
+        empty_rows[0], empty_rows[1],
+        "empty col1 rows must be byte-equal"
+    );
+
+    // Byte sort must group: nulls, empties, non-empties (because leading byte differs).
+    let mut sorted = rows.clone();
+    sorted.sort();
+    assert_eq!(sorted[0][0], 0x00);
+    assert_eq!(sorted[1][0], 0x00);
+    assert_eq!(sorted[2][0], 0x01);
+    assert_eq!(sorted[3][0], 0x01);
+    assert_eq!(sorted[4][0], 0x02);
+    Ok(())
+}
+
+/// Regression: descending varlen must put non-empty before empty (natural "" < "a" inverts
+/// to "a" < "" under descending). The 3-sentinel scheme uses `!empty < !non_empty` so
+/// non-empty's first byte is smaller than empty's first byte.
+#[test]
+fn varlen_descending_empty_vs_non_empty() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = VarBinViewArray::from_iter_str(["a", "", "abc"]).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::descending()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Natural order: "" < "a" < "abc"; descending byte sort: "abc" first, "" last.
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // sorted[0] = encoded("abc"), sorted[1] = encoded("a"), sorted[2] = encoded("")
+    assert_eq!(sorted[0], rows[2], "abc first in descending");
+    assert_eq!(sorted[1], rows[0], "a second");
+    assert_eq!(sorted[2], rows[1], "empty last");
+    Ok(())
+}
+
+/// Regression: two null parent struct rows whose underlying child values differ in length
+/// must still produce byte-equal encodings, because the parent emits a canonical null
+/// body (one null sentinel per variable child) regardless of the underlying values.
+#[test]
+fn null_struct_rows_with_varying_child_lengths_are_byte_equal() -> VortexResult<()> {
+    use vortex_array::arrays::StructArray;
+    use vortex_array::dtype::FieldName;
+    use vortex_array::dtype::FieldNames;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::BitBuffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // Build a nullable struct{name: utf8} where rows 0 and 2 are null but the underlying
+    // child has different length data ("short" vs "much longer text data").
+    let names =
+        VarBinViewArray::from_iter_str(["short", "x", "much longer text data"]).into_array();
+    let field_names = FieldNames::from([FieldName::from("name")]);
+    let bits = BitBuffer::from_iter([false, true, false]);
+    let validity = Validity::from(bits);
+    let struct_arr = StructArray::try_new(field_names, vec![names], 3, validity)?.into_array();
+
+    let encoded = convert_columns(&[struct_arr], &[RowSortField::ascending()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    assert_eq!(rows.len(), 3);
+    // Both null parent rows must produce identical bytes despite the divergent children.
+    assert_eq!(
+        rows[0], rows[2],
+        "two null parent struct rows must encode to byte-equal slices"
+    );
+    // And the non-null row's leading sentinel must differ from the null sentinel.
+    assert_ne!(rows[0][0], rows[1][0], "null vs non-null sentinel differs");
+    Ok(())
+}
+
+#[test]
+fn primitive_f32_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec = vec![-1.5, 0.0, 1.5, f32::INFINITY, f32::NEG_INFINITY];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+    let mut sorted_idx: Vec = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap());
+    let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn primitive_f16_sort_order() -> VortexResult<()> {
+    use vortex_array::dtype::half::f16;
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec = vec![
+        f16::from_f32(-1.5),
+        f16::from_f32(0.0),
+        f16::from_f32(1.5),
+        f16::INFINITY,
+        f16::NEG_INFINITY,
+    ];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+    let mut sorted_idx: Vec = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap());
+    let expected: Vec> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn reject_list_dtype_early() {
+    use vortex_array::ArrayRef;
+    use vortex_array::arrays::ListArray;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::buffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let offsets = PrimitiveArray::new(buffer![0u32, 1, 2], Validity::NonNullable).into_array();
+    let elements = PrimitiveArray::from_iter([10i32, 20]).into_array();
+    let list: ArrayRef = ListArray::try_new(elements, offsets, Validity::NonNullable)
+        .unwrap()
+        .into_array();
+    let err = convert_columns(&[list], &[RowSortField::default()], &mut ctx)
+        .expect_err("List should not be accepted");
+    assert!(
+        err.to_string().contains("List"),
+        "expected error mentioning List, got: {err}"
+    );
+}
From 083c7f3565b49ce519a9988f3e8065c237d63c05 Mon Sep 17 00:00:00 2001
From: Joe Isaacs
Date: Thu, 4 Jun 2026 15:26:39 +0000
Subject: [PATCH 02/19] ci(vortex-row): run row_encode benchmarks on CodSpeed

Add a CodSpeed shard for `vortex-row` so the `row_encode` divan benchmarks
(vortex vs arrow-row) build and run in CI alongside the other crates.

Signed-off-by: Joe Isaacs
---
 .github/workflows/codspeed.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index 50a5cbaa75a..b741aaaf41d 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -35,6 +35,7 @@ jobs:
           - { shard: 6, name: "Encodings 3", packages: "vortex-pco vortex-runend vortex-sequence" }
           - { shard: 7, name: "Encodings 4", packages: "vortex-sparse vortex-zigzag vortex-zstd" }
           - { shard: 8, name: "Storage formats", packages: "vortex-flatbuffers vortex-proto vortex-btrblocks" }
+          - { shard: 9, name: "Row encoding", packages: "vortex-row" }
     name: "Benchmark with Codspeed (Shard #${{ matrix.shard }})"
     timeout-minutes: 30
     runs-on: >-
From 37936e27c6a6d44031512883b0deb5b4ea3aabf3 Mon Sep 17 00:00:00 2001
From: Joe Isaacs
Date: Thu, 4 Jun 2026 17:24:26 +0000
Subject: [PATCH 03/19] vortex-row: skip ListView validation in row encoder output

The row encoder builds the output `(elements, offsets, sizes)` triple itself,
so the invariants `ListViewArray::try_new` validates (monotone offsets, per-row
slices within bounds and disjoint) already hold by construction. Skip the
revalidation walk via `new_unchecked`.
Signed-off-by: Joe Isaacs --- vortex-row/src/encode.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index d3721e49a6e..e7a1569739a 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -186,8 +186,16 @@ fn execute_row_encode( let offsets_arr = PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array(); let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array(); - Ok( - ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)? - .into_array(), - ) + // SAFETY: this encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself: + // - `elements` is a `PrimitiveArray` of length `total_len`. + // - `offsets_arr[i]` is `i * fixed_per_row + var_prefix[i]`, monotonically increasing and + // in `0..=total_len`. + // - `offsets_arr[i] + sizes_arr[i] <= total_len` by construction, and each row's slice is + // disjoint from every other row's. + // `try_new`'s validation re-walks every row to check exactly these invariants, which we + // already guarantee by construction, so we skip it. + Ok(unsafe { + ListViewArray::new_unchecked(elements, offsets_arr, sizes_arr, Validity::NonNullable) + } + .into_array()) } From 48b92d13df0770897ee70ae378623d5dc566548d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:26:18 +0000 Subject: [PATCH 04/19] vortex-row: add validity fast-path helper for the hot encoders Introduce `ValidityKind`/`resolve_validity`: resolve a column's validity once, materializing the per-row mask only when the column may actually contain nulls. The size pass for varbinview and the bool and primitive encoders now branch once on validity, so the all-valid path drops the per-row `mask.value(i)` check (and mask allocation) entirely. 
Signed-off-by: Joe Isaacs
---
 vortex-row/src/codec.rs | 142 +++++++++++++++++++++++++++++-----------
 1 file changed, 104 insertions(+), 38 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 2818db62aba..ee9fd4578c6 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -43,6 +43,7 @@ use vortex_array::dtype::DecimalType;
 use vortex_array::dtype::NativePType;
 use vortex_array::dtype::half::f16;
 use vortex_array::match_each_native_ptype;
+use vortex_array::validity::Validity;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
@@ -87,6 +88,32 @@ fn byte_width_u32(width: usize) -> u32 {
     u32::try_from(width).vortex_expect("native byte width must fit in u32")
 }
 
+/// Pre-resolved per-row validity for the row encoders.
+///
+/// Encoders pattern-match on this once before their inner loop so the no-nulls fast path
+/// avoids per-row `mask.value(i)` branches entirely, and the nullable path materializes the
+/// mask exactly once.
+pub(crate) enum ValidityKind {
+    /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask needed.
+    AllValid,
+    /// Column may have nulls; carries the materialized per-row mask.
+    Mask(vortex_mask::Mask),
+}
+
+/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when the column
+/// may actually have nulls.
+#[inline]
+pub(crate) fn resolve_validity(
+    validity: Validity,
+    len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ValidityKind> {
+    Ok(match validity {
+        Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid,
+        other => ValidityKind::Mask(other.execute_mask(len, ctx)?),
+    })
+}
+
 /// Returns the sentinel byte for a null varlen value.
/// /// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and @@ -306,19 +333,34 @@ fn add_size_varbinview( sizes: &mut [u32], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; let views = arr.views(); - for (i, view) in views.iter().enumerate() { - let contribution = if !mask.value(i) { - VARLEN_NULL_SIZE - } else if view.is_empty() { - VARLEN_EMPTY_SIZE - } else { - encoded_size_for_non_empty_varlen(view.len() as usize) - }; - sizes[i] = sizes[i] - .checked_add(contribution) - .vortex_expect("per-row size overflow"); + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { + ValidityKind::AllValid => { + for (i, view) in views.iter().enumerate() { + let contribution = if view.is_empty() { + VARLEN_EMPTY_SIZE + } else { + encoded_size_for_non_empty_varlen(view.len() as usize) + }; + sizes[i] = sizes[i] + .checked_add(contribution) + .vortex_expect("per-row size overflow"); + } + } + ValidityKind::Mask(mask) => { + for (i, view) in views.iter().enumerate() { + let contribution = if !mask.value(i) { + VARLEN_NULL_SIZE + } else if view.is_empty() { + VARLEN_EMPTY_SIZE + } else { + encoded_size_for_non_empty_varlen(view.len() as usize) + }; + sizes[i] = sizes[i] + .checked_add(contribution) + .vortex_expect("per-row size overflow"); + } + } } Ok(()) } @@ -443,23 +485,35 @@ fn encode_bool( out: &mut [u8], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; let bits = arr.clone().into_bit_buffer(); let non_null = field.non_null_sentinel(); - let null = field.null_sentinel(); let xor = if field.descending { 0xFF } else { 0x00 }; - for i in 0..bits.len() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - if mask.value(i) { - out[pos] = non_null; - // false=0x01, true=0x02 so false < true; XOR for descending - let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; - out[pos + 1] = raw ^ xor; 
- } else { - out[pos] = null; - out[pos + 1] = 0; + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { + ValidityKind::AllValid => { + for i in 0..bits.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = non_null; + // false=0x01, true=0x02 so false < true; XOR for descending + let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; + out[pos + 1] = raw ^ xor; + col_offset[i] += BOOL_ENCODED_SIZE; + } + } + ValidityKind::Mask(mask) => { + let null = field.null_sentinel(); + for i in 0..bits.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; + out[pos + 1] = raw ^ xor; + } else { + out[pos] = null; + out[pos + 1] = 0; + } + col_offset[i] += BOOL_ENCODED_SIZE; + } } - col_offset[i] += BOOL_ENCODED_SIZE; } Ok(()) } @@ -486,24 +540,36 @@ fn encode_primitive_typed( out: &mut [u8], ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; let slice: &[T] = arr.as_slice(); let non_null = field.non_null_sentinel(); - let null = field.null_sentinel(); let value_bytes = size_of::(); - for (i, &v) in slice.iter().enumerate() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - if mask.value(i) { - out[pos] = non_null; - v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); - } else { - out[pos] = null; - // Zero-fill the value bytes. - for b in &mut out[pos + 1..pos + 1 + value_bytes] { - *b = 0; + let stride = encoded_size_for_fixed(byte_width_u32(value_bytes)); + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ + ValidityKind::AllValid => { + for (i, &v) in slice.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = non_null; + v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + col_offset[i] += stride; + } + } + ValidityKind::Mask(mask) => { + let null = field.null_sentinel(); + for (i, &v) in slice.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + // Zero-fill the value bytes. + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += stride; } } - col_offset[i] += encoded_size_for_fixed(byte_width_u32(value_bytes)); } Ok(()) } From 578495d9c6b72ee4936c7c9e94fd305e7bee41e4 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:28:18 +0000 Subject: [PATCH 05/19] vortex-row: skip zero-init of the output buffer Every byte of the output range is written by some encoder: fixed-width null rows write sentinel + explicit zero-fill, varlen encoders zero-pad their final partial block, and struct/FSL null parent bodies are overwritten with the canonical null encoding. The pre-zero-init memset is therefore redundant, so replace it with `set_len`, saving a `total_len`-byte memset per call. Signed-off-by: Joe Isaacs --- vortex-row/src/encode.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index e7a1569739a..e6c9f5e2443 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -131,10 +131,19 @@ fn execute_row_encode( let total_len = usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize"); - // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder - // assume previously untouched bytes are zero, simplifying the null-row fill paths. 
let mut out_buf: BufferMut = BufferMut::with_capacity(total_len); - out_buf.push_n(0u8, total_len); + // Every encoder writes every byte in its row range: fixed-width values write + // sentinel + value (null rows write sentinel + explicit zero-fill); varlen blocks + // zero-pad their final partial block; struct/FSL fixed children are written for all + // rows then null parent rows are overwritten with the canonical null body. So the + // size-pass + encoder contract guarantees `[0, total_len)` is fully written before + // the buffer is read out, making the pre-zero-init redundant. Skipping it saves a + // `total_len`-byte memset per call (significant for varlen-heavy inputs, where + // `total_len` reaches multiple MB). + // + // SAFETY: `total_len` bytes of capacity were just reserved, and by the contract above + // every byte in that range is written before `out_buf` is frozen and read. + unsafe { out_buf.set_len(total_len) }; // ===== Phase 3: per-row offsets ===== // listview_offsets[i] is the absolute byte offset where row `i` begins. From 6e401b97ba7bb7806ac852665f03ce2804ba7dc0 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:29:48 +0000 Subject: [PATCH 06/19] vortex-row: auto-vectorize pure-fixed offsets construction Materialize the listview offsets buffer with `set_len` + a slice write instead of per-row `push`. For the pure-fixed path, `iter_mut().enumerate()` lets LLVM auto-vectorize `offsets[i] = i * fixed_per_row` (no per-element bounds or capacity checks). `nrows` is validated to fit u32 at function entry, so the cast is exact. Signed-off-by: Joe Isaacs --- vortex-row/src/encode.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index e6c9f5e2443..4862678d31d 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -150,22 +150,24 @@ fn execute_row_encode( // For pure-fixed: i * fixed_per_row. 
// For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths. // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end. - let nrows_u32 = - u32::try_from(nrows).vortex_expect("nrows fits u32 (validated earlier in this function)"); let mut listview_offsets: BufferMut = BufferMut::with_capacity(nrows); + // SAFETY: `nrows` of capacity reserved above; every index in `[0, nrows)` is written + // before the buffer is read out. `nrows` was validated to fit `u32` at function entry, + // so `i as u32` below is exact and the multiplications can't overflow. + unsafe { listview_offsets.set_len(nrows) }; + let off = listview_offsets.as_mut_slice(); match var_lengths.as_ref() { None => { - for row_idx in 0..nrows_u32 { - // Total bytes already fit in u32, so row_idx * fixed_per_row also does. - listview_offsets.push(row_idx * fixed_per_row); + // Pure-fixed: offsets[i] = i * fixed_per_row. `iter_mut().enumerate()` elides + // per-element bounds checks, so LLVM auto-vectorizes this multiply. + for (i, slot) in off.iter_mut().enumerate() { + *slot = (i as u32) * fixed_per_row; } } Some(v) => { let mut acc: u32 = 0; - for (row_idx, &l) in (0..nrows_u32).zip(v.iter()) { - // The arithmetic below cannot overflow because we already verified the - // total fits in u32. - listview_offsets.push(row_idx * fixed_per_row + acc); + for (i, &l) in v.iter().enumerate() { + off[i] = (i as u32) * fixed_per_row + acc; acc += l; } } From bd781d7f6460fd61984360d18223cc2e4f9730bb Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:30:10 +0000 Subject: [PATCH 07/19] vortex-row: auto-vectorize mixed-path offsets construction Write the mixed (fixed + varlen) offsets through `iter_mut().zip` with wrapping arithmetic, mirroring the pure-fixed path: this elides per-element bounds checks so the `i * fixed_per_row` multiply auto-vectorizes while the varlen prefix sum stays a cheap sequential accumulator. 
The total is validated to fit u32 upstream, so the wrapping operations never actually wrap. Signed-off-by: Joe Isaacs --- vortex-row/src/encode.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index 4862678d31d..a82f571af86 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -165,10 +165,14 @@ fn execute_row_encode( } } Some(v) => { + // Mixed: offsets[i] = i * fixed_per_row + var_prefix[i], where var_prefix is the + // exclusive cumsum of varlen lengths. `iter_mut().zip` elides per-element bounds + // checks; the total was validated to fit u32 upstream so the wrapping arithmetic + // is exact (it never actually wraps). let mut acc: u32 = 0; - for (i, &l) in v.iter().enumerate() { - off[i] = (i as u32) * fixed_per_row + acc; - acc += l; + for (i, (slot, &l)) in off.iter_mut().zip(v.iter()).enumerate() { + *slot = (i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc); + acc = acc.wrapping_add(l); } } } From bb39136307dce1c6a362d3582f93eb1fc5f9595d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:32:06 +0000 Subject: [PATCH 08/19] vortex-row: rewrite varlen 32-byte block encoder with copy_nonoverlapping The varlen body writer was a per-byte XOR loop. Split it into an ascending fast path (`copy_nonoverlapping` of each 32-byte block plus a single stamped continuation byte, then a partial final block) and a descending path that XORs a u64 at a time via `xor_copy_block` for a vectorizable inner loop. The emitted bytes are identical to the previous implementation for every length and direction (full-block counts and final length byte match exactly); only the write strategy changes. 
Signed-off-by: Joe Isaacs --- vortex-row/src/codec.rs | 99 ++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 22 deletions(-) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index ee9fd4578c6..daf7c4efd48 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -926,33 +926,88 @@ fn encode_extension( /// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with /// continuation/length markers. Returns the number of bytes written. Empty values are /// encoded by the caller as a single sentinel byte and never reach this function. +/// +/// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block +/// plus one stamped continuation byte. For the descending path it reads a u64 at a time and +/// XORs with `0xFF`, giving LLVM a vectorizable inner loop. fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { debug_assert!(!bytes.is_empty()); - let xor = if descending { 0xFFu8 } else { 0x00 }; - let mut written = 0usize; - let mut remaining = bytes; - while remaining.len() > VARLEN_BLOCK_SIZE { - // Full block, continuation marker 0xFF (then XORed if descending). - let block = &remaining[..VARLEN_BLOCK_SIZE]; - for (i, &b) in block.iter().enumerate() { - out[written + i] = b ^ xor; + let len = bytes.len(); + let full_blocks = len / VARLEN_BLOCK_SIZE; + let partial = len % VARLEN_BLOCK_SIZE; + let (full_to_write, partial_block_len) = if partial == 0 { + // Length is an exact multiple of 32: emit (full_blocks - 1) full blocks with the + // 0xFF continuation marker, then a final block whose continuation byte is 32. 
+ (full_blocks - 1, VARLEN_BLOCK_SIZE) + } else { + (full_blocks, partial) + }; + let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL; + debug_assert!(out.len() >= total); + + // SAFETY: `out` has at least `total` bytes — the caller sizes every varlen slot via + // `encoded_size_for_non_empty_varlen` (which equals `1 + total`, the extra byte being the + // leading sentinel that the caller wrote and that is not part of `out`). `bytes` is valid + // for `len` reads, and every pointer advance below stays within `[0, total)` for `dst` + // and `[0, len)` for `src`. + unsafe { + let mut src = bytes.as_ptr(); + let mut dst = out.as_mut_ptr(); + + if !descending { + // Ascending fast path: each full block is a 32-byte memcpy + a single 0xFF stamp. + for _ in 0..full_to_write { + std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE); + *dst.add(VARLEN_BLOCK_SIZE) = 0xFF; + src = src.add(VARLEN_BLOCK_SIZE); + dst = dst.add(VARLEN_BLOCK_TOTAL); + } + // Final block: copy the partial data, zero-pad the tail, write the length byte. + std::ptr::copy_nonoverlapping(src, dst, partial_block_len); + std::ptr::write_bytes( + dst.add(partial_block_len), + 0, + VARLEN_BLOCK_SIZE - partial_block_len, + ); + *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8; + } else { + // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable + // inner loop; the tail handles the partial block byte-wise. 
+ for _ in 0..full_to_write { + xor_copy_block(src, dst); + *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF + src = src.add(VARLEN_BLOCK_SIZE); + dst = dst.add(VARLEN_BLOCK_TOTAL); + } + for i in 0..partial_block_len { + *dst.add(i) = *src.add(i) ^ 0xFF; + } + std::ptr::write_bytes( + dst.add(partial_block_len), + 0xFF, // 0x00 XOR 0xFF + VARLEN_BLOCK_SIZE - partial_block_len, + ); + *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF; } - out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ xor; - written += VARLEN_BLOCK_TOTAL; - remaining = &remaining[VARLEN_BLOCK_SIZE..]; - } - // Final partial block: pad with zeros, last byte = remaining.len() (1..=32). - let n = remaining.len(); - for (i, &b) in remaining.iter().enumerate() { - out[written + i] = b ^ xor; } - for j in n..VARLEN_BLOCK_SIZE { - out[written + j] = xor; + total as u32 +} + +/// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the +/// four u64-wide iterations into SIMD on x86. +/// +/// # Safety +/// `src` must be valid for 32 reads, `dst` valid for 32 writes, and the regions must not +/// overlap. +#[inline(always)] +unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) { + // Four u64 lanes of 8 bytes each = 32 bytes total. + for i in 0..4 { + let off = i * 8; + // SAFETY: the caller guarantees src/dst are valid for the full 32-byte block. + let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) }; + unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) }; } - out[written + VARLEN_BLOCK_SIZE] = - u8::try_from(n).vortex_expect("final varlen block length must fit in u8") ^ xor; - written += VARLEN_BLOCK_TOTAL; - u32::try_from(written).vortex_expect("encoded varlen byte length must fit in u32") } /// Internal trait for encoding a fixed-width native value into byte slots. 
From ee049ae050375f71b12d39f346bc730b565714d1 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:34:09 +0000 Subject: [PATCH 09/19] vortex-row: walk VarBinView rows directly in the encoder hot loop Replace the `with_iterator` traversal in `encode_varbinview` with a direct walk over the view array: cache the data-buffer slices once, then for each row read the bytes straight from the inlined view slot or the referenced buffer at `offset..offset+len`. This drops the iterator's per-row option/bounds machinery. Validity is resolved once via `resolve_validity`, keeping the no-nulls path branch-free on validity. Signed-off-by: Joe Isaacs --- vortex-row/src/codec.rs | 69 +++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index daf7c4efd48..5056ab3e6f6 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -26,7 +26,6 @@ use vortex_array::Canonical; use vortex_array::ExecutionCtx; -use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; use vortex_array::arrays::ExtensionArray; @@ -642,35 +641,71 @@ fn encode_varbinview( row_offsets: &[u32], col_offset: &mut [u32], out: &mut [u8], - _ctx: &mut ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> VortexResult<()> { let null_byte = varlen_null_sentinel(field); let empty_byte = varlen_empty_sentinel(field); let non_empty_byte = varlen_non_empty_sentinel(field); + let descending = field.descending; + + let views = arr.views(); + // Cache the data-buffer slices once. Inlined views (len <= 12) carry their bytes inline, + // so they never touch `buffers`; referenced views index into the pre-validated buffer at + // `offset..offset + len`. Walking views directly avoids the per-row bounds and branch work + // of `with_iterator`. 
+ let buffers: smallvec::SmallVec<[&[u8]; 4]> = (0..arr.data_buffers().len()) + .map(|i| arr.buffer(i).as_slice()) + .collect(); - // `with_iterator` yields `Some(bytes)` for non-null rows and `None` for null rows, - // so the iterator alone fully describes validity — no separate mask lookup needed. - arr.with_iterator(|iter| { - for (i, maybe) in iter.enumerate() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - match maybe { - None => { + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { + ValidityKind::AllValid => { + for (i, view) in views.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + let len = view.len() as usize; + if len == 0 { + out[pos] = empty_byte; + col_offset[i] += VARLEN_EMPTY_SIZE; + continue; + } + let bytes: &[u8] = if view.is_inlined() { + view.as_inlined().value() + } else { + let r = view.as_view(); + let off = r.offset as usize; + &buffers[r.buffer_index as usize][off..off + len] + }; + out[pos] = non_empty_byte; + let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending); + col_offset[i] += 1 + written; + } + } + ValidityKind::Mask(mask) => { + for (i, view) in views.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if !mask.value(i) { out[pos] = null_byte; col_offset[i] += VARLEN_NULL_SIZE; + continue; } - Some([]) => { + let len = view.len() as usize; + if len == 0 { out[pos] = empty_byte; col_offset[i] += VARLEN_EMPTY_SIZE; + continue; } - Some(bytes) => { - out[pos] = non_empty_byte; - let written = - encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], field.descending); - col_offset[i] += 1 + written; - } + let bytes: &[u8] = if view.is_inlined() { + view.as_inlined().value() + } else { + let r = view.as_view(); + let off = r.offset as usize; + &buffers[r.buffer_index as usize][off..off + len] + }; + out[pos] = non_empty_byte; + let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending); + 
col_offset[i] += 1 + written; } } - }); + } Ok(()) } From 65a24f9ed92cadf2e8f9653d94cfe19b6bc909e2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:37:41 +0000 Subject: [PATCH 10/19] vortex-row: satisfy cast-truncation lints in ported hot paths The auto-vectorized offset loops and the varlen block writer used raw `as` casts that trip this crate's `cast_possible_truncation` lint. Iterate a `u32` counter instead of casting `usize` per element, and use `u8`/`u32` `try_from` for the varlen final-block length byte and total byte count. No behavior change. Signed-off-by: Joe Isaacs --- vortex-row/src/codec.rs | 9 ++++++--- vortex-row/src/encode.rs | 14 ++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index 5056ab3e6f6..92f3bc13a0e 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -979,6 +979,9 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) }; let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL; debug_assert!(out.len() >= total); + // The final block's continuation byte encodes its content length (1..=32). + let len_byte = + u8::try_from(partial_block_len).vortex_expect("varlen final block length (1..=32) fits u8"); // SAFETY: `out` has at least `total` bytes — the caller sizes every varlen slot via // `encoded_size_for_non_empty_varlen` (which equals `1 + total`, the extra byte being the @@ -1004,7 +1007,7 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) 0, VARLEN_BLOCK_SIZE - partial_block_len, ); - *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8; + *dst.add(VARLEN_BLOCK_SIZE) = len_byte; } else { // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable // inner loop; the tail handles the partial block byte-wise. 
@@ -1022,10 +1025,10 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) 0xFF, // 0x00 XOR 0xFF VARLEN_BLOCK_SIZE - partial_block_len, ); - *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF; + *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF; } } - total as u32 + u32::try_from(total).vortex_expect("encoded varlen byte length fits u32") } /// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index a82f571af86..f789382c4a1 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -158,10 +158,12 @@ fn execute_row_encode( let off = listview_offsets.as_mut_slice(); match var_lengths.as_ref() { None => { - // Pure-fixed: offsets[i] = i * fixed_per_row. `iter_mut().enumerate()` elides - // per-element bounds checks, so LLVM auto-vectorizes this multiply. - for (i, slot) in off.iter_mut().enumerate() { - *slot = (i as u32) * fixed_per_row; + // Pure-fixed: offsets[i] = i * fixed_per_row. Zipping against a `u32` counter + // elides per-element bounds checks (and avoids a per-element `usize as u32` + // cast), so LLVM auto-vectorizes this multiply. `nrows` fits u32, so the counter + // never overflows. + for (slot, i) in off.iter_mut().zip(0u32..) { + *slot = i * fixed_per_row; } } Some(v) => { @@ -170,8 +172,8 @@ fn execute_row_encode( // checks; the total was validated to fit u32 upstream so the wrapping arithmetic // is exact (it never actually wraps). let mut acc: u32 = 0; - for (i, (slot, &l)) in off.iter_mut().zip(v.iter()).enumerate() { - *slot = (i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc); + for ((slot, &l), i) in off.iter_mut().zip(v.iter()).zip(0u32..) 
{ + *slot = i.wrapping_mul(fixed_per_row).wrapping_add(acc); acc = acc.wrapping_add(l); } } From 2711504490e8e73fe3004184aca20882adfd2f0d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 18:57:45 +0000 Subject: [PATCH 11/19] vortex-row: arithmetic-write fast path for fixed-before-varlen columns Classify each column in the size pass (`ColKind` + `first_varlen_idx`): a fixed-width column with no varlen column before it has a constant within-row offset, so its write position is pure arithmetic (`i * fixed_per_row + prefix + var_prefix[i]`) with no per-row cursor. Route those columns through `field_encode_fixed_arithmetic`; the cursor path is seeded to start at the first varlen column. Primitive columns in the pure-fixed case use a `chunks_exact_mut` hot loop (matching arrow-row's not-null path); all other fixed types reuse the cursor encoder at the computed offsets, so output is byte-identical. Signed-off-by: Joe Isaacs --- vortex-row/src/codec.rs | 115 +++++++++++++++++++++++++++++++++++++++ vortex-row/src/encode.rs | 93 +++++++++++++++++++++++++------ vortex-row/src/size.rs | 42 +++++++++++++- 3 files changed, 229 insertions(+), 21 deletions(-) diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index 92f3bc13a0e..d0cb32ce13d 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -269,6 +269,59 @@ pub(crate) fn field_size( Ok(()) } +/// Encode a fixed-width column at arithmetic offsets, without reading or writing any per-row +/// cursor. +/// +/// For row `i`, the column's bytes are written starting at `i * row_stride + col_prefix +/// (+ var_prefix[i])`, where `var_prefix` is the exclusive prefix sum of the varlen +/// contributions (`None` when the row layout has no variable-length columns). This is the +/// fast path for fixed-width columns that appear before any varlen column, so their +/// within-row position is a constant offset rather than a running cursor. 
+/// +/// For primitive columns in the pure-fixed case it uses a `chunks_exact_mut` hot loop that +/// removes the per-row offset/cursor indirection (matching `arrow-row`'s `encode_not_null`). +/// All other types reuse [`field_encode`] at the materialized offsets, so the bytes written +/// are byte-identical to the cursor path. +#[allow(clippy::too_many_arguments)] +pub(crate) fn field_encode_fixed_arithmetic( + canonical: &Canonical, + field: RowSortField, + col_prefix: u32, + row_stride: u32, + var_prefix: Option<&[u32]>, + nrows: usize, + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + if var_prefix.is_none() + && let Canonical::Primitive(arr) = canonical + { + return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx); + } + + // General path: materialize this column's per-row start offsets and reuse the cursor + // encoder with zero-initialized cursors, so every row is written at its arithmetic + // offset with the exact same bytes the cursor path would produce. + let mut offsets: Vec = Vec::with_capacity(nrows); + let mut base = col_prefix; + match var_prefix { + None => { + for _ in 0..nrows { + offsets.push(base); + base = base.wrapping_add(row_stride); + } + } + Some(vp) => { + for &p in vp.iter().take(nrows) { + offsets.push(base.wrapping_add(p)); + base = base.wrapping_add(row_stride); + } + } + } + let mut cursors = vec![0u32; nrows]; + field_encode(canonical, field, &offsets, &mut cursors, out, ctx) +} + /// Encode each row's bytes for the given canonical view into `out`, writing starting at /// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of /// bytes written. 
@@ -958,6 +1011,68 @@ fn encode_extension( field_encode(&storage, field, row_offsets, col_offset, out, ctx) } +/// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a +/// constant within-row offset, iterating the output in `row_stride`-sized chunks so the +/// compiler can drop the per-row offset/cursor indirection. +fn encode_primitive_arith( + arr: &PrimitiveArray, + field: RowSortField, + col_prefix: u32, + row_stride: u32, + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match_each_native_ptype!(arr.ptype(), |T| { + encode_primitive_arith_typed::(arr, field, col_prefix, row_stride, out, ctx)?; + }); + Ok(()) +} + +fn encode_primitive_arith_typed( + arr: &PrimitiveArray, + field: RowSortField, + col_prefix: u32, + row_stride: u32, + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let slice: &[T] = arr.as_slice(); + let non_null = field.non_null_sentinel(); + let value_bytes = size_of::(); + let slot_size = 1 + value_bytes; + let stride = row_stride as usize; + let prefix = col_prefix as usize; + let descending = field.descending; + + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { + ValidityKind::AllValid => { + // Hot path: each row's slot is a fixed window inside its `stride`-sized chunk, + // so the inner write vectorizes the same way as `arrow-row`'s not-null path. + for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) { + let slot = &mut chunk[prefix..prefix + slot_size]; + slot[0] = non_null; + v.encode_to(&mut slot[1..], descending); + } + } + ValidityKind::Mask(mask) => { + let null = field.null_sentinel(); + for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() { + let slot = &mut chunk[prefix..prefix + slot_size]; + if mask.value(i) { + slot[0] = non_null; + v.encode_to(&mut slot[1..], descending); + } else { + slot[0] = null; + for b in &mut slot[1..] 
{ + *b = 0; + } + } + } + } + } + Ok(()) +} + /// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with /// continuation/length markers. Returns the number of bytes written. Empty values are /// encoded by the caller as a single sentinel byte and never reach this function. diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index f789382c4a1..46a4be778d4 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -34,6 +34,7 @@ use crate::codec; use crate::options::RowEncodingOptions; use crate::options::deserialize_row_encoding_options; use crate::options::serialize_row_encoding_options; +use crate::size::ColKind; use crate::size::compute_sizes; /// Variadic scalar function that encodes N input columns into a single `List` @@ -112,6 +113,8 @@ fn execute_row_encode( let crate::size::SizePassResult { fixed_per_row, var_lengths, + col_kinds, + first_varlen_idx, columns, } = compute_sizes(options, args, ctx)?; @@ -149,53 +152,107 @@ fn execute_row_encode( // listview_offsets[i] is the absolute byte offset where row `i` begins. // For pure-fixed: i * fixed_per_row. // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths. + // + // When fixed-before-varlen columns coexist with a varlen column, we additionally build + // `var_prefix_for_arith[i] = exclusive cumsum of var_lengths[..i]` and hand it to the + // arithmetic encoders so they can compute per-row write positions without a cursor. + let need_arith_prefix = first_varlen_idx.is_some() + && col_kinds.iter().any(|k| { + matches!( + k, + ColKind::Fixed { + before_varlen: true, + .. + } + ) + }); + // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end. let mut listview_offsets: BufferMut = BufferMut::with_capacity(nrows); // SAFETY: `nrows` of capacity reserved above; every index in `[0, nrows)` is written // before the buffer is read out. 
`nrows` was validated to fit `u32` at function entry, - // so `i as u32` below is exact and the multiplications can't overflow. + // so the `0u32..` counters below are exact and the multiplications can't overflow. unsafe { listview_offsets.set_len(nrows) }; let off = listview_offsets.as_mut_slice(); + let mut var_prefix_for_arith: Option> = None; match var_lengths.as_ref() { None => { // Pure-fixed: offsets[i] = i * fixed_per_row. Zipping against a `u32` counter - // elides per-element bounds checks (and avoids a per-element `usize as u32` - // cast), so LLVM auto-vectorizes this multiply. `nrows` fits u32, so the counter - // never overflows. + // elides per-element bounds checks, so LLVM auto-vectorizes this multiply. for (slot, i) in off.iter_mut().zip(0u32..) { *slot = i * fixed_per_row; } } Some(v) => { // Mixed: offsets[i] = i * fixed_per_row + var_prefix[i], where var_prefix is the - // exclusive cumsum of varlen lengths. `iter_mut().zip` elides per-element bounds - // checks; the total was validated to fit u32 upstream so the wrapping arithmetic - // is exact (it never actually wraps). + // exclusive cumsum of varlen lengths. The total was validated to fit u32 upstream + // so the wrapping arithmetic is exact (it never actually wraps). + let mut vp: Option> = need_arith_prefix.then(|| Vec::with_capacity(nrows)); let mut acc: u32 = 0; for ((slot, &l), i) in off.iter_mut().zip(v.iter()).zip(0u32..) { + if let Some(p) = vp.as_mut() { + p.push(acc); + } *slot = i.wrapping_mul(fixed_per_row).wrapping_add(acc); acc = acc.wrapping_add(l); } + var_prefix_for_arith = vp; } } let listview_offsets_slice: &[u32] = listview_offsets.as_slice(); // Per-row write cursor (also doubles as the ListView `sizes` slot when done). We build // it as a BufferMut so we can hand it directly to the output PrimitiveArray. + // + // The cursor path begins at the first cursor-path column. 
Fixed-before-varlen columns + // are written by the arithmetic path and do not touch the cursor, so the cursor is + // pre-seeded with the within-row offset of the first varlen column (its `fixed_prefix`). + // When there are no varlen columns at all, every column takes the arithmetic path and + // the cursor loop runs zero iterations; seeding with `fixed_per_row` then leaves the + // cursors already correct as per-row sizes. + let initial_cursor: u32 = match first_varlen_idx { + Some(idx) => match col_kinds[idx] { + ColKind::Variable { fixed_prefix } => fixed_prefix, + ColKind::Fixed { .. } => unreachable!("first_varlen_idx points at a varlen column"), + }, + None => fixed_per_row, + }; let mut row_cursors: BufferMut = BufferMut::with_capacity(nrows); - row_cursors.push_n(0u32, nrows); + row_cursors.push_n(initial_cursor, nrows); - // ===== Phase 4: encode columns via the cursor path ===== - // Each column was canonicalized once during the size pass; reuse that canonical form. + // ===== Phase 4: encode columns ===== + // Fixed-before-varlen columns take the arithmetic-write path (constant within-row + // offset, no cursor mutation). Fixed-after-varlen and varlen columns take the cursor + // path. Each column was canonicalized once during the size pass; reuse that form. for (i, canonical) in columns.iter().enumerate() { - codec::field_encode( - canonical, - options.fields[i], - listview_offsets_slice, - row_cursors.as_mut_slice(), - &mut out_buf, - ctx, - )?; + match col_kinds[i] { + ColKind::Fixed { + prefix, + before_varlen: true, + .. + } => { + codec::field_encode_fixed_arithmetic( + canonical, + options.fields[i], + prefix, + fixed_per_row, + var_prefix_for_arith.as_deref(), + nrows, + &mut out_buf, + ctx, + )?; + } + ColKind::Fixed { .. } | ColKind::Variable { .. 
} => { + codec::field_encode( + canonical, + options.fields[i], + listview_offsets_slice, + row_cursors.as_mut_slice(), + &mut out_buf, + ctx, + )?; + } + } } // ===== Phase 5: build ListView output ===== diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index 26269081ce7..860fe3c2a2c 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -36,6 +36,24 @@ use crate::options::RowEncodingOptions; use crate::options::deserialize_row_encoding_options; use crate::options::serialize_row_encoding_options; +/// Classification of a single input column for the size pass. +/// +/// Tracks each column's within-row byte offset (the constant prefix from all preceding +/// fixed-width columns) and, for fixed columns, whether any variable-length column has +/// appeared yet — the encode pass uses this to choose between the arithmetic-write fast +/// path (no varlen before this column, so the within-row position is constant per row) and +/// the cursor-write path. +#[derive(Clone, Copy, Debug)] +pub(crate) enum ColKind { + /// Fixed-width column. `prefix` is the within-row byte offset of this column's first + /// byte. When `before_varlen` is true no variable-length column precedes this one, so the + /// within-row offset is constant for every row. + Fixed { prefix: u32, before_varlen: bool }, + /// Column has variable per-row width. `fixed_prefix` is the sum of widths of all + /// preceding fixed columns; the contribution of earlier varlen columns is added per row. + Variable { fixed_prefix: u32 }, +} + /// Result of the size pass: enough information for both [`RowSize::execute`] and the /// downstream [`RowEncode`](super::encode::RowEncode) pipeline. 
/// @@ -45,6 +63,8 @@ use crate::options::serialize_row_encoding_options; pub(crate) struct SizePassResult { pub fixed_per_row: u32, pub var_lengths: Option>, + pub col_kinds: Vec, + pub first_varlen_idx: Option, pub columns: Vec, } @@ -77,8 +97,11 @@ pub(crate) fn compute_sizes( let nrows = args.row_count(); let mut columns: Vec = Vec::with_capacity(n_inputs); + let mut col_kinds: Vec = Vec::with_capacity(n_inputs); let mut fixed_per_row: u32 = 0; let mut var_lengths: Option> = None; + let mut first_varlen_idx: Option = None; + let mut running_fixed_prefix: u32 = 0; for i in 0..n_inputs { let col = args.get(i)?; @@ -95,13 +118,24 @@ pub(crate) fn compute_sizes( let canonical = col.execute::(ctx)?; match width { RowWidth::Fixed(w) => { - fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(|| { - vortex_error::vortex_err!("per-row fixed width overflows u32 at column {}", i) - })?; + col_kinds.push(ColKind::Fixed { + prefix: running_fixed_prefix, + before_varlen: first_varlen_idx.is_none(), + }); + let overflow = + || vortex_error::vortex_err!("per-row fixed width overflows u32 at column {i}"); + fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(overflow)?; + running_fixed_prefix = running_fixed_prefix.checked_add(w).ok_or_else(overflow)?; } RowWidth::Variable => { + if first_varlen_idx.is_none() { + first_varlen_idx = Some(i); + } let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]); codec::field_size(&canonical, options.fields[i], v, ctx)?; + col_kinds.push(ColKind::Variable { + fixed_prefix: running_fixed_prefix, + }); } } columns.push(canonical); @@ -110,6 +144,8 @@ pub(crate) fn compute_sizes( Ok(SizePassResult { fixed_per_row, var_lengths, + col_kinds, + first_varlen_idx, columns, }) } From 2fc07fa1accf9541d469c757b10a960c3e1f488f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 5 Jun 2026 08:59:19 +0000 Subject: [PATCH 12/19] ci(vortex-row): fold row_encode benchmarks into CodSpeed shard 8 Run the vortex-row row_encode benchmarks as part 
of the existing 'Storage formats' shard rather than adding a dedicated ninth shard. Signed-off-by: Joe Isaacs --- .github/workflows/codspeed.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index b741aaaf41d..81b37e510b8 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -34,8 +34,7 @@ jobs: - { shard: 5, name: "Encodings 2", packages: "vortex-decimal-byte-parts vortex-fastlanes vortex-fsst", features: "--features _test-harness" } - { shard: 6, name: "Encodings 3", packages: "vortex-pco vortex-runend vortex-sequence" } - { shard: 7, name: "Encodings 4", packages: "vortex-sparse vortex-zigzag vortex-zstd" } - - { shard: 8, name: "Storage formats", packages: "vortex-flatbuffers vortex-proto vortex-btrblocks" } - - { shard: 9, name: "Row encoding", packages: "vortex-row" } + - { shard: 8, name: "Storage formats & row encoding", packages: "vortex-flatbuffers vortex-proto vortex-btrblocks vortex-row" } name: "Benchmark with Codspeed (Shard #${{ matrix.shard }})" timeout-minutes: 30 runs-on: >- From b97b7e4f66c5e01868d989f066bc2d587a9c8d4d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 21:58:52 +0000 Subject: [PATCH 13/19] vortex-row: benchmark FSST row-encoding (unpack-then-convert vs phases) FSST is not order-preserving, so row keys must be the decompressed bytes; the only strategy today is decompress to a canonical VarBinView then row-encode it. This bench measures that path and its two phases (decompress-only, and row-encode of an already-decompressed column) on compressible multi-block strings, to quantify the opportunity for a future fused FSST row-encode kernel: the phases are additive (decompress ~46%, row-encode ~54%), and the row-encode phase re-reads/re-writes the decompressed bytes a fused kernel could emit once. 
Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-row/Cargo.toml | 5 ++ vortex-row/benches/fsst_row_encode.rs | 120 ++++++++++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 vortex-row/benches/fsst_row_encode.rs diff --git a/Cargo.lock b/Cargo.lock index 967f0a18a09..bf24dafe3dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9997,6 +9997,7 @@ dependencies = [ "vortex-array", "vortex-buffer", "vortex-error", + "vortex-fsst", "vortex-mask", "vortex-session", ] diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml index 9222c7d6a43..e58a48f16e7 100644 --- a/vortex-row/Cargo.toml +++ b/vortex-row/Cargo.toml @@ -35,7 +35,12 @@ mimalloc = { workspace = true } rand = { workspace = true } rstest = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } +vortex-fsst = { workspace = true } [[bench]] name = "row_encode" harness = false + +[[bench]] +name = "fsst_row_encode" +harness = false diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs new file mode 100644 index 00000000000..750e23faa1e --- /dev/null +++ b/vortex-row/benches/fsst_row_encode.rs @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![expect(clippy::unwrap_used)] + +//! Row-encoding an FSST-compressed string column: the only realizable strategy is +//! "unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it), +//! because FSST is **not order-preserving** — its 1-byte codes are assigned by compression +//! gain, not by value, so the compressed bytes cannot be compared lexicographically. A +//! hypothetical "direct" kernel could only *fuse* decompression with row-key emission; it +//! still has to expand every symbol. +//! +//! These benchmarks measure the full path and its two phases so the fusion opportunity is +//! quantifiable: +//! * `fsst_unpack_then_convert` — decompress + row-encode (the status quo). 
+//! * `fsst_decompress_only` — decompress alone (the irreducible floor: a direct kernel +//! must still produce these bytes). +//! * `plain_row_encode_only` — row-encode an already-decompressed `VarBinView` (the part +//! a fused kernel would overlap with decompression; its writes into the intermediate +//! buffer + views are what fusion removes). + +use divan::counter::BytesCount; +use mimalloc::MiMalloc; +use rand::RngExt; +use rand::SeedableRng; +use rand::rngs::StdRng; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::VarBinArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; +use vortex_row::RowEncoder; + +#[global_allocator] +static GLOBAL: MiMalloc = MiMalloc; + +const N: usize = 100_000; +const AVG_LEN: usize = 64; +const UNIQUE_CHARS: u8 = 8; + +/// Generate compressible, multi-block (>32 byte) strings over a small alphabet so FSST finds +/// a strong symbol table — the regime where a direct kernel would matter most. 
+fn generate_strings() -> (VarBinArray, u64) { + let mut rng = StdRng::seed_from_u64(0); + let mut strings = Vec::with_capacity(N); + let mut total_bytes: u64 = 0; + for _ in 0..N { + let len = AVG_LEN * rng.random_range(50..=150) / 100; + total_bytes += len as u64; + let s = (0..len) + .map(|_| rng.random_range(b'a'..(b'a' + UNIQUE_CHARS)) as char) + .collect::() + .into_bytes(); + strings.push(Some(s.into_boxed_slice())); + } + let arr = VarBinArray::from_iter(strings, DType::Binary(Nullability::NonNullable)); + (arr, total_bytes) +} + +fn build_fsst() -> (ArrayRef, u64) { + let (arr, total_bytes) = generate_strings(); + let compressor = fsst_train_compressor(&arr); + let len = arr.len(); + let dtype = arr.dtype().clone(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let fsst = fsst_compress(arr, len, &dtype, &compressor, &mut ctx).into_array(); + (fsst, total_bytes) +} + +fn decompress(fsst: &ArrayRef) -> ArrayRef { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + fsst.clone() + .execute::(&mut ctx) + .unwrap() + .into_array() +} + +fn main() { + divan::main(); +} + +/// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it. +#[divan::bench] +fn fsst_unpack_then_convert(bencher: divan::Bencher) { + let (fsst, total_bytes) = build_fsst(); + let encoder = RowEncoder::default(); + bencher.counter(BytesCount::new(total_bytes)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let decoded = fsst.clone().execute::(&mut ctx).unwrap().into_array(); + encoder.encode(&[decoded], &mut ctx).unwrap() + }); +} + +/// Irreducible floor: FSST decompression alone (a direct kernel must still produce these +/// bytes, since the sort key *is* the decompressed bytes). 
+#[divan::bench] +fn fsst_decompress_only(bencher: divan::Bencher) { + let (fsst, total_bytes) = build_fsst(); + bencher + .counter(BytesCount::new(total_bytes)) + .bench_local(|| decompress(&fsst)); +} + +/// Row-encode an already-decompressed `VarBinView`. The writes into the decompressed buffer + +/// views that precede this step are what a fused direct kernel would eliminate. +#[divan::bench] +fn plain_row_encode_only(bencher: divan::Bencher) { + let (fsst, total_bytes) = build_fsst(); + let decoded = decompress(&fsst); + let encoder = RowEncoder::default(); + bencher.counter(BytesCount::new(total_bytes)).bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + encoder.encode(std::slice::from_ref(&decoded), &mut ctx).unwrap() + }); +} From d6f1f4ecb11c0c24566aa43f5aa4993e5fb17c64 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 22:04:16 +0000 Subject: [PATCH 14/19] vortex-row: rustfmt the fsst row-encode benchmark Apply nightly rustfmt formatting to the FSST benchmark added in the previous commit. 
Signed-off-by: Joe Isaacs --- vortex-row/benches/fsst_row_encode.rs | 28 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs index 750e23faa1e..58c9eed9843 100644 --- a/vortex-row/benches/fsst_row_encode.rs +++ b/vortex-row/benches/fsst_row_encode.rs @@ -89,11 +89,17 @@ fn main() { fn fsst_unpack_then_convert(bencher: divan::Bencher) { let (fsst, total_bytes) = build_fsst(); let encoder = RowEncoder::default(); - bencher.counter(BytesCount::new(total_bytes)).bench_local(|| { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let decoded = fsst.clone().execute::(&mut ctx).unwrap().into_array(); - encoder.encode(&[decoded], &mut ctx).unwrap() - }); + bencher + .counter(BytesCount::new(total_bytes)) + .bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let decoded = fsst + .clone() + .execute::(&mut ctx) + .unwrap() + .into_array(); + encoder.encode(&[decoded], &mut ctx).unwrap() + }); } /// Irreducible floor: FSST decompression alone (a direct kernel must still produce these @@ -113,8 +119,12 @@ fn plain_row_encode_only(bencher: divan::Bencher) { let (fsst, total_bytes) = build_fsst(); let decoded = decompress(&fsst); let encoder = RowEncoder::default(); - bencher.counter(BytesCount::new(total_bytes)).bench_local(|| { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - encoder.encode(std::slice::from_ref(&decoded), &mut ctx).unwrap() - }); + bencher + .counter(BytesCount::new(total_bytes)) + .bench_local(|| { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + encoder + .encode(std::slice::from_ref(&decoded), &mut ctx) + .unwrap() + }); } From 43915bd9c915e314c899be0c7aca3f260f4a132e Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 22:12:10 +0000 Subject: [PATCH 15/19] vortex-row: add fused fast-path FSST row-encode benchmark Adds `fsst_fast_fused`: bulk-decompresses the FSST code heap straight into 
a contiguous buffer (no intermediate VarBinViewArray) and block-encodes rows directly into the row-key ListView using the stored uncompressed_lengths (free size pass), with the same no-zero-init / no-extra-copy techniques as the row encoder. Lets us compare the fused path head-to-head against decode-then-convert. Signed-off-by: Joe Isaacs --- vortex-row/benches/fsst_row_encode.rs | 135 +++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs index 58c9eed9843..e7ee9cf221e 100644 --- a/vortex-row/benches/fsst_row_encode.rs +++ b/vortex-row/benches/fsst_row_encode.rs @@ -1,7 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#![expect(clippy::unwrap_used)] +#![expect( + clippy::unwrap_used, + clippy::expect_used, + clippy::cast_possible_truncation +)] //! Row-encoding an FSST-compressed string column: the only realizable strategy is //! 
"unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it), @@ -29,9 +33,17 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBufferMut; +use vortex_fsst::FSST; +use vortex_fsst::FSSTArrayExt; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; use vortex_row::RowEncoder; @@ -80,6 +92,117 @@ fn decompress(fsst: &ArrayRef) -> ArrayRef { .into_array() } +const VARLEN_BLOCK: usize = 32; +const VARLEN_BLOCK_TOTAL: usize = 33; +// Sentinel for a non-empty varlen value (ascending, non-null) — value is irrelevant to timing. +const NON_EMPTY_SENTINEL: u8 = 0x02; + +/// Encoded row-key length for a non-empty value of `len` decompressed bytes: a leading +/// sentinel plus `ceil(len/32)` 32-byte blocks, each followed by a continuation/length byte. +fn encoded_len(len: usize) -> u32 { + if len == 0 { + 1 + } else { + 1 + (len.div_ceil(VARLEN_BLOCK) as u32) * VARLEN_BLOCK_TOTAL as u32 + } +} + +/// Block-encode `bytes` (ascending) into `out`, matching vortex-row's varlen body format. 
+fn block_encode(bytes: &[u8], out: &mut [u8]) { + let len = bytes.len(); + let full = len / VARLEN_BLOCK; + let partial = len % VARLEN_BLOCK; + let (full_to_write, partial_len) = if partial == 0 { + (full - 1, VARLEN_BLOCK) + } else { + (full, partial) + }; + let mut src = 0; + let mut dst = 0; + for _ in 0..full_to_write { + out[dst..dst + VARLEN_BLOCK].copy_from_slice(&bytes[src..src + VARLEN_BLOCK]); + out[dst + VARLEN_BLOCK] = 0xFF; + src += VARLEN_BLOCK; + dst += VARLEN_BLOCK_TOTAL; + } + out[dst..dst + partial_len].copy_from_slice(&bytes[src..src + partial_len]); + for b in &mut out[dst + partial_len..dst + VARLEN_BLOCK] { + *b = 0; + } + out[dst + VARLEN_BLOCK] = partial_len as u8; +} + +/// Fused FSST → row-key kernel: bulk-decompress the code heap into one contiguous buffer (no +/// intermediate `VarBinViewArray`), then block-encode each row straight into the row-key +/// `ListView` using the stored `uncompressed_lengths` for boundaries (no size-pass walk). +fn fast_fused(fsst: &ArrayRef) -> ArrayRef { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = fsst.as_opt::().expect("FSST array"); + + // Per-row decompressed lengths are already stored — the size pass is free. + let lens_arr = view + .uncompressed_lengths() + .clone() + .execute::(&mut ctx) + .unwrap(); + let lens: Vec = match_each_integer_ptype!(lens_arr.ptype(), |P| { + lens_arr + .as_slice::

<P>() + .iter() + .map(|x| *x as usize) + .collect() + }); + + // Bulk-decompress the whole code heap once into a contiguous buffer (no VarBinView). + let heap = view.codes_bytes(); + let total: usize = lens.iter().sum(); + let decompressor = view.decompressor(); + let mut decompressed = ByteBufferMut::with_capacity(total + 7); + let n = decompressor.decompress_into(heap.as_slice(), decompressed.spare_capacity_mut()); + unsafe { decompressed.set_len(n) }; + let bytes = decompressed.as_slice(); + + // Size + offsets for the row-key ListView (lengths are free, no view walk). + let nrows = lens.len(); + let mut offsets: Vec<u32> = Vec::with_capacity(nrows); + let mut sizes: Vec<u32> = Vec::with_capacity(nrows); + let mut acc: u32 = 0; + for &l in &lens { + offsets.push(acc); + let sz = encoded_len(l); + sizes.push(sz); + acc += sz; + } + + // Block-encode every row directly into the elements buffer. No zero-init (every byte is + // written: sentinel + block body with zero-padded final block) and no Vec→Buffer copy.
+ let mut out = ByteBufferMut::with_capacity(acc as usize); + unsafe { out.set_len(acc as usize) }; + let out_slice = out.as_mut_slice(); + let mut src = 0usize; + for (i, &l) in lens.iter().enumerate() { + let pos = offsets[i] as usize; + out_slice[pos] = NON_EMPTY_SENTINEL; + if l != 0 { + block_encode(&bytes[src..src + l], &mut out_slice[pos + 1..]); + } + src += l; + } + + let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable); + let offsets_arr = + PrimitiveArray::new(Buffer::::copy_from(&offsets), Validity::NonNullable); + let sizes_arr = PrimitiveArray::new(Buffer::::copy_from(&sizes), Validity::NonNullable); + ListViewArray::try_new( + elements.into_array(), + offsets_arr.into_array(), + sizes_arr.into_array(), + Validity::NonNullable, + ) + .unwrap() + .into_array() +} + fn main() { divan::main(); } @@ -102,6 +225,16 @@ fn fsst_unpack_then_convert(bencher: divan::Bencher) { }); } +/// Fused fast path: bulk-decompress directly into the row-key block format, skipping the +/// intermediate `VarBinViewArray` and the generic row-encoder (size pass is free). +#[divan::bench] +fn fsst_fast_fused(bencher: divan::Bencher) { + let (fsst, total_bytes) = build_fsst(); + bencher + .counter(BytesCount::new(total_bytes)) + .bench_local(|| fast_fused(&fsst)); +} + /// Irreducible floor: FSST decompression alone (a direct kernel must still produce these /// bytes, since the sort key *is* the decompressed bytes). 
#[divan::bench] From 2c54e62931dd27bef50e1de35dbe2fa6742f8200 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 5 Jun 2026 08:02:46 +0000 Subject: [PATCH 16/19] vortex-row: add scatter-right fused FSST row-encode benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `fsst_fast_scatter`: keeps FSST's fast contiguous bulk decompressor but runs it into a cache-resident scratch one row-batch at a time, scattering each row into block form from cache so the decompressed bytes never round-trip through main memory. A one-time assert_arrays_eq! check confirms it produces byte-identical row keys to the straightforward fused path. Result: fast_scatter is on par with fast_fused (no speedup) — the decompressed buffer is already consumed cache-warm in the simple fused path, so avoiding the round-trip saves nothing; the workload is CPU-bound on FSST symbol decode plus block-copy. Signed-off-by: Joe Isaacs --- vortex-row/benches/fsst_row_encode.rs | 129 +++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs index e7ee9cf221e..083788ff6ab 100644 --- a/vortex-row/benches/fsst_row_encode.rs +++ b/vortex-row/benches/fsst_row_encode.rs @@ -4,7 +4,8 @@ #![expect( clippy::unwrap_used, clippy::expect_used, - clippy::cast_possible_truncation + clippy::cast_possible_truncation, + clippy::many_single_char_names )] //! 
Row-encoding an FSST-compressed string column: the only realizable strategy is @@ -36,6 +37,8 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::ListViewArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::varbin::VarBinArrayExt; +use vortex_array::assert_arrays_eq; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::match_each_integer_ptype; @@ -203,10 +206,134 @@ fn fast_fused(fsst: &ArrayRef) -> ArrayRef { .into_array() } +/// "Scatter right": keep FSST's fast contiguous bulk decompressor, but run it into a +/// cache-resident scratch one row-batch at a time, then scatter each row into block form from +/// cache. The decompressed bytes never round-trip through main memory — unlike `fast_fused`, +/// which materializes the whole 6.4 MB decompressed buffer and reads it back to block-encode. +fn fast_scatter(fsst: &ArrayRef) -> ArrayRef { + // Scratch sized to stay resident in L1/L2; each batch decompresses up to this many bytes. + const SCRATCH: usize = 16 * 1024; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = fsst.as_opt::().expect("FSST array"); + + let lens_arr = view + .uncompressed_lengths() + .clone() + .execute::(&mut ctx) + .unwrap(); + let lens: Vec = match_each_integer_ptype!(lens_arr.ptype(), |P| { + lens_arr + .as_slice::

<P>() + .iter() + .map(|x| *x as usize) + .collect() + }); + let nrows = lens.len(); + + // Per-row compressed code offsets (relative to the sliced heap start). + let codes = view.codes(); + let heap = codes.sliced_bytes(); + let code_off_arr = codes + .offsets() + .clone() + .execute::<PrimitiveArray>(&mut ctx) + .unwrap(); + let base = match_each_integer_ptype!(code_off_arr.ptype(), |P| { + code_off_arr.as_slice::

<P>()[0] as usize + }); + let code_off: Vec<usize> = match_each_integer_ptype!(code_off_arr.ptype(), |P| { + code_off_arr + .as_slice::

<P>() + .iter() + .map(|x| *x as usize - base) + .collect() + }); + + // Output sizing (free from stored lengths). + let mut offsets: Vec<u32> = Vec::with_capacity(nrows); + let mut sizes: Vec<u32> = Vec::with_capacity(nrows); + let mut acc: u32 = 0; + let mut max_row = 0usize; + for &l in &lens { + offsets.push(acc); + let sz = encoded_len(l); + sizes.push(sz); + acc += sz; + max_row = max_row.max(l); + } + let mut out = ByteBufferMut::with_capacity(acc as usize); + unsafe { out.set_len(acc as usize) }; + let out_slice = out.as_mut_slice(); + + let decompressor = view.decompressor(); + let scratch_cap = SCRATCH.max(max_row) + 8; + let mut scratch = ByteBufferMut::with_capacity(scratch_cap); + + let mut r = 0usize; + while r < nrows { + // Grow a batch until it would overflow the scratch (always at least one row). + let bs = r; + let mut batch_bytes = 0usize; + while r < nrows && (r == bs || batch_bytes + lens[r] <= SCRATCH) { + batch_bytes += lens[r]; + r += 1; + } + let be = r; + + // Decompress this batch's codes in one fast call into the cache-resident scratch. + let cslice = &heap.as_slice()[code_off[bs]..code_off[be]]; + let n = decompressor.decompress_into(cslice, scratch.spare_capacity_mut()); + unsafe { scratch.set_len(n) }; + let sbytes = scratch.as_slice(); + + // Scatter each row from cache into block form.
+ let mut local = 0usize; + for i in bs..be { + let l = lens[i]; + let pos = offsets[i] as usize; + out_slice[pos] = NON_EMPTY_SENTINEL; + if l != 0 { + block_encode(&sbytes[local..local + l], &mut out_slice[pos + 1..]); + } + local += l; + } + unsafe { scratch.set_len(0) }; + } + + let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable); + let offsets_arr = + PrimitiveArray::new(Buffer::::copy_from(&offsets), Validity::NonNullable); + let sizes_arr = PrimitiveArray::new(Buffer::::copy_from(&sizes), Validity::NonNullable); + ListViewArray::try_new( + elements.into_array(), + offsets_arr.into_array(), + sizes_arr.into_array(), + Validity::NonNullable, + ) + .unwrap() + .into_array() +} + fn main() { + // Correctness: the batched cache-resident scatter must produce identical row keys to the + // straightforward fused path. + { + let (fsst, _) = build_fsst(); + assert_arrays_eq!(fast_scatter(&fsst), fast_fused(&fsst)); + } divan::main(); } +/// "Scatter right" fused path: cache-resident batched decompress + scatter into block form. +#[divan::bench] +fn fsst_fast_scatter(bencher: divan::Bencher) { + let (fsst, total_bytes) = build_fsst(); + bencher + .counter(BytesCount::new(total_bytes)) + .bench_local(|| fast_scatter(&fsst)); +} + /// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it. 
#[divan::bench] fn fsst_unpack_then_convert(bencher: divan::Bencher) { From b3411f1b2d08d7a1b7cfbc036b23feeeb4c50399 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 5 Jun 2026 11:32:57 +0100 Subject: [PATCH 17/19] fix Signed-off-by: Joe Isaacs --- docs/specs/index.md | 1 + docs/specs/row-encoding.md | 539 +++++++++++++++++++++++++++++++++++++ vortex-row/README.md | 17 ++ vortex-row/src/codec.rs | 40 +-- vortex-row/src/encode.rs | 2 +- vortex-row/src/encoder.rs | 23 ++ vortex-row/src/lib.rs | 6 +- vortex-row/src/size.rs | 2 +- vortex-row/src/tests.rs | 40 +++ 9 files changed, 638 insertions(+), 32 deletions(-) create mode 100644 docs/specs/row-encoding.md create mode 100644 vortex-row/README.md diff --git a/docs/specs/index.md b/docs/specs/index.md index 547cd99f182..045ccb0710d 100644 --- a/docs/specs/index.md +++ b/docs/specs/index.md @@ -11,4 +11,5 @@ file-format ipc-format dtype-format scalar-format +row-encoding ``` diff --git a/docs/specs/row-encoding.md b/docs/specs/row-encoding.md new file mode 100644 index 00000000000..8fc3288f82b --- /dev/null +++ b/docs/specs/row-encoding.md @@ -0,0 +1,539 @@ +# Row Encoding Byte Sort Specification + +This document describes the byte-sortable row encoding implemented by the `vortex-row` +crate. The encoding converts one or more columnar arrays into a `ListView` array. Each +output row is a byte string, and lexicographic byte comparison of those byte strings matches +logical tuple comparison of the input values under the configured row sort options. + +This is a schema-aware row-key format. The bytes do not contain type tags, field names, or +sort options. Two encoded rows are comparable only when they were produced with the same +input schema and the same per-column `RowSortField` settings. + +The row encoding is not the Vortex file format or scalar IPC format. It is an internal +comparison representation used for sort keys and row-key operations. + +:::{warning} +The row encoding format is experimental. 
Its byte layout, supported type set, and edge-case +semantics may change between Vortex releases. Do not persist these bytes or depend on them as +a stable interchange format. +::: + +## Order Property + +For a fixed schema with columns `c0, c1, ..., cn` and per-column sort fields +`f0, f1, ..., fn`, row encoding provides this property: + +```text +encode(row_a) < encode(row_b) +``` + +if and only if tuple comparison says: + +```text +(row_a.c0, row_a.c1, ..., row_a.cn) < (row_b.c0, row_b.c1, ..., row_b.cn) +``` + +using the requested ascending or descending direction and requested null placement for each +column. + +The property is built from two rules: + +1. Each supported scalar or nested value is encoded so its bytes sort in the same order as + the value. +2. Fields are concatenated from left to right, so lexicographic byte comparison naturally + performs tuple comparison. + +## Notation + +This document uses the following notation: + +- `||` means byte concatenation. +- `BE(x)` means the fixed-width big-endian bytes of `x`. +- `!b` means `b XOR 0xFF`. +- `!bytes` means bitwise complement of every byte in `bytes`. +- `zero(n)` means `n` zero bytes. +- `ff(n)` means `n` bytes of `0xFF`. +- `width(T)` means the native byte width of fixed-width type `T`. + +`BE(x)` always emits exactly the byte width of the value being encoded, with the most +significant byte first. It is not length-prefixed and it does not drop leading zero or +leading `0xFF` bytes. The host machine's native endianness is irrelevant; encoders produce +these bytes explicitly. 
+ +For example: + +| Value and type | `BE(value)` | +| --- | --- | +| `1_u8` | `01` | +| `258_u16` | `01 02` | +| `258_u32` | `00 00 01 02` | +| `-5_i32`, before the signed sign-bit transform | `FF FF FF FB` | +| `ordered = 0x80000000_u32` | `80 00 00 00` | + +## Field Options + +Each input column has a `RowSortField`: + +```text +RowSortField { + descending: bool, + nulls_first: bool, +} +``` + +`descending` reverses the order of non-null values. `nulls_first` is independent of +`descending`, so nulls can sort before or after non-nulls in either direction. + +## Sentinel Summary + +Sentinels are single bytes that classify nullness and, for variable-width values, whether a +value is empty or non-empty. They are chosen so byte comparison can decide those categories +before comparing any value bytes. + +| Encoding family | Case | Ascending, nulls first | Descending, nulls first | Ascending, nulls last | Descending, nulls last | +| --- | --- | --- | --- | --- | --- | +| Fixed-width | Null | `0x00` | `0x00` | `0x02` | `0x02` | +| Fixed-width | Non-null | `0x01` | `0x01` | `0x01` | `0x01` | +| Variable-width | Null | `0x00` | `0x00` | `0xFF` | `0xFF` | +| Variable-width | Empty | `0x01` | `0xFE` | `0x01` | `0xFE` | +| Variable-width | Non-empty | `0x02` | `0xFD` | `0x02` | `0xFD` | + +Fixed-width sentinels are used by null, boolean, primitive, decimal, struct, and fixed-size +list values. Variable-width sentinels are used by UTF-8 and binary values. + +## Fixed-Width Sentinels + +Every fixed-width value starts with a one-byte sentinel: + +| Case | Sentinel | +| --- | --- | +| Null, `nulls_first = true` | `0x00` | +| Non-null | `0x01` | +| Null, `nulls_first = false` | `0x02` | + +The sentinel is not inverted for descending order. Only the non-null value bytes are +inverted. This keeps null placement independent from sort direction. + +For fixed-width nulls, the sentinel is followed by zero-filled value bytes. 
This gives fixed +types a constant encoded width for every row. + +## Variable-Width Sentinels + +Each UTF-8 or binary value starts with one sentinel byte marking it null, empty, or non-empty. +Separate empty and non-empty sentinels are important: they ensure the first byte decides that +category before later columns can affect comparison. + +| Case | Ascending | Descending | +| --- | --- | --- | +| Null, `nulls_first = true` | `0x00` | `0x00` | +| Empty | `0x01` | `0xFE` | +| Non-empty | `0x02` | `0xFD` | +| Null, `nulls_first = false` | `0xFF` | `0xFF` | + +The null sentinel is not inverted by descending order. Empty and non-empty sentinels are +inverted so non-null value order is reversed while null placement stays fixed. + +## Null + +`Null` values have no body: + +```text +fixed_null_sentinel +``` + +The sentinel is `0x00` for nulls-first and `0x02` for nulls-last. + +## Boolean + +Booleans are fixed-width and use one value byte: + +```text +sentinel || value_byte +``` + +For ascending order: + +| Value | Value byte | +| --- | --- | +| `false` | `0x01` | +| `true` | `0x02` | + +For descending order, the value byte is inverted: + +| Value | Value byte | +| --- | --- | +| `true` | `0xFD` | +| `false` | `0xFE` | + +Null booleans encode as: + +```text +null_sentinel || 0x00 +``` + +## Unsigned Integers + +Supported unsigned primitive types are `u8`, `u16`, `u32`, and `u64`. + +Ascending encoding: + +```text +0x01 || BE(value) +``` + +Descending encoding: + +```text +0x01 || !BE(value) +``` + +Big-endian byte order makes lexicographic byte order match numeric order for fixed-width +unsigned integers. Bitwise complement reverses that order for descending fields. + +Null unsigned integers encode as: + +```text +null_sentinel || zero(width(T)) +``` + +## Signed Integers + +Supported signed primitive PTypes are `i8`, `i16`, `i32`, and `i64`. The same signed +integer transform is also used for `i128` decimal storage. 
+ +Signed integers first flip the sign bit of their big-endian two's-complement +representation: + +```text +ordered = BE(value) +ordered[0] = ordered[0] XOR 0x80 +``` + +Ascending encoding: + +```text +0x01 || ordered +``` + +Descending encoding: + +```text +0x01 || !ordered +``` + +Flipping the sign bit maps the signed numeric range into unsigned byte order: + +```text +negative values -> 0x00..0x7F prefix range +non-negative values -> 0x80..0xFF prefix range +``` + +Null signed integers encode as: + +```text +null_sentinel || zero(width(T)) +``` + +## Floating Point + +Supported floating primitive types are `f16`, `f32`, and `f64`. + +The encoder treats the IEEE bit pattern as an unsigned integer and applies a sign-aware +transform before writing big-endian bytes. + +For a floating value with raw bits `bits`: + +```text +if sign_bit(bits) == 0: + ordered = bits XOR sign_bit_mask +else: + ordered = bits XOR all_ones +``` + +Ascending encoding: + +```text +0x01 || BE(ordered) +``` + +Descending encoding: + +```text +0x01 || !BE(ordered) +``` + +This produces a total-order-style byte ordering where negative values sort before positive +values, and `-0.0` sorts before `+0.0`. NaN values are ordered by their raw bit patterns +under the same transform; they are not canonicalized by row encoding. + +Null floats encode as: + +```text +null_sentinel || zero(width(T)) +``` + +## Decimal + +Decimals are encoded as their scaled signed integer storage value. The selected storage +width is the smallest decimal value type for the decimal precision: + +| Precision | Storage | +| --- | --- | +| `1..=2` | `i8` | +| `3..=4` | `i16` | +| `5..=9` | `i32` | +| `10..=18` | `i64` | +| `19..=38` | `i128` | + +The storage integer is encoded with the signed integer encoding described above. Decimal +columns have one precision and scale, so ordering the scaled integer storage values matches +ordering the decimal values in that column. + +`Decimal256` is not supported by row encoding. 
+ +## UTF-8 and Binary + +UTF-8 and binary values use the variable-width sentinels described above. + +Null: + +```text +varlen_null_sentinel +``` + +Empty: + +```text +varlen_empty_sentinel +``` + +Non-empty: + +```text +varlen_non_empty_sentinel || varlen_body(bytes) +``` + +For UTF-8, `bytes` are the UTF-8 bytes of the string. For binary, `bytes` are the raw binary +bytes. The byte ordering is therefore UTF-8 byte lexicographic order for strings and raw byte +lexicographic order for binary. + +### Variable-Length Body + +Non-empty variable-length values are encoded in blocks. Each block contains 32 data bytes +followed by one marker byte: + +```text +data[0..32] || marker +``` + +For ascending order: + +- Every non-final full block uses marker `0xFF`. +- The final block is padded with zeros to 32 data bytes. +- The final marker is the number of real data bytes in the final block, in `1..=32`. + +For descending order: + +- Every data byte is inverted. +- Every non-final full-block marker is `0x00`, the inverse of `0xFF`. +- The final block is padded with `0xFF`, the inverse of ascending zero padding. +- The final marker is inverted: `final_len XOR 0xFF`. + +If the input length is exactly a multiple of 32, the final block has marker `32`, and earlier +blocks, if any, use the continuation marker. + +This block structure preserves prefix order. For example, in ascending order a shorter value +that is a prefix of a longer value reaches its final marker before the longer value reaches +the continuation marker. Since final length markers in `1..=32` are less than `0xFF`, the +shorter prefix sorts first. Descending order inverts the same bytes and reverses that result. + +## Struct + +A struct is encoded as: + +```text +struct_sentinel || field_0 || field_1 || ... 
|| field_n +``` + +The outer sentinel is the fixed-width sentinel: + +- `0x01` for a non-null struct +- `0x00` or `0x02` for a null struct, depending on null placement + +For a non-null struct, each field is encoded recursively in schema order using the same +`RowSortField` as the parent struct column. + +For a null struct, the body is canonicalized so two null parent rows produce byte-equal +output even if their physical child arrays contain different values: + +- Fixed-width children contribute their fixed-width null encoding. +- Variable-width children contribute exactly one child null sentinel byte. + +A struct has fixed row width only when all of its fields have fixed row width. If any child +is variable-width, the struct is variable-width. + +## Fixed-Size List + +A fixed-size list with `N` elements is encoded as: + +```text +list_sentinel || element_0 || element_1 || ... || element_N-1 +``` + +The outer sentinel is the fixed-width sentinel: + +- `0x01` for a non-null list +- `0x00` or `0x02` for a null list, depending on null placement + +For a non-null fixed-size list, elements are encoded recursively in element order using the +same `RowSortField` as the parent list column. + +For a null fixed-size list, the body is canonicalized: + +- Fixed-width elements contribute their fixed-width null encoding, repeated `N` times. +- Variable-width elements contribute one child null sentinel byte per element. + +A fixed-size list has fixed row width only when its element type has fixed row width. + +## Nested Values + +Nested structs and fixed-size lists apply the same rules recursively. Each nullable parent +adds its own outer sentinel. Null parents canonicalize their child body before comparison can +observe underlying child values. + +## Unsupported Types + +The current row encoder rejects types for which it does not define byte-sort semantics: + +| Type | Reason | +| --- | --- | +| Variable-size `List` | No row encoding order is defined. 
| +| `Variant` | No row encoding order is defined. | +| `Union` | No row encoding order is defined. | +| `Extension` | No row encoding order is defined. | +| `Decimal256` | Encoding is not implemented. | + +The absence of these encodings is intentional. Adding one requires defining both the logical +ordering and the exact byte representation that preserves that ordering. + +Temporal extensions could be added later by normalizing them to storage arrays at the +row-encoder boundary, once the supported temporal ordering contract is made explicit. + +## Size and Output Layout + +The encoded output is a `ListView`: + +```text +elements: contiguous u8 buffer containing all row bytes +offsets: per-row start offset into elements +sizes: per-row byte length +``` + +Rows are not self-describing without their `sizes`. A variable-width field can make one row +longer than another, and the enclosing `ListView` supplies the row boundary. + +The encoder computes sizes before writing bytes: + +- Fixed-width columns contribute a constant width per row. +- Variable-width columns contribute data-dependent widths per row. +- The final `sizes` array is also used as the per-row write cursor during encoding. + +## Why Concatenation Works + +For each supported field type, the field encoder is an order embedding from logical values to +byte strings: + +```text +a < b <=> encode_field(a) < encode_field(b) +a = b <=> encode_field(a) = encode_field(b) +``` + +When two rows are compared lexicographically, the first differing byte belongs to the first +field whose encoded value differs. All preceding fields have byte-equal encodings and +therefore equal logical values. The result is the same as tuple comparison. + +Variable-width fields preserve this property because their encodings are self-delimiting for +comparison: + +- Null, empty, and non-empty values differ at the first byte. +- Non-empty values use block markers to decide prefix cases before the next field can be + compared. 
+- Row boundaries are supplied by `ListView` sizes. + +Descending order works because complementing every byte of an equal-length order-preserving +value encoding reverses its order. The variable-width encoding also complements its sentinels, +body bytes, padding, and markers for non-null values, so the same reversal applies to strings +and binary values. Null sentinels are excluded from that reversal so null placement remains +controlled solely by `nulls_first`. + +## Example Row + +This example shows one row that contains every supported encoding family. All columns use +ascending order with nulls first. + +Schema: + +```text +( + null_col: Null, + bool_col: Bool, + uint_col: U16, + int_col: I16, + float_col: F32, + decimal_col: Decimal(precision = 9, scale = 2), + utf8_col: Utf8, + binary_col: Binary, + struct_col: Struct { x: I8, y: Utf8 }, + fsl_col: FixedSizeList<U8, 3>, +) +``` + +Values: + +```text +( + null, + true, + 258_u16, + -5_i16, + 1.5_f32, + 123.45_decimal, // stored as 12345_i32 + "a", + DE AD BE EF, + { x: 1_i8, y: "" }, + [1_u8, 2_u8, 3_u8], +) +``` + +Encoded columns: + +| Column | Encoded bytes | +| --- | --- | +| `null_col` | `00` | +| `bool_col` | `01 02` | +| `uint_col` | `01 01 02` | +| `int_col` | `01 7F FB` | +| `float_col` | `01 BF C0 00 00` | +| `decimal_col` | `01 80 00 30 39` | +| `utf8_col` | `02 61 zero(31) 01` | +| `binary_col` | `02 DE AD BE EF zero(28) 04` | +| `struct_col` | `01 01 81 01` | +| `fsl_col` | `01 01 01 01 02 01 03` | + +The full row key is the concatenation of those byte strings in schema order: + +```text +00 +|| 01 02 +|| 01 01 02 +|| 01 7F FB +|| 01 BF C0 00 00 +|| 01 80 00 30 39 +|| 02 61 zero(31) 01 +|| 02 DE AD BE EF zero(28) 04 +|| 01 01 81 01 +|| 01 01 01 01 02 01 03 +``` + +Primitive examples here use one representative width per primitive family. Other widths use +the same transform and emit exactly `width(T)` value bytes. 
diff --git a/vortex-row/README.md b/vortex-row/README.md new file mode 100644 index 00000000000..e0c574eca43 --- /dev/null +++ b/vortex-row/README.md @@ -0,0 +1,17 @@ +# vortex-row + +`vortex-row` provides an experimental row-oriented byte encoder for Vortex arrays. It +produces byte strings that can be compared lexicographically to sort rows according to the +configured column ordering. + +Only supported Vortex logical types are accepted. Extension types are rejected until their +logical sort semantics are defined. + +## Experimental Format + +The row encoding byte layout is experimental. Its exact bytes, supported type set, and +edge-case semantics may change between Vortex releases. + +Do not persist row-encoded bytes or use them as a stable interchange format. They are intended +for internal sort-key and row-key operations where the encoder version, schema, and sort +options are controlled together. diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs index d0cb32ce13d..4848a750e52 100644 --- a/vortex-row/src/codec.rs +++ b/vortex-row/src/codec.rs @@ -28,13 +28,11 @@ use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; -use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::NullArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; -use vortex_array::arrays::extension::ExtensionArrayExt; use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; @@ -227,11 +225,11 @@ pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { "row encoding does not support variable-size List arrays (no well-defined ordering)" ) } - DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()), DType::Variant(_) => { vortex_bail!("row encoding does not 
support Variant arrays (no well-defined ordering)") } DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"), + dtype => vortex_bail!("row encoding does not support dtype: {dtype:?}"), } } @@ -257,7 +255,6 @@ pub(crate) fn field_size( Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?, Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?, Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?, - Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?, Canonical::List(_) => vortex_bail!( "row encoding does not support canonical List arrays: {:?}", canonical.dtype() @@ -265,6 +262,12 @@ pub(crate) fn field_size( Canonical::Variant(_) => { vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") } + unsupported => { + vortex_bail!( + "row encoding does not support canonical array: {:?}", + unsupported.dtype() + ) + } } Ok(()) } @@ -344,7 +347,6 @@ pub(crate) fn field_encode( Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?, Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?, Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?, - Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?, Canonical::List(_) => vortex_bail!( "row encoding does not support canonical List arrays: {:?}", canonical.dtype() @@ -352,6 +354,12 @@ pub(crate) fn field_encode( Canonical::Variant(_) => { vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") } + unsupported => { + vortex_bail!( + "row encoding does not support canonical array: {:?}", + unsupported.dtype() + ) + } } Ok(()) } @@ -504,16 +512,6 @@ fn add_size_fsl( Ok(()) } -fn add_size_extension( - arr: &ExtensionArray, - field: RowSortField, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let storage = 
arr.storage_array().clone().execute::(ctx)?; - field_size(&storage, field, sizes, ctx) -} - fn encode_null( arr: &NullArray, field: RowSortField, @@ -999,18 +997,6 @@ fn encode_variable_child( Ok(()) } -fn encode_extension( - arr: &ExtensionArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let storage = arr.storage_array().clone().execute::(ctx)?; - field_encode(&storage, field, row_offsets, col_offset, out, ctx) -} - /// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a /// constant within-row offset, iterating the output in `row_stride`-sized chunks so the /// compiler can drop the per-row offset/cursor indirection. diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index 46a4be778d4..04feec89415 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -41,7 +41,7 @@ use crate::size::compute_sizes; /// [`ListViewArray`] where row `i` contains the row-encoded bytes for column values /// `cols[0][i], cols[1][i], ...` concatenated left-to-right. /// -/// This scalar function is public for session registration and encoding extension work. +/// This scalar function is public for session registration and row-encoding work. /// Most callers should use [`RowEncoder`](crate::RowEncoder) rather than invoking the scalar /// function directly. 
#[derive(Clone, Debug)] diff --git a/vortex-row/src/encoder.rs b/vortex-row/src/encoder.rs index 15eeda6d2f1..7bcd3e05627 100644 --- a/vortex-row/src/encoder.rs +++ b/vortex-row/src/encoder.rs @@ -6,6 +6,7 @@ use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::arrays::ListViewArray; +use vortex_array::dtype::DType; use vortex_array::scalar_fn::ScalarFnVTable; use vortex_array::scalar_fn::VecExecutionArgs; use vortex_error::VortexResult; @@ -85,6 +86,7 @@ impl RowEncoder { }; let nrows = cols[0].len(); for (i, col) in cols.iter().enumerate() { + reject_extension_dtype(col.dtype())?; if col.len() != nrows { vortex_bail!( "RowEncoder: column {} has length {} but expected {}", @@ -98,6 +100,27 @@ impl RowEncoder { } } +fn reject_extension_dtype(dtype: &DType) -> VortexResult<()> { + match dtype { + DType::Extension(ext_dtype) => { + vortex_bail!( + "row encoding does not support Extension arrays yet: {}", + ext_dtype.id() + ) + } + DType::Struct(fields, _) => { + for field_dtype in fields.fields() { + reject_extension_dtype(&field_dtype)?; + } + } + DType::FixedSizeList(elem, ..) | DType::List(elem, _) => { + reject_extension_dtype(elem)?; + } + _ => {} + } + Ok(()) +} + /// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes /// are lexicographically comparable in the same order as a tuple comparison of the input /// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`]. diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index d921e2998e3..b36121f0da2 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -23,9 +23,9 @@ //! into the per-row slots from left to right. //! //! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to -//! 128 bits, UTF-8 and binary values, structs, fixed-size lists, and extensions whose storage -//! type is supported. Variant, union, and variable-size list arrays are rejected because this -//! 
crate does not define an ordering for them. +//! 128 bits, UTF-8 and binary values, structs, and fixed-size lists. Extension, variant, +//! union, and variable-size list arrays are rejected because this crate does not define an +//! ordering for them. mod codec; mod encode; diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index 860fe3c2a2c..9112379a6f4 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -162,7 +162,7 @@ pub(crate) fn compute_sizes( /// /// The total per-row byte size is `fixed + var`. /// -/// This scalar function is public for session registration and encoding extension work. +/// This scalar function is public for session registration and row-encoding work. /// Most callers should use [`RowEncoder::row_sizes`](crate::RowEncoder::row_sizes) rather /// than invoking the scalar function directly. #[derive(Clone, Debug)] diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs index 62e0e4cfb98..5c85c911154 100644 --- a/vortex-row/src/tests.rs +++ b/vortex-row/src/tests.rs @@ -10,10 +10,15 @@ use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::ListViewArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::listview::ListViewArrayExt; +use vortex_array::dtype::Nullability; +use vortex_array::extension::datetime::Date; +use vortex_array::extension::datetime::TimeUnit; use vortex_error::VortexResult; use crate::RowEncoder; @@ -88,6 +93,41 @@ fn primitive_u32_sort_order() -> VortexResult<()> { Ok(()) } +#[test] +fn reject_temporal_extension_dtype_early() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let storage = PrimitiveArray::from_iter([2i32, -1, 0, 7]).into_array(); + let ext_dtype = Date::new(TimeUnit::Days, 
Nullability::NonNullable).erased(); + let col = ExtensionArray::new(ext_dtype, storage).into_array(); + + let err = convert_columns(&[col], &[RowSortField::ascending()], &mut ctx) + .expect_err("temporal extensions should be rejected"); + assert!( + err.to_string().contains("Extension arrays yet"), + "expected error mentioning unsupported Extension arrays, got: {err}" + ); + Ok(()) +} + +#[test] +fn reject_nested_temporal_extension_dtype_early() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let storage = PrimitiveArray::from_iter([2i32, -1, 0, 7]).into_array(); + let ext_dtype = Date::new(TimeUnit::Days, Nullability::NonNullable).erased(); + let date_col = ExtensionArray::new(ext_dtype, storage).into_array(); + let tag_col = VarBinViewArray::from_iter_str(["d", "b", "c", "a"]).into_array(); + let struct_col = + StructArray::from_fields(&[("date", date_col), ("tag", tag_col)])?.into_array(); + + let err = convert_columns(&[struct_col], &[RowSortField::ascending()], &mut ctx) + .expect_err("nested temporal extensions should be rejected"); + assert!( + err.to_string().contains("Extension arrays yet"), + "expected error mentioning unsupported Extension arrays, got: {err}" + ); + Ok(()) +} + #[test] fn primitive_f64_sort_order() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); From a213cdd675cf80595b0076da9349c9fc7fb50943 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 5 Jun 2026 11:34:54 +0100 Subject: [PATCH 18/19] fix Signed-off-by: Joe Isaacs --- Cargo.lock | 1 - vortex-row/Cargo.toml | 5 - vortex-row/benches/fsst_row_encode.rs | 390 -------------------------- 3 files changed, 396 deletions(-) delete mode 100644 vortex-row/benches/fsst_row_encode.rs diff --git a/Cargo.lock b/Cargo.lock index bf24dafe3dd..967f0a18a09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9997,7 +9997,6 @@ dependencies = [ "vortex-array", "vortex-buffer", "vortex-error", - "vortex-fsst", "vortex-mask", "vortex-session", ] diff 
--git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml index e58a48f16e7..9222c7d6a43 100644 --- a/vortex-row/Cargo.toml +++ b/vortex-row/Cargo.toml @@ -35,12 +35,7 @@ mimalloc = { workspace = true } rand = { workspace = true } rstest = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } -vortex-fsst = { workspace = true } [[bench]] name = "row_encode" harness = false - -[[bench]] -name = "fsst_row_encode" -harness = false diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs deleted file mode 100644 index 083788ff6ab..00000000000 --- a/vortex-row/benches/fsst_row_encode.rs +++ /dev/null @@ -1,390 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -#![expect( - clippy::unwrap_used, - clippy::expect_used, - clippy::cast_possible_truncation, - clippy::many_single_char_names -)] - -//! Row-encoding an FSST-compressed string column: the only realizable strategy is -//! "unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it), -//! because FSST is **not order-preserving** — its 1-byte codes are assigned by compression -//! gain, not by value, so the compressed bytes cannot be compared lexicographically. A -//! hypothetical "direct" kernel could only *fuse* decompression with row-key emission; it -//! still has to expand every symbol. -//! -//! These benchmarks measure the full path and its two phases so the fusion opportunity is -//! quantifiable: -//! * `fsst_unpack_then_convert` — decompress + row-encode (the status quo). -//! * `fsst_decompress_only` — decompress alone (the irreducible floor: a direct kernel -//! must still produce these bytes). -//! * `plain_row_encode_only` — row-encode an already-decompressed `VarBinView` (the part -//! a fused kernel would overlap with decompression; its writes into the intermediate -//! buffer + views are what fusion removes). 
- -use divan::counter::BytesCount; -use mimalloc::MiMalloc; -use rand::RngExt; -use rand::SeedableRng; -use rand::rngs::StdRng; -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; -use vortex_array::VortexSessionExecute; -use vortex_array::arrays::ListViewArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::VarBinArray; -use vortex_array::arrays::varbin::VarBinArrayExt; -use vortex_array::assert_arrays_eq; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::match_each_integer_ptype; -use vortex_array::validity::Validity; -use vortex_buffer::Buffer; -use vortex_buffer::ByteBufferMut; -use vortex_fsst::FSST; -use vortex_fsst::FSSTArrayExt; -use vortex_fsst::fsst_compress; -use vortex_fsst::fsst_train_compressor; -use vortex_row::RowEncoder; - -#[global_allocator] -static GLOBAL: MiMalloc = MiMalloc; - -const N: usize = 100_000; -const AVG_LEN: usize = 64; -const UNIQUE_CHARS: u8 = 8; - -/// Generate compressible, multi-block (>32 byte) strings over a small alphabet so FSST finds -/// a strong symbol table — the regime where a direct kernel would matter most. 
-fn generate_strings() -> (VarBinArray, u64) { - let mut rng = StdRng::seed_from_u64(0); - let mut strings = Vec::with_capacity(N); - let mut total_bytes: u64 = 0; - for _ in 0..N { - let len = AVG_LEN * rng.random_range(50..=150) / 100; - total_bytes += len as u64; - let s = (0..len) - .map(|_| rng.random_range(b'a'..(b'a' + UNIQUE_CHARS)) as char) - .collect::() - .into_bytes(); - strings.push(Some(s.into_boxed_slice())); - } - let arr = VarBinArray::from_iter(strings, DType::Binary(Nullability::NonNullable)); - (arr, total_bytes) -} - -fn build_fsst() -> (ArrayRef, u64) { - let (arr, total_bytes) = generate_strings(); - let compressor = fsst_train_compressor(&arr); - let len = arr.len(); - let dtype = arr.dtype().clone(); - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let fsst = fsst_compress(arr, len, &dtype, &compressor, &mut ctx).into_array(); - (fsst, total_bytes) -} - -fn decompress(fsst: &ArrayRef) -> ArrayRef { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - fsst.clone() - .execute::(&mut ctx) - .unwrap() - .into_array() -} - -const VARLEN_BLOCK: usize = 32; -const VARLEN_BLOCK_TOTAL: usize = 33; -// Sentinel for a non-empty varlen value (ascending, non-null) — value is irrelevant to timing. -const NON_EMPTY_SENTINEL: u8 = 0x02; - -/// Encoded row-key length for a non-empty value of `len` decompressed bytes: a leading -/// sentinel plus `ceil(len/32)` 32-byte blocks, each followed by a continuation/length byte. -fn encoded_len(len: usize) -> u32 { - if len == 0 { - 1 - } else { - 1 + (len.div_ceil(VARLEN_BLOCK) as u32) * VARLEN_BLOCK_TOTAL as u32 - } -} - -/// Block-encode `bytes` (ascending) into `out`, matching vortex-row's varlen body format. 
-fn block_encode(bytes: &[u8], out: &mut [u8]) { - let len = bytes.len(); - let full = len / VARLEN_BLOCK; - let partial = len % VARLEN_BLOCK; - let (full_to_write, partial_len) = if partial == 0 { - (full - 1, VARLEN_BLOCK) - } else { - (full, partial) - }; - let mut src = 0; - let mut dst = 0; - for _ in 0..full_to_write { - out[dst..dst + VARLEN_BLOCK].copy_from_slice(&bytes[src..src + VARLEN_BLOCK]); - out[dst + VARLEN_BLOCK] = 0xFF; - src += VARLEN_BLOCK; - dst += VARLEN_BLOCK_TOTAL; - } - out[dst..dst + partial_len].copy_from_slice(&bytes[src..src + partial_len]); - for b in &mut out[dst + partial_len..dst + VARLEN_BLOCK] { - *b = 0; - } - out[dst + VARLEN_BLOCK] = partial_len as u8; -} - -/// Fused FSST → row-key kernel: bulk-decompress the code heap into one contiguous buffer (no -/// intermediate `VarBinViewArray`), then block-encode each row straight into the row-key -/// `ListView` using the stored `uncompressed_lengths` for boundaries (no size-pass walk). -fn fast_fused(fsst: &ArrayRef) -> ArrayRef { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let view = fsst.as_opt::().expect("FSST array"); - - // Per-row decompressed lengths are already stored — the size pass is free. - let lens_arr = view - .uncompressed_lengths() - .clone() - .execute::(&mut ctx) - .unwrap(); - let lens: Vec = match_each_integer_ptype!(lens_arr.ptype(), |P| { - lens_arr - .as_slice::

() - .iter() - .map(|x| *x as usize) - .collect() - }); - - // Bulk-decompress the whole code heap once into a contiguous buffer (no VarBinView). - let heap = view.codes_bytes(); - let total: usize = lens.iter().sum(); - let decompressor = view.decompressor(); - let mut decompressed = ByteBufferMut::with_capacity(total + 7); - let n = decompressor.decompress_into(heap.as_slice(), decompressed.spare_capacity_mut()); - unsafe { decompressed.set_len(n) }; - let bytes = decompressed.as_slice(); - - // Size + offsets for the row-key ListView (lengths are free, no view walk). - let nrows = lens.len(); - let mut offsets: Vec = Vec::with_capacity(nrows); - let mut sizes: Vec = Vec::with_capacity(nrows); - let mut acc: u32 = 0; - for &l in &lens { - offsets.push(acc); - let sz = encoded_len(l); - sizes.push(sz); - acc += sz; - } - - // Block-encode every row directly into the elements buffer. No zero-init (every byte is - // written: sentinel + block body with zero-padded final block) and no Vec→Buffer copy. 
- let mut out = ByteBufferMut::with_capacity(acc as usize); - unsafe { out.set_len(acc as usize) }; - let out_slice = out.as_mut_slice(); - let mut src = 0usize; - for (i, &l) in lens.iter().enumerate() { - let pos = offsets[i] as usize; - out_slice[pos] = NON_EMPTY_SENTINEL; - if l != 0 { - block_encode(&bytes[src..src + l], &mut out_slice[pos + 1..]); - } - src += l; - } - - let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable); - let offsets_arr = - PrimitiveArray::new(Buffer::::copy_from(&offsets), Validity::NonNullable); - let sizes_arr = PrimitiveArray::new(Buffer::::copy_from(&sizes), Validity::NonNullable); - ListViewArray::try_new( - elements.into_array(), - offsets_arr.into_array(), - sizes_arr.into_array(), - Validity::NonNullable, - ) - .unwrap() - .into_array() -} - -/// "Scatter right": keep FSST's fast contiguous bulk decompressor, but run it into a -/// cache-resident scratch one row-batch at a time, then scatter each row into block form from -/// cache. The decompressed bytes never round-trip through main memory — unlike `fast_fused`, -/// which materializes the whole 6.4 MB decompressed buffer and reads it back to block-encode. -fn fast_scatter(fsst: &ArrayRef) -> ArrayRef { - // Scratch sized to stay resident in L1/L2; each batch decompresses up to this many bytes. - const SCRATCH: usize = 16 * 1024; - - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let view = fsst.as_opt::().expect("FSST array"); - - let lens_arr = view - .uncompressed_lengths() - .clone() - .execute::(&mut ctx) - .unwrap(); - let lens: Vec = match_each_integer_ptype!(lens_arr.ptype(), |P| { - lens_arr - .as_slice::

() - .iter() - .map(|x| *x as usize) - .collect() - }); - let nrows = lens.len(); - - // Per-row compressed code offsets (relative to the sliced heap start). - let codes = view.codes(); - let heap = codes.sliced_bytes(); - let code_off_arr = codes - .offsets() - .clone() - .execute::(&mut ctx) - .unwrap(); - let base = match_each_integer_ptype!(code_off_arr.ptype(), |P| { - code_off_arr.as_slice::

()[0] as usize - }); - let code_off: Vec = match_each_integer_ptype!(code_off_arr.ptype(), |P| { - code_off_arr - .as_slice::

() - .iter() - .map(|x| *x as usize - base) - .collect() - }); - - // Output sizing (free from stored lengths). - let mut offsets: Vec = Vec::with_capacity(nrows); - let mut sizes: Vec = Vec::with_capacity(nrows); - let mut acc: u32 = 0; - let mut max_row = 0usize; - for &l in &lens { - offsets.push(acc); - let sz = encoded_len(l); - sizes.push(sz); - acc += sz; - max_row = max_row.max(l); - } - let mut out = ByteBufferMut::with_capacity(acc as usize); - unsafe { out.set_len(acc as usize) }; - let out_slice = out.as_mut_slice(); - - let decompressor = view.decompressor(); - let scratch_cap = SCRATCH.max(max_row) + 8; - let mut scratch = ByteBufferMut::with_capacity(scratch_cap); - - let mut r = 0usize; - while r < nrows { - // Grow a batch until it would overflow the scratch (always at least one row). - let bs = r; - let mut batch_bytes = 0usize; - while r < nrows && (r == bs || batch_bytes + lens[r] <= SCRATCH) { - batch_bytes += lens[r]; - r += 1; - } - let be = r; - - // Decompress this batch's codes in one fast call into the cache-resident scratch. - let cslice = &heap.as_slice()[code_off[bs]..code_off[be]]; - let n = decompressor.decompress_into(cslice, scratch.spare_capacity_mut()); - unsafe { scratch.set_len(n) }; - let sbytes = scratch.as_slice(); - - // Scatter each row from cache into block form. 
- let mut local = 0usize; - for i in bs..be { - let l = lens[i]; - let pos = offsets[i] as usize; - out_slice[pos] = NON_EMPTY_SENTINEL; - if l != 0 { - block_encode(&sbytes[local..local + l], &mut out_slice[pos + 1..]); - } - local += l; - } - unsafe { scratch.set_len(0) }; - } - - let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable); - let offsets_arr = - PrimitiveArray::new(Buffer::::copy_from(&offsets), Validity::NonNullable); - let sizes_arr = PrimitiveArray::new(Buffer::::copy_from(&sizes), Validity::NonNullable); - ListViewArray::try_new( - elements.into_array(), - offsets_arr.into_array(), - sizes_arr.into_array(), - Validity::NonNullable, - ) - .unwrap() - .into_array() -} - -fn main() { - // Correctness: the batched cache-resident scatter must produce identical row keys to the - // straightforward fused path. - { - let (fsst, _) = build_fsst(); - assert_arrays_eq!(fast_scatter(&fsst), fast_fused(&fsst)); - } - divan::main(); -} - -/// "Scatter right" fused path: cache-resident batched decompress + scatter into block form. -#[divan::bench] -fn fsst_fast_scatter(bencher: divan::Bencher) { - let (fsst, total_bytes) = build_fsst(); - bencher - .counter(BytesCount::new(total_bytes)) - .bench_local(|| fast_scatter(&fsst)); -} - -/// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it. -#[divan::bench] -fn fsst_unpack_then_convert(bencher: divan::Bencher) { - let (fsst, total_bytes) = build_fsst(); - let encoder = RowEncoder::default(); - bencher - .counter(BytesCount::new(total_bytes)) - .bench_local(|| { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let decoded = fsst - .clone() - .execute::(&mut ctx) - .unwrap() - .into_array(); - encoder.encode(&[decoded], &mut ctx).unwrap() - }); -} - -/// Fused fast path: bulk-decompress directly into the row-key block format, skipping the -/// intermediate `VarBinViewArray` and the generic row-encoder (size pass is free). 
-#[divan::bench] -fn fsst_fast_fused(bencher: divan::Bencher) { - let (fsst, total_bytes) = build_fsst(); - bencher - .counter(BytesCount::new(total_bytes)) - .bench_local(|| fast_fused(&fsst)); -} - -/// Irreducible floor: FSST decompression alone (a direct kernel must still produce these -/// bytes, since the sort key *is* the decompressed bytes). -#[divan::bench] -fn fsst_decompress_only(bencher: divan::Bencher) { - let (fsst, total_bytes) = build_fsst(); - bencher - .counter(BytesCount::new(total_bytes)) - .bench_local(|| decompress(&fsst)); -} - -/// Row-encode an already-decompressed `VarBinView`. The writes into the decompressed buffer + -/// views that precede this step are what a fused direct kernel would eliminate. -#[divan::bench] -fn plain_row_encode_only(bencher: divan::Bencher) { - let (fsst, total_bytes) = build_fsst(); - let decoded = decompress(&fsst); - let encoder = RowEncoder::default(); - bencher - .counter(BytesCount::new(total_bytes)) - .bench_local(|| { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - encoder - .encode(std::slice::from_ref(&decoded), &mut ctx) - .unwrap() - }); -} From 9e2f1432a84adf2a1668483256c43f4ca4f8c76b Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 5 Jun 2026 11:46:17 +0100 Subject: [PATCH 19/19] add row encoder order fuzzer Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + fuzz/Cargo.toml | 9 + fuzz/fuzz_targets/row_order.rs | 18 ++ fuzz/src/lib.rs | 3 + fuzz/src/row_order.rs | 354 +++++++++++++++++++++++++++++++++ 5 files changed, 385 insertions(+) create mode 100644 fuzz/fuzz_targets/row_order.rs create mode 100644 fuzz/src/row_order.rs diff --git a/Cargo.lock b/Cargo.lock index 967f0a18a09..1a54b9285e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9745,6 +9745,7 @@ dependencies = [ "vortex-fsst", "vortex-io", "vortex-mask", + "vortex-row", "vortex-runend", "vortex-session", "vortex-utils", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index e2d05b706f9..c52b9d3b1cc 100644 --- a/fuzz/Cargo.toml +++ 
b/fuzz/Cargo.toml @@ -39,6 +39,7 @@ vortex-error = { workspace = true } vortex-fsst = { workspace = true } vortex-io = { workspace = true } vortex-mask = { workspace = true } +vortex-row = { workspace = true } vortex-runend = { workspace = true, features = ["arbitrary"] } vortex-session = { workspace = true } vortex-utils = { workspace = true } @@ -97,6 +98,14 @@ path = "fuzz_targets/fsst_like.rs" test = false required-features = ["native"] +[[bin]] +bench = false +doc = false +name = "row_order" +path = "fuzz_targets/row_order.rs" +test = false +required-features = ["native"] + [[bin]] bench = false doc = false diff --git a/fuzz/fuzz_targets/row_order.rs b/fuzz/fuzz_targets/row_order.rs new file mode 100644 index 00000000000..758774fe6c7 --- /dev/null +++ b/fuzz/fuzz_targets/row_order.rs @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![no_main] + +use libfuzzer_sys::Corpus; +use libfuzzer_sys::fuzz_target; +use vortex_error::vortex_panic; +use vortex_fuzz::FuzzRowOrder; +use vortex_fuzz::run_row_order_fuzz; + +fuzz_target!(|fuzz: FuzzRowOrder| -> Corpus { + match run_row_order_fuzz(fuzz) { + Ok(true) => Corpus::Keep, + Ok(false) => Corpus::Reject, + Err(e) => vortex_panic!("{e}"), + } +}); diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs index b0abf660045..43ff5a74629 100644 --- a/fuzz/src/lib.rs +++ b/fuzz/src/lib.rs @@ -7,6 +7,7 @@ mod array; pub mod compress; pub mod error; pub mod fsst_like; +pub mod row_order; // File module only available for native builds (requires vortex-file which uses tokio) #[cfg(not(target_arch = "wasm32"))] @@ -31,6 +32,8 @@ pub use fsst_like::run_fsst_like_fuzz; pub use gpu::FuzzCompressGpu; #[cfg(feature = "cuda")] pub use gpu::run_compress_gpu; +pub use row_order::FuzzRowOrder; +pub use row_order::run_row_order_fuzz; pub const FUZZ_ARRAY_MAX_LEN: usize = 2048; pub const FUZZ_FILE_ARRAY_MAX_LEN: usize = 16_384; diff --git a/fuzz/src/row_order.rs 
b/fuzz/src/row_order.rs new file mode 100644 index 00000000000..91fd962e7cf --- /dev/null +++ b/fuzz/src/row_order.rs @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::cmp::Ordering; +use std::sync::Arc; + +use arbitrary::Arbitrary; +use arbitrary::Result; +use arbitrary::Unstructured; +use vortex_array::ArrayRef; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::arbitrary::ArbitraryArray; +use vortex_array::arrays::arbitrary::ArbitraryArrayConfig; +use vortex_array::arrays::arbitrary::ArbitraryWith; +use vortex_array::arrays::listview::ListViewArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::DecimalDType; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::PType; +use vortex_array::dtype::StructFields; +use vortex_array::scalar::Scalar; +use vortex_array::scalar::ScalarValue; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_row::RowEncoder; +use vortex_row::RowSortField; + +use crate::SESSION; +use crate::error::Backtrace; +use crate::error::VortexFuzzError; +use crate::error::VortexFuzzResult; + +const MAX_COLUMNS: usize = 4; +const MAX_ROWS_PER_SIDE: usize = 32; +const MAX_NESTING_DEPTH: u8 = 2; +const MAX_STRUCT_FIELDS: usize = 3; +const MAX_FIXED_SIZE_LIST_LEN: u32 = 3; + +#[derive(Debug)] +pub struct FuzzRowOrder { + left_cols: Vec, + right_cols: Vec, + sort_fields: Vec, +} + +impl<'a> Arbitrary<'a> for FuzzRowOrder { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + let column_count = u.int_in_range(1..=MAX_COLUMNS)?; + let left_len = u.int_in_range(1..=MAX_ROWS_PER_SIDE)?; + let right_len = u.int_in_range(1..=MAX_ROWS_PER_SIDE)?; + + let mut left_cols = Vec::with_capacity(column_count); + let mut right_cols = Vec::with_capacity(column_count); + let mut sort_fields = 
Vec::with_capacity(column_count); + + for _ in 0..column_count { + let dtype = random_supported_dtype(u, MAX_NESTING_DEPTH)?; + left_cols.push(random_array(u, dtype.clone(), left_len)?); + right_cols.push(random_array(u, dtype, right_len)?); + sort_fields.push(RowSortField::new(u.arbitrary()?, u.arbitrary()?)); + } + + Ok(Self { + left_cols, + right_cols, + sort_fields, + }) + } +} + +#[expect(clippy::result_large_err)] +pub fn run_row_order_fuzz(fuzz: FuzzRowOrder) -> VortexFuzzResult { + run_row_order_fuzz_inner(fuzz) + .map_err(|err| VortexFuzzError::VortexError(err, Backtrace::capture())) +} + +fn run_row_order_fuzz_inner(fuzz: FuzzRowOrder) -> VortexResult { + let FuzzRowOrder { + left_cols, + right_cols, + sort_fields, + } = fuzz; + + let mut ctx = SESSION.create_execution_ctx(); + let encoder = RowEncoder::new(sort_fields.iter().copied()); + let left_rows = collect_row_bytes(&encoder.encode(&left_cols, &mut ctx)?, &mut ctx)?; + let right_rows = collect_row_bytes(&encoder.encode(&right_cols, &mut ctx)?, &mut ctx)?; + + for (left_idx, left_bytes) in left_rows.iter().enumerate() { + for (right_idx, right_bytes) in right_rows.iter().enumerate() { + let array_order = compare_rows( + &left_cols, + left_idx, + &right_cols, + right_idx, + &sort_fields, + &mut ctx, + )?; + let row_order = left_bytes.cmp(right_bytes); + if array_order != row_order { + vortex_bail!( + "row-order mismatch comparing left row {} to right row {}: \ + array order {:?}, row-byte order {:?}, dtypes {:?}, sort fields {:?}, \ + left bytes {:?}, right bytes {:?}", + left_idx, + right_idx, + array_order, + row_order, + left_cols.iter().map(|col| col.dtype()).collect::>(), + sort_fields, + left_bytes, + right_bytes + ); + } + } + } + + Ok(true) +} + +fn collect_row_bytes( + encoded: &vortex_array::arrays::ListViewArray, + ctx: &mut vortex_array::ExecutionCtx, +) -> VortexResult>> { + (0..encoded.len()) + .map(|row_idx| { + let row = encoded.list_elements_at(row_idx)?; + let row = 
row.execute::(ctx)?; + Ok(row.as_slice::().to_vec()) + }) + .collect() +} + +fn compare_rows( + left_cols: &[ArrayRef], + left_idx: usize, + right_cols: &[ArrayRef], + right_idx: usize, + sort_fields: &[RowSortField], + ctx: &mut vortex_array::ExecutionCtx, +) -> VortexResult { + for ((left_col, right_col), field) in left_cols.iter().zip(right_cols).zip(sort_fields) { + let left = left_col.execute_scalar(left_idx, ctx)?; + let right = right_col.execute_scalar(right_idx, ctx)?; + match compare_scalar(&left, &right, *field)? { + Ordering::Equal => {} + ordering => return Ok(ordering), + } + } + + Ok(Ordering::Equal) +} + +fn compare_scalar(left: &Scalar, right: &Scalar, field: RowSortField) -> VortexResult { + if !left.dtype().eq_ignore_nullability(right.dtype()) { + vortex_bail!( + "cannot compare row scalars with different dtypes: {} vs {}", + left.dtype(), + right.dtype() + ); + } + + compare_scalar_values(left.dtype(), left.value(), right.value(), field) +} + +fn compare_scalar_values( + dtype: &DType, + left: Option<&ScalarValue>, + right: Option<&ScalarValue>, + field: RowSortField, +) -> VortexResult { + let (Some(left), Some(right)) = (left, right) else { + return Ok(compare_nulls(left.is_none(), right.is_none(), field)); + }; + + match dtype { + DType::Null => Ok(Ordering::Equal), + DType::Struct(fields, _) => compare_struct_values(fields, left, right, field), + DType::FixedSizeList(element_dtype, list_size, _) => { + compare_fixed_size_list_values(element_dtype, *list_size, left, right, field) + } + DType::List(..) 
| DType::Variant(_) | DType::Union(_) | DType::Extension(_) => { + vortex_bail!("row-order fuzzer generated unsupported dtype: {dtype}") + } + _ => compare_leaf_values(dtype, left, right, field), + } +} + +fn compare_nulls(left_is_null: bool, right_is_null: bool, field: RowSortField) -> Ordering { + match (left_is_null, right_is_null) { + (true, true) | (false, false) => Ordering::Equal, + (true, false) => { + if field.nulls_first { + Ordering::Less + } else { + Ordering::Greater + } + } + (false, true) => { + if field.nulls_first { + Ordering::Greater + } else { + Ordering::Less + } + } + } +} + +fn compare_struct_values( + fields: &StructFields, + left: &ScalarValue, + right: &ScalarValue, + field: RowSortField, +) -> VortexResult { + let (ScalarValue::Tuple(left_fields), ScalarValue::Tuple(right_fields)) = (left, right) else { + vortex_bail!("struct dtype expected tuple scalar values"); + }; + if left_fields.len() != fields.nfields() || right_fields.len() != fields.nfields() { + vortex_bail!( + "struct scalar field count mismatch: expected {}, got {} and {}", + fields.nfields(), + left_fields.len(), + right_fields.len() + ); + } + + for ((field_dtype, left_value), right_value) in + fields.fields().zip(left_fields).zip(right_fields) + { + match compare_scalar_values( + &field_dtype, + left_value.as_ref(), + right_value.as_ref(), + field, + )? 
{ + Ordering::Equal => {} + ordering => return Ok(ordering), + } + } + + Ok(Ordering::Equal) +} + +fn compare_fixed_size_list_values( + element_dtype: &DType, + list_size: u32, + left: &ScalarValue, + right: &ScalarValue, + field: RowSortField, +) -> VortexResult { + let (ScalarValue::Tuple(left_elements), ScalarValue::Tuple(right_elements)) = (left, right) + else { + vortex_bail!("fixed-size list dtype expected tuple scalar values"); + }; + let expected_len = list_size as usize; + if left_elements.len() != expected_len || right_elements.len() != expected_len { + vortex_bail!( + "fixed-size list scalar length mismatch: expected {}, got {} and {}", + expected_len, + left_elements.len(), + right_elements.len() + ); + } + + for (left_value, right_value) in left_elements.iter().zip(right_elements) { + match compare_scalar_values( + element_dtype, + left_value.as_ref(), + right_value.as_ref(), + field, + )? { + Ordering::Equal => {} + ordering => return Ok(ordering), + } + } + + Ok(Ordering::Equal) +} + +fn compare_leaf_values( + dtype: &DType, + left: &ScalarValue, + right: &ScalarValue, + field: RowSortField, +) -> VortexResult { + let left = Scalar::try_new(dtype.clone(), Some(left.clone()))?; + let right = Scalar::try_new(dtype.clone(), Some(right.clone()))?; + let ordering = left.partial_cmp(&right).ok_or_else(|| { + vortex_err!( + "scalar comparison returned None for matching row-order dtype {}", + dtype + ) + })?; + + Ok(if field.descending { + ordering.reverse() + } else { + ordering + }) +} + +fn random_array(u: &mut Unstructured<'_>, dtype: DType, len: usize) -> Result { + Ok(ArbitraryArray::arbitrary_with_config( + u, + &ArbitraryArrayConfig { + dtype: Some(dtype), + len: len..=len, + }, + )? + .0) +} + +fn random_supported_dtype(u: &mut Unstructured<'_>, depth: u8) -> Result { + let max_kind = if depth == 0 { 5 } else { 7 }; + Ok(match u.int_in_range(0..=max_kind)? 
{ + 0 => DType::Null, + 1 => DType::Bool(u.arbitrary()?), + 2 => DType::Primitive(PType::arbitrary(u)?, u.arbitrary()?), + 3 => DType::Decimal(random_supported_decimal_dtype(u)?, u.arbitrary()?), + 4 => DType::Utf8(u.arbitrary()?), + 5 => DType::Binary(u.arbitrary()?), + 6 => DType::Struct( + random_supported_struct_fields(u, depth - 1)?, + u.arbitrary()?, + ), + 7 => DType::FixedSizeList( + Arc::new(random_supported_dtype(u, depth - 1)?), + u.int_in_range(0..=MAX_FIXED_SIZE_LIST_LEN)?, + u.arbitrary()?, + ), + _ => unreachable!("dtype kind range is bounded"), + }) +} + +fn random_supported_decimal_dtype(u: &mut Unstructured<'_>) -> Result { + let precision = u.int_in_range(1..=38)?; + let scale = u.int_in_range(-18..=precision as i8)?; + Ok(DecimalDType::new(precision, scale)) +} + +fn random_supported_struct_fields(u: &mut Unstructured<'_>, depth: u8) -> Result { + let field_count = u.int_in_range(0..=MAX_STRUCT_FIELDS)?; + let names = (0..field_count) + .map(|idx| FieldName::from(format!("f{idx}"))) + .collect::>(); + let dtypes = (0..field_count) + .map(|_| random_supported_dtype(u, depth)) + .collect::>>()?; + + Ok(StructFields::new(FieldNames::from(names), dtypes)) +}