From 4a30a9ce20113d75f3b138ca535640684417e3d0 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 14:28:16 +0000
Subject: [PATCH 01/19] feat(vortex-row): add row-oriented byte encoder crate

Adds `vortex-row`, which encodes N columnar arrays into a single
byte-comparable `ListView<u8>` (the Vortex analogue of arrow-row) for use as
sort/row keys. Encoding runs as two scalar functions behind the `RowEncoder`
API: a `RowSize` sizing/classification pass and a `RowEncode` pass that
allocates one contiguous buffer and writes each column left-to-right into its
per-row slot. Per-column ordering (`RowSortField`) controls ascending/
descending and null placement.

Includes the byte codec for fixed-width, varlen, and nested canonical types,
the `convert_columns`/`compute_row_sizes` helpers, round-trip + invariant
tests, and arrow-row-baselined throughput benches.

The crate is marked `publish = false` for now, so no public-api.lock is
tracked.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                       |  20 +
 Cargo.toml                       |   3 +
 vortex-row/Cargo.toml            |  41 ++
 vortex-row/benches/row_encode.rs | 176 ++++++
 vortex-row/src/codec.rs          | 997 +++++++++++++++++++++++++++++++
 vortex-row/src/encode.rs         | 193 ++++++
 vortex-row/src/encoder.rs        | 138 +++++
 vortex-row/src/lib.rs            |  60 ++
 vortex-row/src/options.rs        | 193 ++++++
 vortex-row/src/size.rs           | 216 +++++++
 vortex-row/src/tests.rs          | 575 ++++++++++++++++++
 11 files changed, 2612 insertions(+)
 create mode 100644 vortex-row/Cargo.toml
 create mode 100644 vortex-row/benches/row_encode.rs
 create mode 100644 vortex-row/src/codec.rs
 create mode 100644 vortex-row/src/encode.rs
 create mode 100644 vortex-row/src/encoder.rs
 create mode 100644 vortex-row/src/lib.rs
 create mode 100644 vortex-row/src/options.rs
 create mode 100644 vortex-row/src/size.rs
 create mode 100644 vortex-row/src/tests.rs
diff --git a/Cargo.lock b/Cargo.lock
index 9189591e620..967f0a18a09 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9981,6 +9981,26 @@ dependencies = [
  "vortex-tui",
 ]
 
+[[package]]
+name = "vortex-row"
+version = "0.1.0"
+dependencies = [
+ "arrow-array",
+ "arrow-row",
+ "arrow-schema",
+ "bytes",
+ "codspeed-divan-compat",
+ "mimalloc",
+ "rand 0.10.1",
+ "rstest",
+ "smallvec",
+ "vortex-array",
+ "vortex-buffer",
+ "vortex-error",
+ "vortex-mask",
+ "vortex-session",
+]
+
 [[package]]
 name = "vortex-runend"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index e3c3cbae67e..dd3aabd9f4a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ members = [
     "vortex-io",
     "vortex-proto",
     "vortex-array",
+    "vortex-row",
     "vortex-tensor",
     "vortex-turboquant",
     "vortex-compressor",
@@ -104,6 +105,7 @@ arrow-cast = "58"
 arrow-data = "58"
 arrow-ipc = "58"
 arrow-ord = "58"
+arrow-row = "58"
 arrow-schema = "58"
 arrow-select = "58"
 arrow-string = "58"
@@ -295,6 +297,7 @@ vortex-onpair = { version = "0.1.0", path = "./encodings/experimental/onpair", d
 vortex-parquet-variant = { version = "0.1.0", path = "./encodings/parquet-variant" }
 vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false }
 vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false }
+vortex-row = { version = "0.1.0", path = "./vortex-row", default-features = false }
 vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false }
 vortex-scan = { version = "0.1.0", path = "./vortex-scan", default-features = false }
 vortex-sequence = { version = "0.1.0", path = "encodings/sequence", default-features = false }
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
new file mode 100644
index 00000000000..9222c7d6a43
--- /dev/null
+++ b/vortex-row/Cargo.toml
@@ -0,0 +1,41 @@
+[package]
+name = "vortex-row"
+authors = { workspace = true }
+categories = { workspace = true }
+description = "Row-oriented byte encoder for Vortex arrays, analogous to arrow-row."
+edition = { workspace = true }
+homepage = { workspace = true }
+include = { workspace = true }
+keywords = { workspace = true }
+license = { workspace = true }
+publish = false
+readme = { workspace = true }
+repository = { workspace = true }
+rust-version = { workspace = true }
+version = { workspace = true }
+
+[lints]
+workspace = true
+
+[dependencies]
+bytes = { workspace = true }
+smallvec = { workspace = true }
+vortex-array = { workspace = true }
+vortex-buffer = { workspace = true }
+vortex-error = { workspace = true }
+vortex-mask = { workspace = true }
+vortex-session = { workspace = true }
+
+[dev-dependencies]
+arrow-array = { workspace = true }
+arrow-row = { workspace = true }
+arrow-schema = { workspace = true }
+divan = { workspace = true }
+mimalloc = { workspace = true }
+rand = { workspace = true }
+rstest = { workspace = true }
+vortex-array = { workspace = true, features = ["_test-harness"] }
+
+[[bench]]
+name = "row_encode"
+harness = false
diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
new file mode 100644
index 00000000000..07493d6ad48
--- /dev/null
+++ b/vortex-row/benches/row_encode.rs
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![expect(
+    clippy::unwrap_used,
+    clippy::clone_on_ref_ptr,
+    clippy::cloned_ref_to_slice_refs,
+    clippy::redundant_clone
+)]
+
+//! Row-encode throughput benchmarks comparing `arrow-row` against Vortex's [`RowEncoder`]
+//! for the core canonical scenarios: a primitive i64 column, a Utf8 column, and a
+//! mixed-field struct.
+
+use std::sync::Arc;
+
+use arrow_array::Int64Array;
+use arrow_array::StringArray;
+use arrow_array::StructArray as ArrowStructArray;
+use arrow_row::RowConverter;
+use arrow_row::SortField as ArrowSortField;
+use arrow_schema::DataType;
+use arrow_schema::Field;
+use divan::counter::BytesCount;
+use mimalloc::MiMalloc;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::distr::Alphanumeric;
+use rand::rngs::StdRng;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_row::RowEncoder;
+
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+
+const N: usize = 100_000;
+
+fn main() {
+    divan::main(); // hand control to divan, which runs the `#[divan::bench]` functions
+}
+
+fn gen_i64(n: usize, seed: u64) -> Vec<i64> {
+    let mut source = StdRng::seed_from_u64(seed);
+    let mut values = Vec::with_capacity(n);
+    values.extend((0..n).map(|_| source.random_range(i64::MIN..i64::MAX)));
+    values
+}
+
+fn gen_words(n: usize, mean_len: usize, seed: u64) -> Vec<String> {
+    let rng = &mut StdRng::seed_from_u64(seed);
+    // Word lengths are drawn from `mean_len - 4 ..= mean_len + 4` (lower bound clamped at 0).
+    let (shortest, longest) = (mean_len.saturating_sub(4), mean_len + 4);
+    let mut words = Vec::with_capacity(n);
+    for _ in 0..n {
+        let len = rng.random_range(shortest..=longest);
+        let letters = rng.sample_iter(&Alphanumeric).take(len);
+        words.push(letters.map(char::from).collect::<String>());
+    }
+    words
+}
+
+// ---------- primitive_i64 ----------
+
+#[divan::bench]
+fn primitive_i64_arrow_row(bencher: divan::Bencher) {
+    let values = gen_i64(N, 0);
+    // One sentinel byte plus eight value bytes per row.
+    let encoded_len = (N * (1 + 8)) as u64;
+    let column = Arc::new(Int64Array::from(values.clone())) as arrow_array::ArrayRef;
+    let converter = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let run = || converter.convert_columns(&[column.clone()]).unwrap();
+    bencher.counter(BytesCount::new(encoded_len)).bench_local(run)
+}
+
+#[divan::bench]
+fn primitive_i64_vortex(bencher: divan::Bencher) {
+    let values = gen_i64(N, 0);
+    let column = PrimitiveArray::from_iter(values.clone()).into_array();
+    let throughput = BytesCount::new((N * (1 + 8)) as u64);
+    let row_encoder = RowEncoder::default();
+    bencher.counter(throughput).bench_local(|| {
+        let mut exec = LEGACY_SESSION.create_execution_ctx();
+        row_encoder.encode(&[column.clone()], &mut exec).unwrap()
+    })
+}
+
+// ---------- utf8 ----------
+
+#[divan::bench]
+fn utf8_arrow_row(bencher: divan::Bencher) {
+    let strings = gen_words(N, 16, 7);
+    // Per value: one sentinel byte plus 33 bytes for every started 32-byte block.
+    let mut encoded_len = 0u64;
+    for s in &strings {
+        encoded_len += 1 + (s.len().div_ceil(32) * 33) as u64;
+    }
+    let column = Arc::new(StringArray::from(strings.clone())) as arrow_array::ArrayRef;
+    let converter = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap();
+    let run = || converter.convert_columns(&[column.clone()]).unwrap();
+    bencher.counter(BytesCount::new(encoded_len)).bench_local(run)
+}
+
+#[divan::bench]
+fn utf8_vortex(bencher: divan::Bencher) {
+    let strings = gen_words(N, 16, 7);
+    let mut encoded_len = 0u64;
+    for s in &strings {
+        encoded_len += 1 + (s.len().div_ceil(32) * 33) as u64;
+    }
+    let column = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let row_encoder = RowEncoder::default();
+    bencher.counter(BytesCount::new(encoded_len)).bench_local(|| {
+        let mut exec = LEGACY_SESSION.create_execution_ctx();
+        row_encoder.encode(&[column.clone()], &mut exec).unwrap()
+    })
+}
+
+// ---------- struct_mixed ----------
+
+fn struct_mixed_inputs() -> (Vec<i64>, Vec<String>, u64) {
+    let ids = gen_i64(N, 1);
+    let names = gen_words(N, 16, 2);
+    // Per row: struct sentinel (1) + i64 field (1 + 8) + name field
+    // (1 sentinel + 33 bytes per started 32-byte block).
+    const FIXED_PART: u64 = 1 + 9;
+    let total: u64 = names
+        .iter()
+        .map(|name| FIXED_PART + 1 + (name.len().div_ceil(32) * 33) as u64)
+        .sum();
+    (ids, names, total)
+}
+
+#[divan::bench]
+fn struct_mixed_arrow_row(bencher: divan::Bencher) {
+    let (ids, names, total) = struct_mixed_inputs();
+
+    // The struct array and the converter schema share the same two non-nullable fields.
+    let id_field = Arc::new(Field::new("id", DataType::Int64, false));
+    let name_field = Arc::new(Field::new("name", DataType::Utf8, false));
+
+    let id_values = Arc::new(Int64Array::from(ids)) as arrow_array::ArrayRef;
+    let name_values = Arc::new(StringArray::from(names)) as arrow_array::ArrayRef;
+    let columns = vec![
+        (id_field.clone(), id_values),
+        (name_field.clone(), name_values),
+    ];
+    let struct_column = Arc::new(ArrowStructArray::from(columns)) as arrow_array::ArrayRef;
+
+    let schema = DataType::Struct(vec![id_field, name_field].into());
+    let converter = RowConverter::new(vec![ArrowSortField::new(schema)]).unwrap();
+
+    let throughput = BytesCount::new(total);
+    bencher
+        .counter(throughput)
+        .bench_local(|| converter.convert_columns(&[struct_column.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn struct_mixed_vortex(bencher: divan::Bencher) {
+    let (ids, names, total) = struct_mixed_inputs();
+    let fields = [
+        ("id", PrimitiveArray::from_iter(ids).into_array()),
+        ("name", VarBinViewArray::from_iter_str(names.iter().map(String::as_str)).into_array()),
+    ];
+    let column = StructArray::from_fields(&fields).unwrap().into_array();
+    let row_encoder = RowEncoder::default();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut exec = LEGACY_SESSION.create_execution_ctx();
+        row_encoder.encode(&[column.clone()], &mut exec).unwrap()
+    })
+}
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
new file mode 100644
index 00000000000..2818db62aba
--- /dev/null
+++ b/vortex-row/src/codec.rs
@@ -0,0 +1,997 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants.
+//!
+//! The encoded byte format produces a lexicographically byte-comparable representation:
+//! comparing the byte slices of two encoded rows yields the same ordering as the
+//! original logical (tuple) comparison of their values, modulo nulls placement and
+//! descending-ness as configured by [`RowSortField`].
+//!
+//! Conventions:
+//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to
+//!   non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF),
+//!   not the sentinel.
+//! - Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each
+//!   for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the
+//!   value and column-byte boundaries stay aligned across rows. See
+//!   [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`].
+//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types.
+//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top
+//!   bit; negative flips all bits.
+//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null
+//!   body** so two null parent rows produce byte-equal encodings: fixed-width children
+//!   contribute their fixed null encoding, and variable-width children collapse to a single
+//!   null sentinel byte.
+
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::accessor::ArrayAccessor;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::ExtensionArray;
+use vortex_array::arrays::FixedSizeListArray;
+use vortex_array::arrays::NullArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::extension::ExtensionArrayExt;
+use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
+use vortex_array::arrays::struct_::StructArrayExt;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::DecimalType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::half::f16;
+use vortex_array::match_each_native_ptype;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+
+use crate::options::RowSortField;
+
+/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte).
+pub(crate) const BOOL_ENCODED_SIZE: u32 = 2;
+
+/// Number of content bytes carried by each block of the variable-length encoding.
+pub(crate) const VARLEN_BLOCK_SIZE: usize = 32;
+/// Total bytes per varlen block including the trailing continuation marker.
+pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1;
+const VARLEN_BLOCK_TOTAL_U32: u32 = 33; // `VARLEN_BLOCK_TOTAL` as a `u32`; keep both in sync
+
+/// Size in bytes of an encoded null varlen value (just the sentinel byte).
+pub(crate) const VARLEN_NULL_SIZE: u32 = 1;
+/// Size in bytes of an encoded empty varlen value (just the sentinel byte).
+pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1;
+
+/// Returns the size in bytes of the encoded form of a non-empty variable-length value.
+///
+/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1
+/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and
+/// [`VARLEN_EMPTY_SIZE`] for empty values. Panics if the result does not fit in a `u32`,
+/// which lengths close to `u32::MAX` can trigger (`2^27` blocks * 33 bytes > `u32::MAX`).
+#[inline]
+fn encoded_size_for_non_empty_varlen(len: usize) -> u32 {
+    debug_assert!(len > 0);
+    let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)).ok();
+    let total = blocks.and_then(|b| b.checked_mul(VARLEN_BLOCK_TOTAL_U32)?.checked_add(1));
+    total.vortex_expect("non-empty varlen encoded size must fit in u32")
+}
+
+/// Per-row encoded size of a fixed-width value: its `value_bytes` plus the 1-byte sentinel.
+#[inline]
+const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
+    value_bytes + 1
+}
+
+fn byte_width_u32(width: usize) -> u32 {
+    TryInto::<u32>::try_into(width).vortex_expect("native byte width must fit in u32")
+}
+
+/// Sentinel byte written for a null varlen value.
+///
+/// Determined by null placement alone (`0x00` when nulls sort first, `0xFF` when they sort
+/// last) and never by `descending`, matching the convention used by `arrow-row`.
+#[inline]
+fn varlen_null_sentinel(field: RowSortField) -> u8 {
+    if field.nulls_first { u8::MIN } else { u8::MAX }
+}
+
+/// Sentinel byte written for an empty varlen value.
+///
+/// `0x01` when ascending; the bitwise complement `0xFE` when descending.
+#[inline]
+fn varlen_empty_sentinel(field: RowSortField) -> u8 {
+    0x01u8 ^ if field.descending { 0xFF } else { 0x00 }
+}
+
+/// Sentinel byte written for a non-empty varlen value.
+///
+/// `0x02` when ascending; the bitwise complement `0xFD` when descending.
+#[inline]
+fn varlen_non_empty_sentinel(field: RowSortField) -> u8 {
+    0x02u8 ^ if field.descending { 0xFF } else { 0x00 }
+}
+
+/// Picks the one-byte canonical null encoding a child writes inside a null parent
+/// struct/FSL row.
+///
+/// Utf8 and Binary children use the varlen null sentinel; every other dtype (nested
+/// struct/FSL included) uses the fixed-width null sentinel.
+fn child_canonical_null_byte(child_dtype: &DType, field: RowSortField) -> u8 {
+    if matches!(child_dtype, DType::Utf8(_) | DType::Binary(_)) {
+        return varlen_null_sentinel(field);
+    }
+    field.null_sentinel()
+}
+
+/// Per-row width classification for a column.
+///
+/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless
+/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary,
+/// or a struct / fixed-size list that recurses through a variable-width field).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum RowWidth {
+    /// Per-row width is the same constant for every row in the column.
+    Fixed(u32),
+    /// Per-row width is data-dependent.
+    Variable,
+}
+
+/// Classify a column's per-row encoded width by inspecting only its [`DType`].
+///
+/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value),
+/// regardless of null-ness, value or [`RowSortField`]; returns `Variable` when per-row
+/// sizes depend on the data. Extension dtypes are classified by their storage dtype.
+///
+/// NOTE(review): the decimal width is derived from the smallest value type for the dtype,
+/// whereas `add_size_decimal` sizes rows from `arr.values_type()` — confirm they agree.
+///
+/// # Errors
+///
+/// Returns an error for `List`, `Variant`, `Union` and 256-bit decimals, and when the
+/// width of a struct or fixed-size list would overflow `u32`.
+pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
+    match dtype {
+        DType::Null => Ok(RowWidth::Fixed(1)),
+        DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)),
+        DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32(
+            ptype.byte_width(),
+        )))),
+        DType::Decimal(dt, _) => {
+            let vt = DecimalType::smallest_decimal_value_type(dt);
+            if matches!(vt, DecimalType::I256) {
+                vortex_bail!("row encoding for Decimal256 is not yet implemented");
+            }
+            Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32(
+                vt.byte_width(),
+            ))))
+        }
+        DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable),
+        DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? {
+            // A fixed-size list is fixed-width iff its element type is: one sentinel byte
+            // for the list itself plus `n` copies of the element width.
+            RowWidth::Fixed(w) => {
+                let body = w
+                    .checked_mul(*n)
+                    .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?;
+                let total = body
+                    .checked_add(1)
+                    .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?;
+                Ok(RowWidth::Fixed(total))
+            }
+            RowWidth::Variable => Ok(RowWidth::Variable),
+        },
+        DType::Struct(fields, _) => {
+            // A struct is fixed-width iff all of its fields are: sentinel + sum of widths.
+            let mut total: u32 = 1; // outer sentinel
+            for field_dtype in fields.fields() {
+                match row_width_for_dtype(&field_dtype)? {
+                    RowWidth::Fixed(w) => {
+                        total = total.checked_add(w).ok_or_else(|| {
+                            vortex_error::vortex_err!("Struct row width overflows u32")
+                        })?;
+                    }
+                    RowWidth::Variable => return Ok(RowWidth::Variable),
+                }
+            }
+            Ok(RowWidth::Fixed(total))
+        }
+        DType::List(..) => {
+            vortex_bail!(
+                "row encoding does not support variable-size List arrays (no well-defined ordering)"
+            )
+        }
+        DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()),
+        DType::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"),
+    }
+}
+
+/// Add the per-row encoded size of the column held by `canonical` into `sizes`.
+///
+/// `sizes` must already be initialized (typically zeroed): each entry is *incremented* by
+/// its row's size, so several columns can accumulate into the same buffer.
+///
+/// # Errors
+///
+/// Returns an error for `List` and `Variant`, or when a per-variant sizing helper fails.
+pub(crate) fn field_size(
+    canonical: &Canonical,
+    field: RowSortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => add_size_null(arr, sizes),
+        Canonical::Bool(_) => add_size_const(sizes, BOOL_ENCODED_SIZE),
+        Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
+        Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
+        Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
+        Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
+        Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
+        Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?,
+        Canonical::List(_) => vortex_bail!(
+            "row encoding does not support canonical List arrays: {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+    }
+    Ok(())
+}
+
+/// Encode the column held by `canonical` into `out`: row `i` is written starting at
+/// `offsets[i] + cursors[i]`, and `cursors[i]` advances by the number of bytes written
+/// (expected to equal the contribution [`field_size`] computed for the same column).
+///
+/// Dispatches on the canonical variant; `List` and `Variant` are rejected with an error,
+/// and errors from the per-variant encoders are propagated.
+pub(crate) fn field_encode(
+    canonical: &Canonical,
+    field: RowSortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out),
+        Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::List(_) => vortex_bail!(
+            "row encoding does not support canonical List arrays: {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+    }
+    Ok(())
+}
+
+/// Adds `add` to every entry; checked, because `+=` wraps in builds without overflow checks.
+fn add_size_const(sizes: &mut [u32], add: u32) {
+    let bump = |s: &mut u32| *s = s.checked_add(add).vortex_expect("per-row size overflow");
+    sizes.iter_mut().for_each(bump);
+}
+
+fn add_size_null(arr: &NullArray, sizes: &mut [u32]) {
+    debug_assert_eq!(arr.len(), sizes.len());
+    // One sentinel byte per row; checked so an overflow cannot wrap unnoticed.
+    for s in sizes.iter_mut() {
+        *s = s.checked_add(1).vortex_expect("per-row size overflow");
+    }
+}
+
+fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) {
+    let value_bytes = arr.ptype().byte_width();
+    add_size_const(sizes, encoded_size_for_fixed(byte_width_u32(value_bytes)));
+}
+
+fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
+    let value_bytes = arr.values_type().byte_width();
+    add_size_const(sizes, encoded_size_for_fixed(byte_width_u32(value_bytes)));
+}
+
+fn add_size_varbinview(
+    arr: &VarBinViewArray,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let validity = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let views = arr.views();
+    for (row, view) in views.iter().enumerate() {
+        // Null and empty values are a lone sentinel byte; the rest are block-encoded.
+        let bytes = match (validity.value(row), view.is_empty()) {
+            (false, _) => VARLEN_NULL_SIZE,
+            (true, true) => VARLEN_EMPTY_SIZE,
+            (true, false) => encoded_size_for_non_empty_varlen(view.len() as usize),
+        };
+        let slot = &mut sizes[row];
+        *slot = slot
+            .checked_add(bytes)
+            .vortex_expect("per-row size overflow");
+    }
+    Ok(())
+}
+
+fn add_size_struct(
+    arr: &StructArray,
+    field: RowSortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let row_count = arr.len();
+    let parent_valid = arr.as_ref().validity()?.execute_mask(row_count, ctx)?;
+    // Every row starts with the struct's own sentinel byte.
+    sizes
+        .iter_mut()
+        .for_each(|s| *s = s.checked_add(1).vortex_expect("per-row size overflow"));
+    // A child adds its real per-row size under a non-null parent and its canonical null
+    // size under a null parent. Both are equal for fixed-width children, so their constant
+    // width goes to every row; a variable-width child shrinks to one byte under a null
+    // parent, which keeps the body of null parent rows constant.
+    for child in arr.iter_unmasked_fields() {
+        let RowWidth::Fixed(width) = row_width_for_dtype(child.dtype())? else {
+            // Variable width: size the child on its own, then fold it in row by row.
+            let canonical = child.clone().execute::<Canonical>(ctx)?;
+            let mut child_sizes = vec![0u32; row_count];
+            field_size(&canonical, field, &mut child_sizes, ctx)?;
+            for (row, &child_size) in child_sizes.iter().enumerate() {
+                let bytes = if parent_valid.value(row) { child_size } else { 1u32 };
+                sizes[row] = sizes[row]
+                    .checked_add(bytes)
+                    .vortex_expect("per-row size overflow");
+            }
+            continue;
+        };
+        add_size_const(sizes, width);
+    }
+    Ok(())
+}
+
+/// Adds the per-row size of a fixed-size-list column into `sizes`: one sentinel byte plus
+/// the sizes of the row's elements (or one null byte per element when the list row is
+/// null and its elements are variable-width).
+fn add_size_fsl(
+    arr: &FixedSizeListArray,
+    field: RowSortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let row_count = arr.len();
+    debug_assert_eq!(row_count, sizes.len());
+    let per_list = arr.list_size() as usize;
+    let parent_valid = arr.as_ref().validity()?.execute_mask(row_count, ctx)?;
+    let elem_dtype = arr.elements().dtype();
+    // Every row starts with the list's own sentinel byte.
+    sizes
+        .iter_mut()
+        .for_each(|s| *s = s.checked_add(1).vortex_expect("per-row size overflow"));
+    if let RowWidth::Fixed(elem_width) = row_width_for_dtype(elem_dtype)? {
+        // Fixed-width elements: all rows, null parents included, carry `per_list` copies
+        // of the same element width.
+        let copies = u32::try_from(per_list).vortex_expect("list_size fits u32");
+        let body = elem_width
+            .checked_mul(copies)
+            .vortex_expect("FSL body width overflow");
+        add_size_const(sizes, body);
+        return Ok(());
+    }
+    // Variable-width elements: size the flattened element array once, then total each
+    // row's `per_list` consecutive entries.
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), row_count * per_list);
+    let mut elem_sizes = vec![0u32; row_count * per_list];
+    field_size(&elements, field, &mut elem_sizes, ctx)?;
+    for row in 0..row_count {
+        let body = if parent_valid.value(row) {
+            let start = row * per_list;
+            elem_sizes[start..start + per_list]
+                .iter()
+                .try_fold(0u32, |sum, &w| sum.checked_add(w))
+                .vortex_expect("FSL row body overflow")
+        } else {
+            // Canonical null body: one null sentinel byte per element slot.
+            u32::try_from(per_list).vortex_expect("list_size fits u32")
+        };
+        sizes[row] = sizes[row]
+            .checked_add(body)
+            .vortex_expect("FSL per-row size overflow");
+    }
+    Ok(())
+}
+
+fn add_size_extension(
+    arr: &ExtensionArray,
+    field: RowSortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // An extension column is sized exactly like its canonicalized storage array.
+    field_size(&arr.storage_array().clone().execute::<Canonical>(ctx)?, field, sizes, ctx)
+}
+
+fn encode_null(
+    arr: &NullArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) {
+    // Every row of a null column is just the null sentinel byte.
+    let sentinel = field.null_sentinel();
+    (0..arr.len()).for_each(|row| {
+        out[(row_offsets[row] + col_offset[row]) as usize] = sentinel;
+        col_offset[row] += 1;
+    });
+}
+
+fn encode_bool(
+    arr: &BoolArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let validity = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let bits = arr.clone().into_bit_buffer();
+    // Descending order complements the value byte only; sentinels are left untouched.
+    let flip: u8 = if field.descending { 0xFF } else { 0x00 };
+    let (valid_sentinel, null_sentinel) = (field.non_null_sentinel(), field.null_sentinel());
+    for row in 0..bits.len() {
+        let start = (row_offsets[row] + col_offset[row]) as usize;
+        // Null rows: null sentinel + zero byte. Valid rows: 0x01 for false, 0x02 for true
+        // (so false < true), XOR-ed with `flip`.
+        let (sentinel, value) = if validity.value(row) {
+            (valid_sentinel, (u8::from(bits.value(row)) + 1) ^ flip)
+        } else {
+            (null_sentinel, 0)
+        };
+        out[start] = sentinel;
+        out[start + 1] = value;
+        col_offset[row] += BOOL_ENCODED_SIZE;
+    }
+    Ok(())
+}
+
+fn encode_primitive(
+    arr: &PrimitiveArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| { // `T` is supplied by the macro from `arr.ptype()`
+        encode_primitive_typed::<T>(arr, field, row_offsets, col_offset, out, ctx)?;
+    });
+    Ok(())
+}
+
+fn encode_primitive_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let slice: &[T] = arr.as_slice();
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    // Loop-invariant: every row, null or not, occupies sentinel + `value_bytes`.
+    let total = encoded_size_for_fixed(byte_width_u32(value_bytes));
+    for (i, &v) in slice.iter().enumerate() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+        } else {
+            out[pos] = null;
+            // Null rows carry a zeroed value so every row keeps the same width.
+            out[pos + 1..pos + 1 + value_bytes].fill(0);
+        }
+        col_offset[i] += total;
+    }
+    Ok(())
+}
+
+/// Row-encode a decimal column by dispatching on the width of its stored values.
+///
+/// Every storage type up to `i128` shares `encode_decimal_typed`; 256-bit decimals are
+/// rejected with an error because no byte encoding is implemented for them here.
+fn encode_decimal(
+    arr: &DecimalArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let validity = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    // Forward to the typed encoder for the given native storage type.
+    macro_rules! encode_as {
+        ($native:ty) => {
+            encode_decimal_typed::<$native>(arr, &validity, field, row_offsets, col_offset, out)
+        };
+    }
+    match arr.values_type() {
+        DecimalType::I8 => encode_as!(i8),
+        DecimalType::I16 => encode_as!(i16),
+        DecimalType::I32 => encode_as!(i32),
+        DecimalType::I64 => encode_as!(i64),
+        DecimalType::I128 => encode_as!(i128),
+        DecimalType::I256 => {
+            vortex_bail!("row encoding for Decimal256 is not yet implemented")
+        }
+    }
+    Ok(())
+}
+
+/// Write a sentinel byte plus `size_of::<T>()` value bytes per row for a decimal column
+/// stored as `T` (zeros when `mask` marks the row null), then advance its `col_offset`.
+fn encode_decimal_typed<T>(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) where
+    T: vortex_array::dtype::NativeDecimalType + RowEncode,
+{
+    let (valid_byte, null_byte) = (field.non_null_sentinel(), field.null_sentinel());
+    let width = size_of::<T>();
+    let slot_size = encoded_size_for_fixed(byte_width_u32(width));
+    let values = arr.buffer::<T>();
+    for row in 0..values.len() {
+        let start = (row_offsets[row] + col_offset[row]) as usize;
+        let body = start + 1..start + 1 + width;
+        if mask.value(row) {
+            out[start] = valid_byte;
+            values[row].encode_to(&mut out[body], field.descending);
+        } else {
+            out[start] = null_byte;
+            out[body].fill(0);
+        }
+        col_offset[row] += slot_size;
+    }
+}
+
+/// Row-encode a `VarBinViewArray` column: one sentinel byte per row, followed by the
+/// block-encoded body when the value is non-empty.
+fn encode_varbinview(
+    arr: &VarBinViewArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let null_byte = varlen_null_sentinel(field);
+    let empty_byte = varlen_empty_sentinel(field);
+    let non_empty_byte = varlen_non_empty_sentinel(field);
+
+    // Null rows arrive as `None` from the iterator, so validity needs no separate mask.
+    arr.with_iterator(|values| {
+        for (row, value) in values.enumerate() {
+            let pos = (row_offsets[row] + col_offset[row]) as usize;
+            let advance = match value {
+                None => {
+                    out[pos] = null_byte;
+                    VARLEN_NULL_SIZE
+                }
+                Some([]) => {
+                    out[pos] = empty_byte;
+                    VARLEN_EMPTY_SIZE
+                }
+                Some(bytes) => {
+                    out[pos] = non_empty_byte;
+                    1 + encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], field.descending)
+                }
+            };
+            col_offset[row] += advance;
+        }
+    });
+    Ok(())
+}
+
+/// Row-encode a struct column.
+///
+/// Each row gets an outer sentinel byte recording the struct's own validity, followed by
+/// the encoding of every child field. Where the struct row itself is null, each child's
+/// bytes are replaced by a canonical null encoding so that any two null struct rows are
+/// byte-equal regardless of the child values stored underneath them.
+fn encode_struct(
+    arr: &StructArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let len = arr.len();
+    let validity = arr.as_ref().validity()?.execute_mask(len, ctx)?;
+    let (valid_byte, null_byte) = (field.non_null_sentinel(), field.null_sentinel());
+
+    // Outer sentinel: one byte per row recording whether the struct itself is null.
+    for row in 0..len {
+        let pos = (row_offsets[row] + col_offset[row]) as usize;
+        out[pos] = if validity.value(row) { valid_byte } else { null_byte };
+        col_offset[row] += 1;
+    }
+
+    // Children are appended left-to-right through the shared `col_offset` cursors; the
+    // width class of each child's dtype selects how null struct rows are canonicalized.
+    for child in arr.iter_unmasked_fields() {
+        match row_width_for_dtype(child.dtype())? {
+            RowWidth::Variable => {
+                encode_variable_child(child, field, &validity, row_offsets, col_offset, out, ctx)?;
+            }
+            RowWidth::Fixed(w) => {
+                // Encode the child as-is, then overwrite the slots of null struct rows with
+                // a child-level null: null sentinel followed by zeroed value bytes.
+                let canonical = child.clone().execute::<Canonical>(ctx)?;
+                field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?;
+                let child_null = child_canonical_null_byte(child.dtype(), field);
+                for row in (0..len).filter(|&row| !validity.value(row)) {
+                    let end = (row_offsets[row] + col_offset[row]) as usize;
+                    let start = end - w as usize;
+                    out[start] = child_null;
+                    out[start + 1..end].fill(0);
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Row-encode a fixed-size-list column.
+///
+/// Every row starts with an outer sentinel byte for the list's own validity, followed by
+/// the encodings of its `list_size` elements. Elements of a null list row are replaced
+/// by a canonical null encoding so that null rows do not depend on hidden element data:
+/// - fixed-width elements: a null sentinel plus zeroed value bytes per element;
+/// - variable-width elements: a single null sentinel byte per element.
+///
+/// Each row's `col_offset` cursor is advanced past the bytes written. Panics if a row
+/// body size or a scratch offset overflows `u32`.
+fn encode_fsl(
+    arr: &FixedSizeListArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let nrows = arr.len();
+    let list_size_u32 = arr.list_size();
+    let list_size = list_size_u32 as usize;
+    let validity = arr.as_ref().validity()?.execute_mask(nrows, ctx)?;
+    let (valid_byte, null_byte) = (field.non_null_sentinel(), field.null_sentinel());
+    let elem_dtype = arr.elements().dtype().clone();
+
+    // Outer sentinel: one byte per row for the validity of the list itself.
+    for row in 0..nrows {
+        let pos = (row_offsets[row] + col_offset[row]) as usize;
+        out[pos] = if validity.value(row) { valid_byte } else { null_byte };
+        col_offset[row] += 1;
+    }
+
+    let elem_count = nrows * list_size;
+    match row_width_for_dtype(&elem_dtype)? {
+        RowWidth::Fixed(w) => {
+            // Fixed-width elements are encoded straight into `out`: element `j` of row `i`
+            // lands at the row's current position plus `j * w`, expressed to `field_encode`
+            // as a flattened per-element offsets array with all cursors starting at zero.
+            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+            debug_assert_eq!(elements.len(), elem_count);
+            let row_body_bytes = w
+                .checked_mul(list_size_u32)
+                .vortex_expect("FSL body width overflow");
+            let mut elem_offsets = Vec::with_capacity(elem_count);
+            for row in 0..nrows {
+                let base = row_offsets[row] + col_offset[row];
+                elem_offsets.extend((0..list_size_u32).map(|j| base + j * w));
+            }
+            let mut elem_cursors = vec![0u32; elem_count];
+            field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
+            for cursor in &mut col_offset[..nrows] {
+                *cursor = cursor
+                    .checked_add(row_body_bytes)
+                    .vortex_expect("FSL row body overflow");
+            }
+            // Null list rows: overwrite every element slot with a null sentinel followed
+            // by zeroed value bytes.
+            let elem_null = child_canonical_null_byte(&elem_dtype, field);
+            let elem_width = w as usize;
+            for row in (0..nrows).filter(|&row| !validity.value(row)) {
+                let end = (row_offsets[row] + col_offset[row]) as usize;
+                let start = end - row_body_bytes as usize;
+                for slot in 0..list_size {
+                    let at = start + slot * elem_width;
+                    out[at] = elem_null;
+                    out[at + 1..at + elem_width].fill(0);
+                }
+            }
+        }
+        RowWidth::Variable => {
+            // Variable-width elements are first sized and encoded back-to-back into a
+            // scratch buffer, then copied into place for non-null list rows. Null list
+            // rows instead receive exactly `list_size` null sentinel bytes.
+            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+            debug_assert_eq!(elements.len(), elem_count);
+            let mut elem_sizes = vec![0u32; elem_count];
+            field_size(&elements, field, &mut elem_sizes, ctx)?;
+            let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum();
+            let total_usize =
+                usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize");
+            let mut scratch = vec![0u8; total_usize];
+            // Exclusive prefix sum of the element sizes gives each element's scratch offset.
+            let mut scratch_offsets = Vec::with_capacity(elem_count);
+            let mut next_offset: u32 = 0;
+            for &size in &elem_sizes {
+                scratch_offsets.push(next_offset);
+                next_offset = next_offset
+                    .checked_add(size)
+                    .vortex_expect("FSL scratch offset overflow");
+            }
+            let mut scratch_cursors = vec![0u32; elem_count];
+            field_encode(
+                &elements,
+                field,
+                &scratch_offsets,
+                &mut scratch_cursors,
+                &mut scratch,
+                ctx,
+            )?;
+            let elem_null = child_canonical_null_byte(&elem_dtype, field);
+            for row in 0..nrows {
+                let dst = (row_offsets[row] + col_offset[row]) as usize;
+                let advance = if validity.value(row) {
+                    let mut body_bytes: u32 = 0;
+                    for k in row * list_size..(row + 1) * list_size {
+                        let (src, len) = (scratch_offsets[k] as usize, elem_sizes[k] as usize);
+                        let at = dst + body_bytes as usize;
+                        out[at..at + len].copy_from_slice(&scratch[src..src + len]);
+                        body_bytes = body_bytes
+                            .checked_add(elem_sizes[k])
+                            .vortex_expect("FSL body bytes overflow");
+                    }
+                    body_bytes
+                } else {
+                    out[dst..dst + list_size].fill(elem_null);
+                    u32::try_from(list_size).vortex_expect("list_size fits u32")
+                };
+                col_offset[row] = col_offset[row]
+                    .checked_add(advance)
+                    .vortex_expect("FSL row offset overflow");
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Encode one variable-width child of a struct through a scratch buffer.
+///
+/// The child is sized and encoded back-to-back into scratch space; rows whose parent is
+/// valid get their bytes copied into `out`, while rows whose parent is null get a single
+/// `child_canonical_null_byte` instead. `col_offset` advances by the bytes written.
+fn encode_variable_child(
+    child: &vortex_array::ArrayRef,
+    field: RowSortField,
+    parent_mask: &vortex_mask::Mask,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let len = child.len();
+    let canonical = child.clone().execute::<Canonical>(ctx)?;
+
+    // Per-row encoded sizes, and a scratch buffer large enough to hold all of them.
+    let mut child_sizes = vec![0u32; len];
+    field_size(&canonical, field, &mut child_sizes, ctx)?;
+    let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum();
+    let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize");
+    let mut scratch = vec![0u8; total_usize];
+    // Exclusive prefix sum of the sizes: where each row starts inside `scratch`.
+    let mut scratch_offsets = Vec::with_capacity(len);
+    let mut next_offset: u32 = 0;
+    for &size in &child_sizes {
+        scratch_offsets.push(next_offset);
+        next_offset = next_offset
+            .checked_add(size)
+            .vortex_expect("child scratch offset overflow");
+    }
+    let mut scratch_cursors = vec![0u32; len];
+    field_encode(
+        &canonical,
+        field,
+        &scratch_offsets,
+        &mut scratch_cursors,
+        &mut scratch,
+        ctx,
+    )?;
+    let null_byte = child_canonical_null_byte(child.dtype(), field);
+    for row in 0..len {
+        let dst = (row_offsets[row] + col_offset[row]) as usize;
+        let advance = if parent_mask.value(row) {
+            let (src, size) = (scratch_offsets[row] as usize, child_sizes[row] as usize);
+            out[dst..dst + size].copy_from_slice(&scratch[src..src + size]);
+            child_sizes[row]
+        } else {
+            out[dst] = null_byte;
+            1
+        };
+        col_offset[row] = col_offset[row]
+            .checked_add(advance)
+            .vortex_expect("col_offset overflow");
+    }
+    Ok(())
+}
+
+fn encode_extension(
+    arr: &ExtensionArray,
+    field: RowSortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?; // canonical storage
+    field_encode(&storage, field, row_offsets, col_offset, out, ctx) // encoded in its place
+}
+
+/// Block-encode a non-empty byte string into `out`, returning the number of bytes written.
+///
+/// Input is split into `VARLEN_BLOCK_SIZE`-byte blocks, each followed by a marker byte:
+/// `0xFF` if more data follows, else the number of data bytes in the final zero-padded
+/// block. All bytes are inverted when `descending`. Empty values never reach this function.
+fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
+    debug_assert!(!bytes.is_empty());
+    let flip = if descending { 0xFFu8 } else { 0x00 };
+    let mut written = 0usize;
+    let mut rest = bytes;
+    while rest.len() > VARLEN_BLOCK_SIZE {
+        let (block, tail) = rest.split_at(VARLEN_BLOCK_SIZE);
+        for (dst, &src) in out[written..written + VARLEN_BLOCK_SIZE].iter_mut().zip(block) {
+            *dst = src ^ flip;
+        }
+        out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ flip;
+        written += VARLEN_BLOCK_TOTAL;
+        rest = tail;
+    }
+    // Final block holds 1..=VARLEN_BLOCK_SIZE bytes; pad it and record that count.
+    let n = rest.len();
+    let (data, padding) = out[written..written + VARLEN_BLOCK_SIZE].split_at_mut(n);
+    for (dst, &src) in data.iter_mut().zip(rest) {
+        *dst = src ^ flip;
+    }
+    padding.fill(flip);
+    out[written + VARLEN_BLOCK_SIZE] =
+        u8::try_from(n).vortex_expect("final varlen block length must fit in u8") ^ flip;
+    written += VARLEN_BLOCK_TOTAL;
+    u32::try_from(written).vortex_expect("encoded varlen byte length must fit in u32")
+}
+
+/// Internal trait for encoding a fixed-width native value into byte slots.
+///
+/// Implementations must write `size_of::<Self>()` bytes whose lexicographic order matches
+/// the natural ordering of the type; `out` is expected to be exactly that long.
+pub(crate) trait RowEncode: Copy {
+    /// Encode this value into `out`, inverting every byte when `descending` is set.
+    fn encode_to(self, out: &mut [u8], descending: bool);
+}
+
+// `RowEncode` for unsigned ints: big-endian bytes, bitwise inverted when descending.
+macro_rules! impl_row_encode_unsigned {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let bytes = self.to_be_bytes();
+                if !descending {
+                    return out.copy_from_slice(&bytes);
+                }
+                for (dst, src) in out[..bytes.len()].iter_mut().zip(bytes) {
+                    *dst = !src;
+                }
+            }
+        }
+    };
+}
+
+// `RowEncode` for signed ints: sign-flipped big-endian bytes, inverted when descending.
+macro_rules! impl_row_encode_signed {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                // Flipping the sign bit makes negatives sort below non-negatives byte-wise.
+                let mut bytes = self.to_be_bytes();
+                bytes[0] ^= 0x80;
+                if !descending {
+                    return out.copy_from_slice(&bytes);
+                }
+                for (dst, src) in out[..bytes.len()].iter_mut().zip(bytes) {
+                    *dst = !src;
+                }
+            }
+        }
+    };
+}
+
+impl_row_encode_unsigned!(u8);
+impl_row_encode_unsigned!(u16);
+impl_row_encode_unsigned!(u32);
+impl_row_encode_unsigned!(u64);
+impl_row_encode_signed!(i8);
+impl_row_encode_signed!(i16);
+impl_row_encode_signed!(i32);
+impl_row_encode_signed!(i64);
+impl_row_encode_signed!(i128); // widest decimal storage type accepted by `encode_decimal`
+
+impl RowEncode for f32 {
+    /// Encode so that ascending big-endian byte order matches `f32::total_cmp` order.
+    ///
+    /// Descending order is the bitwise complement of the ascending encoding, and `out`
+    /// must be exactly 4 bytes long.
+    #[inline]
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        let sign_set = bits >> 31 != 0;
+        // Sign bit clear: set it so non-negatives sort above negatives. Sign bit set:
+        // invert all bits so that larger magnitudes sort lower.
+        let ordered = bits ^ if sign_set { u32::MAX } else { 0x8000_0000 };
+        // Complementing every bit reverses the byte order for descending sorts.
+        let encoded = if descending { !ordered } else { ordered };
+        out.copy_from_slice(&encoded.to_be_bytes());
+    }
+}
+
+impl RowEncode for f64 {
+    /// Encode so that ascending big-endian byte order matches `f64::total_cmp` order.
+    ///
+    /// Descending order is the bitwise complement of the ascending encoding, and `out`
+    /// must be exactly 8 bytes long.
+    #[inline]
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        let sign_set = bits >> 63 != 0;
+        // Sign bit clear: set it so non-negatives sort above negatives. Sign bit set:
+        // invert all bits so that larger magnitudes sort lower.
+        let ordered = bits ^ if sign_set { u64::MAX } else { 0x8000_0000_0000_0000 };
+        // Complementing every bit reverses the byte order for descending sorts.
+        let encoded = if descending { !ordered } else { ordered };
+        out.copy_from_slice(&encoded.to_be_bytes());
+    }
+}
+
+impl RowEncode for f16 {
+    /// Encode the 16-bit pattern so that big-endian byte order matches numeric order:
+    /// non-negative values get their sign bit set, negative values get every bit inverted.
+    #[inline]
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        let sign_set = bits >> 15 != 0;
+        let ordered = bits ^ if sign_set { u16::MAX } else { 0x8000 };
+        // Descending order is the bitwise complement of the ascending encoding.
+        let encoded = if descending { !ordered } else { ordered };
+        out.copy_from_slice(&encoded.to_be_bytes());
+    }
+}
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
new file mode 100644
index 00000000000..d3721e49a6e
--- /dev/null
+++ b/vortex-row/src/encode.rs
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! `RowEncode` variadic scalar function: encode N input columns into a single `ListView<u8>`.
+//!
+//! The output's `(elements, offsets, sizes)` triple is built up in a single left-to-right
+//! pass over the input columns. The `sizes` array doubles as the per-row write cursor, so
+//! when the last column finishes encoding, the accumulator is the final array - no separate
+//! conversion step is needed.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::options::RowEncodingOptions;
+use crate::options::deserialize_row_encoding_options;
+use crate::options::serialize_row_encoding_options;
+use crate::size::compute_sizes;
+
+/// Variadic scalar function that encodes N input columns into a single non-nullable
+/// `List<u8>` [`ListViewArray`], where row `i` holds the row-encoded bytes of
+/// `cols[0][i], cols[1][i], ...` concatenated left-to-right.
+///
+/// This scalar function is public for session registration and encoding extension work.
+/// Most callers should use [`RowEncoder`](crate::RowEncoder) rather than invoking the scalar
+/// function directly.
+#[derive(Clone, Debug)]
+pub struct RowEncode;
+
+impl ScalarFnVTable for RowEncode {
+    type Options = RowEncodingOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_encode")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encoding_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encoding_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{child_idx}")))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        let byte = Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable));
+        Ok(DType::List(byte, Nullability::NonNullable))
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        execute_row_encode(options, args, ctx)
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        // Execution can return errors (unsupported input types, or encoded output that
+        // would exceed `u32::MAX` bytes), so report this function as fallible.
+        true
+    }
+}
+
+/// Execute the row-encode scalar function over `args`.
+///
+/// Sizes every row, allocates one zeroed byte buffer for all rows, then lets each column
+/// append its bytes to every row in turn. The result is a non-nullable `ListView<u8>`
+/// whose offsets/sizes are `u32`, so inputs whose row count or total encoded size
+/// exceeds `u32::MAX` are rejected with an error.
+fn execute_row_encode(
+    options: &RowEncodingOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    // Offsets and sizes are stored as `u32`, so the row count must fit in one.
+    let nrows = args.row_count();
+    let Ok(nrows_u32) = u32::try_from(nrows) else {
+        vortex_bail!("row-encoded input has {} rows, exceeds u32::MAX", nrows);
+    };
+
+    // ===== Phase 1: classify + size pass =====
+    let crate::size::SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        columns,
+    } = compute_sizes(options, args, ctx)?;
+
+    // ===== Phase 2: totals + buffer =====
+    let var_total: u64 = match var_lengths.as_ref() {
+        Some(lengths) => lengths.iter().map(|&len| u64::from(len)).sum(),
+        None => 0,
+    };
+    let total: u64 = (nrows as u64)
+        .checked_mul(u64::from(fixed_per_row))
+        .and_then(|fixed_total| fixed_total.checked_add(var_total))
+        .ok_or_else(|| {
+            vortex_error::vortex_err!("row-encoded total bytes overflow u64 (nrows * fixed + var)")
+        })?;
+    if total > u32::MAX as u64 {
+        vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total);
+    }
+    let total_len =
+        usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize");
+
+    // Zero-initialize the elements buffer so encoders may rely on untouched bytes
+    // already being zero, which simplifies the null-row fill paths.
+    let mut out_buf: BufferMut<u8> = BufferMut::with_capacity(total_len);
+    out_buf.push_n(0u8, total_len);
+
+    // ===== Phase 3: per-row offsets =====
+    // listview_offsets[i] is the absolute byte offset where row `i` begins: the fixed
+    // bytes of all earlier rows plus, when variable-width columns exist, the exclusive
+    // prefix sum of `var_lengths`. None of this arithmetic can overflow because the
+    // grand total was verified to fit in u32 above.
+    let mut listview_offsets: BufferMut<u32> = BufferMut::with_capacity(nrows);
+    if let Some(lengths) = var_lengths.as_ref() {
+        let mut var_start: u32 = 0;
+        for (row_idx, &var_len) in (0..nrows_u32).zip(lengths.iter()) {
+            listview_offsets.push(row_idx * fixed_per_row + var_start);
+            var_start += var_len;
+        }
+    } else {
+        for row_idx in 0..nrows_u32 {
+            listview_offsets.push(row_idx * fixed_per_row);
+        }
+    }
+    let listview_offsets_slice: &[u32] = listview_offsets.as_slice();
+
+    // Per-row write cursors, all starting at zero. Once every column has been encoded
+    // they equal the row lengths, so the same buffer becomes the ListView `sizes`.
+    let mut row_cursors: BufferMut<u32> = BufferMut::with_capacity(nrows);
+    row_cursors.push_n(0u32, nrows);
+
+    // ===== Phase 4: encode columns via the cursor path =====
+    // Each column was canonicalized once during the size pass; reuse that canonical form.
+    for (col_idx, canonical) in columns.iter().enumerate() {
+        codec::field_encode(
+            canonical,
+            options.fields[col_idx],
+            listview_offsets_slice,
+            row_cursors.as_mut_slice(),
+            &mut out_buf,
+            ctx,
+        )?;
+    }
+
+    // ===== Phase 5: build ListView output =====
+    let elements = PrimitiveArray::new(out_buf.freeze(), Validity::NonNullable).into_array();
+    let offsets_arr =
+        PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array();
+    let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array();
+    Ok(
+        ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)?
+            .into_array(),
+    )
+}
diff --git a/vortex-row/src/encoder.rs b/vortex-row/src/encoder.rs
new file mode 100644
index 00000000000..15eeda6d2f1
--- /dev/null
+++ b/vortex-row/src/encoder.rs
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! User-facing entry point: turn N columnar arrays into one row-encoded `ListView<u8>`.
+
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::scalar_fn::VecExecutionArgs;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+
+use crate::encode::RowEncode;
+use crate::options::RowEncodingOptions;
+use crate::options::RowSortField;
+use crate::size::RowSize;
+
+/// Encodes N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose row
+/// byte slices compare lexicographically in the same order as a tuple comparison of the input
+/// values under the configured [`RowSortField`]s.
+///
+/// Pin per-column sort options with [`RowEncoder::new`] or [`RowEncoder::with_options`];
+/// [`RowEncoder::default`] orders every column ascending, nulls first. Reusable across calls.
+#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
+pub struct RowEncoder {
+    /// `None` means default options are derived from the column count at encode time.
+    options: Option<RowEncodingOptions>,
+}
+
+impl RowEncoder {
+    /// Build an encoder that applies one [`RowSortField`] to each input column, in order.
+    pub fn new(fields: impl IntoIterator<Item = RowSortField>) -> Self {
+        Self::with_options(RowEncodingOptions::new(fields))
+    }
+
+    /// Build an encoder from an explicit [`RowEncodingOptions`].
+    pub fn with_options(options: RowEncodingOptions) -> Self {
+        let options = Some(options);
+        Self { options }
+    }
+
+    /// The pinned options, or `None` when default (ascending, nulls-first) options are
+    /// derived from the column count at encode time.
+    pub fn options(&self) -> Option<&RowEncodingOptions> {
+        self.options.as_ref()
+    }
+
+    /// Encode `cols` into a single row-oriented [`ListViewArray`] of `u8`.
+    ///
+    /// # Errors
+    /// Fails if `cols` is empty, if the columns differ in length, if pinned options
+    /// describe a different number of columns, or if the encoding itself fails.
+    pub fn encode(&self, cols: &[ArrayRef], ctx: &mut ExecutionCtx) -> VortexResult<ListViewArray> {
+        let (options, args) = self.prepare(cols)?;
+        let encoded = RowEncode.execute(&options, &args, ctx)?;
+        encoded.execute::<ListViewArray>(ctx)
+    }
+
+    /// Compute only the per-row sizes (the `Struct { fixed: u32, var: u32 }` produced by
+    /// [`RowSize`]) without materializing the encoded rows.
+    pub fn row_sizes(&self, cols: &[ArrayRef], ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
+        let (options, args) = self.prepare(cols)?;
+        RowSize.execute(&options, &args, ctx)
+    }
+
+    /// Validate the input columns and resolve the options + execution args shared by
+    /// [`encode`](Self::encode) and [`row_sizes`](Self::row_sizes).
+    ///
+    /// Checks run in this order: `cols` must be non-empty, pinned options (if any) must
+    /// describe exactly `cols.len()` columns, and every column must have the same length
+    /// as the first one. Without pinned options, per-column defaults are generated.
+    fn prepare(&self, cols: &[ArrayRef]) -> VortexResult<(RowEncodingOptions, VecExecutionArgs)> {
+        let Some(first) = cols.first() else {
+            vortex_bail!("RowEncoder: at least one column is required");
+        };
+        let options = match &self.options {
+            None => RowEncodingOptions::default_for_columns(cols.len()),
+            Some(pinned) if pinned.len() == cols.len() => pinned.clone(),
+            Some(pinned) => {
+                vortex_bail!(
+                    "RowEncoder: options describe {} columns but {} were provided",
+                    pinned.len(),
+                    cols.len()
+                );
+            }
+        };
+        let nrows = first.len();
+        if let Some((i, col)) = cols.iter().enumerate().find(|(_, col)| col.len() != nrows) {
+            vortex_bail!(
+                "RowEncoder: column {} has length {} but expected {}",
+                i,
+                col.len(),
+                nrows
+            );
+        }
+        Ok((options, VecExecutionArgs::new(cols.to_vec(), nrows)))
+    }
+}
+
+/// Row-encode `cols` under the per-column `fields` so that the resulting row bytes compare
+/// like tuples of the input values. Convenience wrapper over [`RowEncoder::encode`].
+pub fn convert_columns(
+    cols: &[ArrayRef],
+    fields: &[RowSortField],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListViewArray> {
+    let encoder = RowEncoder::new(fields.iter().copied());
+    encoder.encode(cols, ctx)
+}
+
+/// Like [`convert_columns`], but takes a prebuilt [`RowEncodingOptions`] (cloned per call).
+pub fn convert_columns_with_options(
+    cols: &[ArrayRef],
+    options: &RowEncodingOptions,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListViewArray> {
+    RowEncoder::with_options(options.clone()).encode(cols, ctx)
+}
+
+/// Compute only the per-row encoded sizes of `cols`; wraps [`RowEncoder::row_sizes`].
+pub fn compute_row_sizes(
+    cols: &[ArrayRef],
+    fields: &[RowSortField],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let encoder = RowEncoder::new(fields.iter().copied());
+    encoder.row_sizes(cols, ctx)
+}
+
+/// Like [`compute_row_sizes`], but takes a prebuilt [`RowEncodingOptions`] (cloned per call).
+pub fn compute_row_sizes_with_options(
+    cols: &[ArrayRef],
+    options: &RowEncodingOptions,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    RowEncoder::with_options(options.clone()).row_sizes(cols, ctx)
+}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
new file mode 100644
index 00000000000..d921e2998e3
--- /dev/null
+++ b/vortex-row/src/lib.rs
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-oriented byte encoding for Vortex arrays.
+//!
+//! This crate converts one or more columnar arrays into a single `ListView<u8>` array whose
+//! row byte slices can be compared lexicographically. The byte ordering matches tuple
+//! ordering of the input values under the requested [`RowSortField`] settings, making the
+//! representation useful for sort keys and other row-key operations.
+//!
+//! The public entry points are:
+//! - [`RowEncoder`], the primary API for encoding columns into row bytes.
+//! - [`RowEncoder::row_sizes`], which computes the fixed and variable byte contributions
+//!   without materializing the encoded rows.
+//! - [`convert_columns`] and [`compute_row_sizes`], compatibility helpers around
+//!   [`RowEncoder`].
+//! - [`initialize`], which registers the [`RowSize`] and [`RowEncode`] scalar functions on a
+//!   [`VortexSession`].
+//!
+//! Internally, encoding is split into two scalar functions. [`RowSize`] performs the sizing
+//! pass and classifies fixed-width versus variable-width input columns. [`RowEncode`] uses
+//! those sizes to allocate one contiguous elements buffer, then writes each column's bytes
+//! into the per-row slots from left to right.
+//!
+//! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to
+//! 128 bits, UTF-8 and binary values, structs, fixed-size lists, and extensions whose storage
+//! type is supported. Variant, union, and variable-size list arrays are rejected because this
+//! crate does not define an ordering for them.
+
+mod codec;
+mod encode;
+mod encoder;
+mod options;
+mod size;
+
+#[cfg(test)]
+mod tests;
+
+pub use encode::RowEncode;
+pub use encoder::RowEncoder;
+pub use encoder::compute_row_sizes;
+pub use encoder::compute_row_sizes_with_options;
+pub use encoder::convert_columns;
+pub use encoder::convert_columns_with_options;
+pub use options::RowEncodingOptions;
+pub use options::RowSortField;
+pub use size::RowSize;
+use vortex_array::scalar_fn::session::ScalarFnSessionExt;
+use vortex_session::VortexSession;
+
+/// Make row encoding available through the expression layer of `session`.
+///
+/// Registers [`RowSize`] first and then [`RowEncode`] with the session's scalar-function
+/// registry. Call this while constructing the session. Code that goes through the direct
+/// [`RowEncoder`] API does not need it, because that API constructs the scalar-function
+/// calls itself.
+pub fn initialize(session: &VortexSession) {
+    ScalarFnSessionExt::scalar_fns(session).register(RowSize);
+    ScalarFnSessionExt::scalar_fns(session).register(RowEncode);
+}
diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs
new file mode 100644
index 00000000000..380c9a3827f
--- /dev/null
+++ b/vortex-row/src/options.rs
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Display;
+use std::fmt::Formatter;
+
+use smallvec::SmallVec;
+
+/// Per-column ordering options for row-oriented encoding.
+///
+/// A `RowSortField` describes how one input column contributes to a row key. Descending order
+/// reverses the encoded value bytes for that column. Null placement is controlled separately,
+/// so nulls keep the requested position relative to non-null values in either direction.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct RowSortField {
+    /// If true, this column sorts in descending order. Defaults to `false` (ascending).
+    pub descending: bool,
+    /// If true, nulls sort before non-null values; if false, after. Defaults to `true`.
+    pub nulls_first: bool,
+}
+
+impl Default for RowSortField {
+    fn default() -> Self {
+        Self::new(false, true) // ascending, nulls first; same value as `Self::ascending()`
+    }
+}
+
+impl Display for RowSortField {
+    /// Renders both flags as `descending=<bool>, nulls_first=<bool>`.
+    ///
+    /// [`RowEncodingOptions`]'s `Display` impl embeds this text once per field.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let (descending, nulls_first) = (self.descending, self.nulls_first);
+        write!(f, "descending={}, nulls_first={}", descending, nulls_first)
+    }
+}
+
+impl RowSortField {
+    /// Build a field from explicit `descending` and `nulls_first` flags.
+    pub const fn new(descending: bool, nulls_first: bool) -> Self {
+        Self {
+            descending,
+            nulls_first,
+        }
+    }
+
+    /// Ascending order with nulls first. This is also the [`Default`] value.
+    pub const fn ascending() -> Self {
+        Self {
+            descending: false,
+            nulls_first: true,
+        }
+    }
+
+    /// Descending order with nulls first.
+    pub const fn descending() -> Self {
+        Self {
+            descending: true,
+            nulls_first: true,
+        }
+    }
+
+    /// Same direction as `self`, with nulls ordered before non-null values.
+    pub const fn nulls_first(self) -> Self {
+        Self::new(self.descending, true)
+    }
+
+    /// Same direction as `self`, with nulls ordered after non-null values.
+    pub const fn nulls_last(self) -> Self {
+        Self::new(self.descending, false)
+    }
+
+    /// Sentinel byte to write for a non-null value; always `0x01`.
+    #[inline]
+    pub(crate) fn non_null_sentinel(&self) -> u8 {
+        0x01
+    }
+
+    /// Sentinel byte for a null: `0x00` (below `0x01`) when nulls sort first, else `0x02`.
+    #[inline]
+    pub(crate) fn null_sentinel(&self) -> u8 {
+        match self.nulls_first {
+            true => 0x00,
+            false => 0x02,
+        }
+    }
+}
+
+const FIELDS_INLINE: usize = 4; // inline capacity of `RowEncodingOptions::fields`
+
+/// Ordering options for row-oriented encoding.
+///
+/// Holds one [`RowSortField`] per input column, in the order the columns are passed to
+/// [`convert_columns`](crate::convert_columns), [`RowSize`](crate::RowSize),
+/// [`compute_row_sizes`](crate::compute_row_sizes), or [`RowEncode`](crate::RowEncode).
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct RowEncodingOptions {
+    /// Sort field for input column `i` at index `i`; up to `FIELDS_INLINE` are stored inline.
+    pub(crate) fields: SmallVec<[RowSortField; FIELDS_INLINE]>,
+}
+
+impl RowEncodingOptions {
+    /// Construct options from any iterator of [`RowSortField`]s, one per input column and
+    /// in column order.
+    pub fn new(fields: impl IntoIterator<Item = RowSortField>) -> Self {
+        let fields = fields.into_iter().collect();
+        Self { fields }
+    }
+
+    /// Construct options holding `column_count` copies of [`RowSortField::default`].
+    pub fn default_for_columns(column_count: usize) -> Self {
+        Self::new((0..column_count).map(|_| RowSortField::default()))
+    }
+
+    /// The per-column sort fields, as a slice.
+    pub fn fields(&self) -> &[RowSortField] {
+        self.fields.as_slice()
+    }
+
+    /// Number of input columns described by these options.
+    pub fn len(&self) -> usize {
+        self.fields().len()
+    }
+
+    /// True when the options do not describe any input columns.
+    pub fn is_empty(&self) -> bool {
+        self.fields().is_empty()
+    }
+}
+
+impl FromIterator<RowSortField> for RowEncodingOptions {
+    fn from_iter<I: IntoIterator<Item = RowSortField>>(fields: I) -> Self {
+        Self::new(fields) // collection is delegated to the inherent constructor
+    }
+}
+
+impl Display for RowEncodingOptions {
+    /// Formats as a bracketed list of each field's `Display` text, separated by `", "`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_str("[")?;
+        let mut separator = "";
+        for field in self.fields() {
+            write!(f, "{}{}", separator, field)?;
+            separator = ", ";
+        }
+        f.write_str("]")
+    }
+}
+
+/// Serialize a [`RowEncodingOptions`] to a compact byte vector: a 4-byte little-endian field
+/// count followed by two bytes per field (`descending`, then `nulls_first`, each `0` or `1`).
+pub(crate) fn serialize_row_encoding_options(opts: &RowEncodingOptions) -> Vec<u8> {
+    use vortex_error::VortexExpect;
+    let fields = opts.fields();
+    // The count prefix is 32 bits wide, hence the checked narrowing of the field count.
+    let count =
+        u32::try_from(fields.len()).vortex_expect("RowEncodingOptions length must fit in u32");
+    let mut bytes = Vec::with_capacity(4 + 2 * fields.len());
+    bytes.extend_from_slice(&count.to_le_bytes());
+    let flags = |f: &RowSortField| [u8::from(f.descending), u8::from(f.nulls_first)];
+    bytes.extend(fields.iter().flat_map(flags));
+    bytes
+}
+
+/// Deserialize a [`RowEncodingOptions`] produced by [`serialize_row_encoding_options`].
+/// Errors unless `bytes` is a 4-byte little-endian field count followed by exactly two flag
+/// bytes per field; any non-zero flag byte decodes as `true`.
+pub(crate) fn deserialize_row_encoding_options(
+    bytes: &[u8],
+) -> vortex_error::VortexResult<RowEncodingOptions> {
+    if bytes.len() < 4 {
+        vortex_error::vortex_bail!(
+            "RowEncodingOptions metadata must contain a 4-byte length prefix"
+        );
+    }
+    let n = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
+    // Widen before multiplying: `4 + 2 * n` can exceed `usize::MAX` on 32-bit targets, which
+    // would let a hostile count wrap around and slip past the length check below.
+    let expected = 4 + 2 * u64::from(n);
+    if bytes.len() as u64 != expected {
+        vortex_error::vortex_bail!(
+            "RowEncodingOptions metadata wrong size: got {}, expected {}",
+            bytes.len(),
+            expected
+        );
+    }
+    // The length check guarantees the payload is exactly `n` (descending, nulls_first) pairs.
+    let fields = bytes[4..]
+        .chunks_exact(2)
+        .map(|pair| RowSortField::new(pair[0] != 0, pair[1] != 0))
+        .collect();
+    Ok(RowEncodingOptions { fields })
+}
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
new file mode 100644
index 00000000000..26269081ce7
--- /dev/null
+++ b/vortex-row/src/size.rs
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::FieldName;
+use vortex_array::dtype::FieldNames;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::StructFields;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::codec::RowWidth;
+use crate::options::RowEncodingOptions;
+use crate::options::deserialize_row_encoding_options;
+use crate::options::serialize_row_encoding_options;
+
+/// Result of the size pass: enough information for both [`RowSize::execute`] and the
+/// downstream [`RowEncode`](super::encode::RowEncode) pipeline, which reuses `columns` so
+/// that every input is canonicalized exactly once across the size and encode passes.
+pub(crate) struct SizePassResult {
+    /// Sum of the constant widths of all fixed-width input columns.
+    pub fixed_per_row: u32,
+    /// Per-row byte totals of the variable-width columns; `None` when there are none.
+    pub var_lengths: Option<Vec<u32>>,
+    /// Canonical form of every input column, in input order.
+    pub columns: Vec<Canonical>,
+}
+
+/// Walk N input columns once, classifying each as fixed-width or variable-length and
+/// accumulating per-row size contributions.
+///
+/// Fixed-width columns contribute a single scalar increment to `fixed_per_row`; they do
+/// not touch `var_lengths`. Variable-length columns add per-row contributions into the
+/// lazily-allocated `var_lengths` vec via [`codec::field_size`].
+///
+/// Shared by [`RowSize::execute`] and the [`RowEncode`](super::encode::RowEncode) pipeline.
+/// Errors on zero inputs, on an options-count or column-length mismatch, when the fixed
+/// widths overflow `u32`, and when a codec or canonicalization call fails.
+pub(crate) fn compute_sizes(
+    options: &RowEncodingOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<SizePassResult> {
+    let n_inputs = args.num_inputs();
+    if n_inputs == 0 {
+        vortex_bail!("at least one input column is required");
+    }
+    if options.len() != n_inputs {
+        vortex_bail!(
+            "options len ({}) does not match num_inputs ({})",
+            options.len(),
+            n_inputs
+        );
+    }
+    let nrows = args.row_count();
+
+    let mut columns: Vec<Canonical> = Vec::with_capacity(n_inputs);
+    let mut fixed_per_row: u32 = 0;
+    let mut var_lengths: Option<Vec<u32>> = None;
+
+    for i in 0..n_inputs {
+        let col = args.get(i)?;
+        if col.len() != nrows {
+            vortex_bail!(
+                "column {} has length {} but expected {}",
+                i,
+                col.len(),
+                nrows
+            );
+        }
+        let width = codec::row_width_for_dtype(col.dtype())?;
+        // Canonicalize once and reuse for both sizing (variable columns) and encoding.
+        let canonical = col.execute::<Canonical>(ctx)?;
+        match width {
+            RowWidth::Fixed(w) => {
+                fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(|| {
+                    vortex_error::vortex_err!("per-row fixed width overflows u32 at column {}", i)
+                })?;
+            }
+            RowWidth::Variable => {
+                let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]);
+                codec::field_size(&canonical, options.fields[i], v, ctx)?;
+            }
+        }
+        columns.push(canonical);
+    }
+
+    Ok(SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        columns,
+    })
+}
+
+/// Variadic scalar function (id `vortex.row_size`) that, given N input columns and
+/// per-column [`RowSortField`](crate::RowSortField)s, returns a non-nullable
+/// `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the
+/// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode).
+///
+/// The `fixed` field is always a [`ConstantArray`] holding the sum of the per-column
+/// constant widths of fixed-width inputs (sentinel + value bytes). The `var` field is a
+/// `ConstantArray(0)` when there are no variable-length input columns, and a
+/// [`PrimitiveArray<u32>`] of per-row varlen-byte sums otherwise.
+///
+/// The total per-row byte size is `fixed + var`.
+///
+/// This scalar function is public for session registration and encoding extension work.
+/// Most callers should use [`RowEncoder::row_sizes`](crate::RowEncoder::row_sizes) rather
+/// than invoking the scalar function directly.
+#[derive(Clone, Debug)]
+pub struct RowSize;
+
+/// Field names of the [`RowSize`] output struct: `fixed` followed by `var`.
+pub(crate) fn row_size_field_names() -> FieldNames {
+    FieldNames::from(["fixed", "var"].map(FieldName::from))
+}
+
+/// Returns the output [`DType`] of [`RowSize`].
+///
+/// The result is a non-nullable struct whose members are named by
+/// [`row_size_field_names`] and are both non-nullable `u32`.
+pub(crate) fn row_size_struct_dtype() -> DType {
+    // Both members share one dtype, so build it once and clone it for the first slot.
+    let member = DType::Primitive(PType::U32, Nullability::NonNullable);
+    let members = vec![member.clone(), member];
+
+    // Member order must line up with the names: `fixed` first, then `var`.
+    let fields = StructFields::new(row_size_field_names(), members);
+    DType::Struct(fields, Nullability::NonNullable)
+}
+
+impl ScalarFnVTable for RowSize {
+    type Options = RowEncodingOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_size")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encoding_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encoding_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    /// Children are named by position: `col_0`, `col_1`, and so on.
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{}", child_idx)))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        Ok(row_size_struct_dtype())
+    }
+
+    /// Runs the shared size pass and packages its result as `Struct { fixed, var }`.
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        let nrows = args.row_count();
+        let sizes = compute_sizes(options, args, ctx)?;
+        let fixed = ConstantArray::new(Scalar::from(sizes.fixed_per_row), nrows).into_array();
+        let var = if let Some(lengths) = sizes.var_lengths {
+            let buffer = Buffer::<u32>::copy_from(&lengths);
+            PrimitiveArray::new(buffer, Validity::NonNullable).into_array()
+        } else {
+            ConstantArray::new(Scalar::from(0u32), nrows).into_array()
+        };
+        let fields = vec![fixed, var];
+        let output =
+            StructArray::try_new(row_size_field_names(), fields, nrows, Validity::NonNullable)?;
+        Ok(output.into_array())
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        // NOTE(review): `execute` can fail via `compute_sizes`; confirm `false` is intended.
+        false
+    }
+}
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
new file mode 100644
index 00000000000..62e0e4cfb98
--- /dev/null
+++ b/vortex-row/src/tests.rs
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Tests for the row encoder.
+
+use std::f64::consts::PI;
+
+use rstest::rstest;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_error::VortexResult;
+
+use crate::RowEncoder;
+use crate::RowEncodingOptions;
+use crate::RowSortField;
+use crate::compute_row_sizes_with_options;
+use crate::convert_columns;
+use crate::convert_columns_with_options;
+
+/// Materialize every encoded row of `array` as an owned byte vector, in row order.
+fn collect_row_bytes(array: &ListViewArray) -> Vec<Vec<u8>> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let mut rows = Vec::with_capacity(array.len());
+    for row in 0..array.len() {
+        let elements = array.list_elements_at(row).unwrap();
+        let bytes = elements.execute::<PrimitiveArray>(&mut ctx).unwrap();
+        rows.push(bytes.as_slice::<u8>().to_vec());
+    }
+    rows
+}
+
+/// Row-encode `values` as a single `i64` column (nulls first, direction per `descending`) and
+/// check that byte-sorting the encoded rows yields the same order as sorting the values.
+fn assert_sort_order_i64(values: Vec<i64>, descending: bool) -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let field = RowSortField::new(descending, true);
+    let encoded = convert_columns(&[col], &[field], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Expected: the encoded rows visited in the requested logical order of `values`.
+    let mut order: Vec<usize> = (0..values.len()).collect();
+    order.sort_by(|&a, &b| {
+        let natural = values[a].cmp(&values[b]);
+        if descending { natural.reverse() } else { natural }
+    });
+    let expected_order: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+
+    // Actual: plain lexicographic byte sort of the encoded rows.
+    let mut sorted = rows;
+    sorted.sort();
+    assert_eq!(
+        sorted, expected_order,
+        "Row-encoded bytes do not match natural sort order"
+    );
+    Ok(())
+}
+
+#[rstest]
+#[case::ascending(false)]
+#[case::descending(true)]
+fn primitive_i64_roundtrip(#[case] descending: bool) -> VortexResult<()> {
+    // The sample covers negative values, zero, and both `i64` extremes.
+    assert_sort_order_i64(vec![-5, 0, 5, i64::MIN, i64::MAX, 7, -7, 1], descending)
+}
+
+#[test]
+fn primitive_u32_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<u32> = vec![0, 1, 100, u32::MAX, 42, 17];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut by_value: Vec<usize> = (0..values.len()).collect();
+    by_value.sort_by_key(|&i| values[i]);
+    let expected: Vec<Vec<u8>> = by_value.iter().map(|&i| rows[i].clone()).collect();
+
+    let mut by_bytes = rows;
+    by_bytes.sort();
+    assert_eq!(by_bytes, expected);
+    Ok(())
+}
+
+#[test]
+fn primitive_f64_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // The byte encoding follows IEEE total-ordering semantics, where -0.0 < +0.0 (matching
+    // `arrow-row`). The baseline below uses `partial_cmp`, which treats -0.0 and 0.0 as
+    // equal, so -0.0 is deliberately left out of the sample.
+    let values: Vec<f64> = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, PI];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut by_value: Vec<usize> = (0..values.len()).collect();
+    by_value.sort_by(|&a, &b| values[a].partial_cmp(&values[b]).unwrap());
+    let expected: Vec<Vec<u8>> = by_value.iter().map(|&i| rows[i].clone()).collect();
+
+    let mut by_bytes = rows;
+    by_bytes.sort();
+    assert_eq!(by_bytes, expected);
+    Ok(())
+}
+
+#[test]
+fn bool_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = BoolArray::from_iter([true, false, true, false]).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // Both `false` rows (inputs 1 and 3) must precede both `true` rows (inputs 0 and 2).
+    let expected_inputs = [1usize, 3, 0, 2];
+    for (position, &input) in expected_inputs.iter().enumerate() {
+        assert_eq!(sorted[position], rows[input]);
+    }
+    Ok(())
+}
+
+#[test]
+fn utf8_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values = vec![
+        "banana",
+        "apple",
+        "",
+        "cherry",
+        "ban",
+        "banana_loaf_for_test",
+    ];
+    let col = VarBinViewArray::from_iter_str(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut by_value: Vec<usize> = (0..values.len()).collect();
+    by_value.sort_by_key(|&i| values[i]);
+    let expected: Vec<Vec<u8>> = by_value.iter().map(|&i| rows[i].clone()).collect();
+
+    let mut by_bytes = rows;
+    by_bytes.sort();
+    assert_eq!(by_bytes, expected);
+    Ok(())
+}
+
+#[test]
+fn multi_column_sort() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec<i32> = vec![1, 2, 1, 2, 1, 3];
+    let strs = vec!["b", "a", "a", "b", "c", "z"];
+    let col0 = PrimitiveArray::from_iter(ints.clone()).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array();
+    let fields = [RowSortField::default(); 2];
+    let encoded = convert_columns(&[col0, col1], &fields, &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Expected order is the tuple order `(int, str)`: the ints decide first and the
+    // strings only break ties between equal ints.
+    let mut by_tuple: Vec<usize> = (0..ints.len()).collect();
+    by_tuple.sort_by_key(|&i| (ints[i], strs[i]));
+    let expected: Vec<Vec<u8>> = by_tuple.iter().map(|&i| rows[i].clone()).collect();
+
+    let mut by_bytes = rows;
+    by_bytes.sort();
+    assert_eq!(by_bytes, expected);
+    Ok(())
+}
+
+/// Null placement check: after a byte sort, the rows carrying the null sentinel must sit at
+/// the front for `nulls_first = true` (sentinel 0x00) and at the back for
+/// `nulls_first = false` (sentinel 0x02).
+#[test]
+fn nulls_first_and_last() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<Option<i32>> = vec![Some(5), None, Some(1), None, Some(3)];
+    let col = PrimitiveArray::from_option_iter(values.clone()).into_array();
+    let null_count = values.iter().filter(|v| v.is_none()).count();
+
+    // nulls_first=true
+    let encoded = convert_columns(
+        std::slice::from_ref(&col),
+        &[RowSortField::ascending()],
+        &mut ctx,
+    )?;
+    let mut sorted = collect_row_bytes(&encoded);
+    sorted.sort();
+    for row in sorted.iter().take(null_count) {
+        // a null encoded row begins with 0x00
+        assert_eq!(row[0], 0x00);
+    }
+
+    // nulls_first=false
+    let nulls_last = RowSortField::ascending().nulls_last();
+    let encoded = convert_columns(&[col], &[nulls_last], &mut ctx)?;
+    let mut sorted = collect_row_bytes(&encoded);
+    sorted.sort();
+    for row in sorted.iter().rev().take(null_count) {
+        assert_eq!(row[0], 0x02);
+    }
+    Ok(())
+}
+
+/// Exercises the reusable pieces: the [`RowEncodingOptions`] accessors, a [`RowEncoder`]
+/// built from prebuilt options and used for both passes, and the `*_with_options` free
+/// functions.
+#[test]
+fn reusable_options_helpers() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let options = RowEncodingOptions::new([RowSortField::descending().nulls_last()]);
+    assert_eq!(options.len(), 1);
+    assert!(!options.is_empty());
+    assert_eq!(
+        options.fields(),
+        &[RowSortField {
+            descending: true,
+            nulls_first: false
+        }]
+    );
+
+    let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
+    let cols = std::slice::from_ref(&col);
+    let encoder = RowEncoder::with_options(options.clone());
+    assert_eq!(encoder.options(), Some(&options));
+
+    // One encoder instance serves both the encode pass and the size pass.
+    assert_eq!(encoder.encode(cols, &mut ctx)?.len(), 3);
+    assert_eq!(encoder.row_sizes(cols, &mut ctx)?.len(), 3);
+
+    // The free-function helpers accept the same prebuilt options.
+    assert_eq!(convert_columns_with_options(cols, &options, &mut ctx)?.len(), 3);
+    assert_eq!(compute_row_sizes_with_options(cols, &options, &mut ctx)?.len(), 3);
+    Ok(())
+}
+
+#[test]
+fn row_encoder_new_accepts_sort_fields() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
+    // `RowEncoder::new` is handed the sort fields directly, without prebuilt options.
+    let encoder = RowEncoder::new([RowSortField::ascending()]);
+    let encoded = encoder.encode(&[col], &mut ctx)?;
+    assert_eq!(encoded.len(), 3);
+    Ok(())
+}
+
+#[test]
+fn default_row_encoder_uses_default_fields() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // No sort fields are supplied for either column; the default encoder must still encode.
+    let cols = [[1i32, 2, 3], [4, 5, 6]].map(|v| PrimitiveArray::from_iter(v).into_array());
+    let encoder = RowEncoder::default();
+    let encoded = encoder.encode(&cols, &mut ctx)?;
+    assert_eq!(encoded.len(), 3);
+    Ok(())
+}
+
+#[test]
+fn struct_sort_order() -> VortexResult<()> {
+    use vortex_array::arrays::StructArray;
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ids: Vec<i64> = vec![3, 1, 3, 1, 2];
+    let names = vec!["b", "a", "a", "b", "z"];
+    let id_arr = PrimitiveArray::from_iter(ids.clone()).into_array();
+    let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array();
+    let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])?.into_array();
+
+    let encoded = convert_columns(&[struct_arr], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    // A struct column must order like the tuple of its fields: `id` first, then `name`.
+    let mut by_tuple: Vec<usize> = (0..ids.len()).collect();
+    by_tuple.sort_by_key(|&i| (ids[i], names[i]));
+    let expected: Vec<Vec<u8>> = by_tuple.iter().map(|&i| rows[i].clone()).collect();
+    let mut by_bytes = rows;
+    by_bytes.sort();
+    assert_eq!(by_bytes, expected);
+    Ok(())
+}
+
+#[test]
+fn row_size_struct_shape() -> VortexResult<()> {
+    use vortex_array::arrays::Constant;
+    use vortex_array::arrays::StructArray;
+    use vortex_array::arrays::struct_::StructArrayExt;
+
+    use crate::compute_row_sizes;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec<i32> = vec![1, 2, 3, 4, 5];
+    let strs = vec!["a", "bb", "ccc", "", "eeeee"];
+    let col0 = PrimitiveArray::from_iter(ints).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strs).into_array();
+
+    let sizes = compute_row_sizes(
+        &[col0, col1],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+    // The result must execute to a struct with exactly two fields: `fixed`, then `var`.
+    let struct_arr = sizes.execute::<StructArray>(&mut ctx)?;
+    assert_eq!(struct_arr.struct_fields().nfields(), 2);
+    let fixed = struct_arr.unmasked_field(0);
+    let var = struct_arr.unmasked_field(1);
+
+    // `fixed` must be a ConstantArray whose value is the encoded i32 width: 1 + 4 = 5.
+    let fixed_const = fixed
+        .as_opt::<Constant>()
+        .expect("fixed field should be a ConstantArray");
+    assert_eq!(
+        fixed_const.scalar(),
+        &vortex_array::scalar::Scalar::from(5u32),
+        "fixed scalar should be encoded primitive i32 width"
+    );
+
+    // `var` carries one u32 per row here, because the string column is variable-length.
+    let var_prim = var.clone().execute::<PrimitiveArray>(&mut ctx)?;
+    let v: &[u32] = var_prim.as_slice();
+    assert_eq!(v.len(), 5);
+    // The empty string costs 1 byte (just its sentinel); every non-empty string in this
+    // sample costs 34 bytes: sentinel(1) + 33 bytes (single block). No nulls are present.
+    let expected: Vec<u32> = vec![34, 34, 34, 1, 34];
+    assert_eq!(v, expected.as_slice());
+    Ok(())
+}
+
+#[test]
+fn single_buffer_invariant() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // 64 rows, each combining an i64 key with a string of 18 or 19 bytes, so the elements
+    // array is assembled from many multi-byte rows of slightly different lengths.
+    let nrows = 64usize;
+    let primitives: Vec<i64> = (0..nrows as i64).collect();
+    let strings: Vec<String> = (0..nrows)
+        .map(|i| format!("row_{}_with_padding", i))
+        .collect();
+    let col0 = PrimitiveArray::from_iter(primitives).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let encoded = convert_columns(
+        &[col0, col1],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+
+    let rows = collect_row_bytes(&encoded);
+    let expected_total: usize = rows.iter().map(|r| r.len()).sum();
+
+    // All rows are expected to live back-to-back in the single elements array of the
+    // ListView, so its length must equal the sum of the per-row lengths: no gaps, no
+    // padding, no unused capacity.
+    // NOTE(review): only the total size is compared; this does not check the order or
+    // position of the individual rows inside the elements array.
+    let elements_len = encoded.elements().len();
+    assert_eq!(
+        elements_len, expected_total,
+        "elements buffer size mismatch"
+    );
+    Ok(())
+}
+
+/// Regression: with the previous 2-sentinel varlen scheme, an empty col1 followed by a
+/// non-empty col1 that happened to start with `\0` would corrupt multi-column lex order
+/// because col2's first byte aligned against col1's pad in the longer row. With the
+/// 3-sentinel scheme byte position 0 alone distinguishes empty from non-empty, so column
+/// boundaries always align.
+#[test]
+fn multi_column_varlen_empty_vs_nul_byte_string() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // col1: empty vs single 0-byte. col2: same int for all rows.
+    let col1 = VarBinViewArray::from_iter_str(["", "\0", "a", "ab"]).into_array();
+    let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1]).into_array();
+    let encoded = convert_columns(
+        &[col1, col2],
+        &[RowSortField::default(), RowSortField::default()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+
+    // col1 is already listed in its natural order ("" < "\0" < "a" < "ab") and col2 is
+    // identical in every row, so a byte sort of the encoded rows must leave the row
+    // indices in their original sequence. The sort is stable, so any tie would keep the
+    // input order as well.
+    let mut sorted_indices_by_bytes: Vec<usize> = (0..rows.len()).collect();
+    sorted_indices_by_bytes.sort_by(|&a, &b| rows[a].cmp(&rows[b]));
+
+    assert_eq!(
+        sorted_indices_by_bytes,
+        vec![0, 1, 2, 3],
+        "byte sort must match natural col1 order; sorted indices were {:?}",
+        sorted_indices_by_bytes
+    );
+    Ok(())
+}
+
+/// Regression: null col1 must sort distinct from empty col1 even when col2 follows. With
+/// the 3-sentinel scheme null=0x00, empty=0x01 differ at byte 0.
+#[test]
+fn multi_column_varlen_null_vs_empty() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col1 = VarBinViewArray::from_iter_nullable_str([
+        None::<&str>,
+        Some(""),
+        Some("a"),
+        None,
+        Some(""),
+    ])
+    .into_array();
+    let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1, 1]).into_array();
+    let encoded = convert_columns(
+        &[col1, col2],
+        &[RowSortField::ascending(), RowSortField::ascending()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Nulls first, then empties, then non-empties — and all the col2 values are identical
+    // so col1 fully determines the order.
+    // Categorise each row by the leading byte of col1's encoding.
+    let mut buckets: [Vec<usize>; 3] = [Vec::new(), Vec::new(), Vec::new()];
+    for (i, row) in rows.iter().enumerate() {
+        let bucket = match row[0] {
+            0x00 => 0, // null
+            0x01 => 1, // empty
+            0x02 => 2, // non-empty
+            other => panic!("unexpected varlen sentinel: {:#x}", other),
+        };
+        buckets[bucket].push(i);
+    }
+    assert_eq!(buckets[0].len(), 2, "two null col1 rows");
+    assert_eq!(buckets[1].len(), 2, "two empty col1 rows");
+    assert_eq!(buckets[2].len(), 1, "one non-empty col1 row");
+
+    // All null rows must be byte-equal (same col2 value, both col1 null, single sentinel).
+    let null_rows: Vec<&Vec<u8>> = buckets[0].iter().map(|&i| &rows[i]).collect();
+    assert_eq!(
+        null_rows[0], null_rows[1],
+        "null col1 rows must be byte-equal"
+    );
+    // Same for empty.
+    let empty_rows: Vec<&Vec<u8>> = buckets[1].iter().map(|&i| &rows[i]).collect();
+    assert_eq!(
+        empty_rows[0], empty_rows[1],
+        "empty col1 rows must be byte-equal"
+    );
+
+    // Byte sort must group: nulls, empties, non-empties (because leading byte differs).
+    let mut sorted = rows.clone();
+    sorted.sort();
+    assert_eq!(sorted[0][0], 0x00);
+    assert_eq!(sorted[1][0], 0x00);
+    assert_eq!(sorted[2][0], 0x01);
+    assert_eq!(sorted[3][0], 0x01);
+    assert_eq!(sorted[4][0], 0x02);
+    Ok(())
+}
+
+/// Regression: descending varlen must put non-empty before empty (natural "" < "a" inverts
+/// to "a" < "" under descending). The 3-sentinel scheme uses `!non_empty < !empty` so
+/// non-empty's first byte is smaller than empty's first byte.
+#[test]
+fn varlen_descending_empty_vs_non_empty() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = VarBinViewArray::from_iter_str(["a", "", "abc"]).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::descending()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Natural order: "" < "a" < "abc"; descending byte sort: "abc" first, "" last.
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // sorted[0] = encoded("abc"), sorted[1] = encoded("a"), sorted[2] = encoded("")
+    assert_eq!(sorted[0], rows[2], "abc first in descending");
+    assert_eq!(sorted[1], rows[0], "a second");
+    assert_eq!(sorted[2], rows[1], "empty last");
+    Ok(())
+}
+
+/// Regression: two null parent struct rows whose underlying child values differ in length
+/// must still produce byte-equal encodings, because the parent emits a canonical null
+/// body (one null sentinel per variable child) regardless of the underlying values.
+#[test]
+fn null_struct_rows_with_varying_child_lengths_are_byte_equal() -> VortexResult<()> {
+    use vortex_array::arrays::StructArray;
+    use vortex_array::dtype::FieldName;
+    use vortex_array::dtype::FieldNames;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::BitBuffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // Build a nullable struct{name: utf8} where rows 0 and 2 are null but the underlying
+    // child has different length data ("short" vs "much longer text data").
+    let names =
+        VarBinViewArray::from_iter_str(["short", "x", "much longer text data"]).into_array();
+    let field_names = FieldNames::from([FieldName::from("name")]);
+    let bits = BitBuffer::from_iter([false, true, false]);
+    let validity = Validity::from(bits);
+    let struct_arr = StructArray::try_new(field_names, vec![names], 3, validity)?.into_array();
+
+    let encoded = convert_columns(&[struct_arr], &[RowSortField::ascending()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    assert_eq!(rows.len(), 3);
+    // Both null parent rows must produce identical bytes despite the divergent children.
+    assert_eq!(
+        rows[0], rows[2],
+        "two null parent struct rows must encode to byte-equal slices"
+    );
+    // And the non-null row's leading sentinel must differ from the null sentinel.
+    assert_ne!(rows[0][0], rows[1][0], "null vs non-null sentinel differs");
+    Ok(())
+}
+
+#[test]
+fn primitive_f32_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<f32> = vec![-1.5, 0.0, 1.5, f32::INFINITY, f32::NEG_INFINITY];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+    let mut sorted_idx: Vec<usize> = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap());
+    let expected: Vec<Vec<u8>> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn primitive_f16_sort_order() -> VortexResult<()> {
+    use vortex_array::dtype::half::f16;
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<f16> = vec![
+        f16::from_f32(-1.5),
+        f16::from_f32(0.0),
+        f16::from_f32(1.5),
+        f16::INFINITY,
+        f16::NEG_INFINITY,
+    ];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+    let mut sorted_idx: Vec<usize> = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap());
+    let expected: Vec<Vec<u8>> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn reject_list_dtype_early() {
+    use vortex_array::ArrayRef;
+    use vortex_array::arrays::ListArray;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::buffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let offsets = PrimitiveArray::new(buffer![0u32, 1, 2], Validity::NonNullable).into_array();
+    let elements = PrimitiveArray::from_iter([10i32, 20]).into_array();
+    let list: ArrayRef = ListArray::try_new(elements, offsets, Validity::NonNullable)
+        .unwrap()
+        .into_array();
+    let err = convert_columns(&[list], &[RowSortField::default()], &mut ctx)
+        .expect_err("List should not be accepted");
+    assert!(
+        err.to_string().contains("List"),
+        "expected error mentioning List, got: {err}"
+    );
+}

From 083c7f3565b49ce519a9988f3e8065c237d63c05 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 15:26:39 +0000
Subject: [PATCH 02/19] ci(vortex-row): run row_encode benchmarks on CodSpeed

Add a CodSpeed shard for `vortex-row` so the `row_encode` divan benchmarks
(vortex vs arrow-row) build and run in CI alongside the other crates.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .github/workflows/codspeed.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index 50a5cbaa75a..b741aaaf41d 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -35,6 +35,7 @@ jobs:
           - { shard: 6, name: "Encodings 3", packages: "vortex-pco vortex-runend vortex-sequence" }
           - { shard: 7, name: "Encodings 4", packages: "vortex-sparse vortex-zigzag vortex-zstd" }
           - { shard: 8, name: "Storage formats", packages: "vortex-flatbuffers vortex-proto vortex-btrblocks" }
+          - { shard: 9, name: "Row encoding", packages: "vortex-row" }
     name: "Benchmark with Codspeed (Shard #${{ matrix.shard }})"
     timeout-minutes: 30
     runs-on: >-

From 37936e27c6a6d44031512883b0deb5b4ea3aabf3 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:24:26 +0000
Subject: [PATCH 03/19] vortex-row: skip ListView validation in row encoder
 output

The row encoder builds the output `(elements, offsets, sizes)` triple itself, so
the invariants `ListViewArray::try_new` validates (monotone offsets, per-row
slices within bounds and disjoint) already hold by construction. Skip the
revalidation walk via `new_unchecked`.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/encode.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index d3721e49a6e..e7a1569739a 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -186,8 +186,16 @@ fn execute_row_encode(
     let offsets_arr =
         PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array();
     let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array();
-    Ok(
-        ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)?
-            .into_array(),
-    )
+    // SAFETY: this encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself:
+    // - `elements` is a `PrimitiveArray<u8>` of length `total_len`.
+    // - `offsets_arr[i]` is `i * fixed_per_row + var_prefix[i]`, monotonically increasing and
+    //   in `0..=total_len`.
+    // - `offsets_arr[i] + sizes_arr[i] <= total_len` by construction, and each row's slice is
+    //   disjoint from every other row's.
+    // `try_new`'s validation re-walks every row to check exactly these invariants, which we
+    // already guarantee by construction, so we skip it.
+    Ok(unsafe {
+        ListViewArray::new_unchecked(elements, offsets_arr, sizes_arr, Validity::NonNullable)
+    }
+    .into_array())
 }

From 48b92d13df0770897ee70ae378623d5dc566548d Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:26:18 +0000
Subject: [PATCH 04/19] vortex-row: add validity fast-path helper for the hot
 encoders

Introduce `ValidityKind`/`resolve_validity`: resolve a column's validity once,
materializing the per-row mask only when the column may actually contain nulls.
The size pass for varbinview and the bool and primitive encoders now branch once
on validity, so the all-valid path drops the per-row `mask.value(i)` check (and
mask allocation) entirely.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/codec.rs | 142 +++++++++++++++++++++++++++++-----------
 1 file changed, 104 insertions(+), 38 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 2818db62aba..ee9fd4578c6 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -43,6 +43,7 @@ use vortex_array::dtype::DecimalType;
 use vortex_array::dtype::NativePType;
 use vortex_array::dtype::half::f16;
 use vortex_array::match_each_native_ptype;
+use vortex_array::validity::Validity;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
@@ -87,6 +88,32 @@ fn byte_width_u32(width: usize) -> u32 {
     u32::try_from(width).vortex_expect("native byte width must fit in u32")
 }
 
+/// Pre-resolved per-row validity for the row encoders.
+///
+/// Encoders pattern-match on this once before their inner loop so the no-nulls fast path
+/// avoids per-row `mask.value(i)` branches entirely, and the nullable path materializes the
+/// mask exactly once.
+pub(crate) enum ValidityKind {
+    /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask needed.
+    AllValid,
+    /// Column may have nulls; carries the materialized per-row mask.
+    Mask(vortex_mask::Mask),
+}
+
+/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when the column
+/// may actually have nulls.
+#[inline]
+pub(crate) fn resolve_validity(
+    validity: Validity,
+    len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ValidityKind> {
+    Ok(match validity {
+        Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid,
+        other => ValidityKind::Mask(other.execute_mask(len, ctx)?),
+    })
+}
+
 /// Returns the sentinel byte for a null varlen value.
 ///
 /// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and
@@ -306,19 +333,34 @@ fn add_size_varbinview(
     sizes: &mut [u32],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let views = arr.views();
-    for (i, view) in views.iter().enumerate() {
-        let contribution = if !mask.value(i) {
-            VARLEN_NULL_SIZE
-        } else if view.is_empty() {
-            VARLEN_EMPTY_SIZE
-        } else {
-            encoded_size_for_non_empty_varlen(view.len() as usize)
-        };
-        sizes[i] = sizes[i]
-            .checked_add(contribution)
-            .vortex_expect("per-row size overflow");
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, view) in views.iter().enumerate() {
+                let contribution = if view.is_empty() {
+                    VARLEN_EMPTY_SIZE
+                } else {
+                    encoded_size_for_non_empty_varlen(view.len() as usize)
+                };
+                sizes[i] = sizes[i]
+                    .checked_add(contribution)
+                    .vortex_expect("per-row size overflow");
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            for (i, view) in views.iter().enumerate() {
+                let contribution = if !mask.value(i) {
+                    VARLEN_NULL_SIZE
+                } else if view.is_empty() {
+                    VARLEN_EMPTY_SIZE
+                } else {
+                    encoded_size_for_non_empty_varlen(view.len() as usize)
+                };
+                sizes[i] = sizes[i]
+                    .checked_add(contribution)
+                    .vortex_expect("per-row size overflow");
+            }
+        }
     }
     Ok(())
 }
@@ -443,23 +485,35 @@ fn encode_bool(
     out: &mut [u8],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let bits = arr.clone().into_bit_buffer();
     let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
     let xor = if field.descending { 0xFF } else { 0x00 };
-    for i in 0..bits.len() {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        if mask.value(i) {
-            out[pos] = non_null;
-            // false=0x01, true=0x02 so false < true; XOR for descending
-            let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
-            out[pos + 1] = raw ^ xor;
-        } else {
-            out[pos] = null;
-            out[pos + 1] = 0;
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for i in 0..bits.len() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                // false=0x01, true=0x02 so false < true; XOR for descending
+                let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
+                out[pos + 1] = raw ^ xor;
+                col_offset[i] += BOOL_ENCODED_SIZE;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            for i in 0..bits.len() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if mask.value(i) {
+                    out[pos] = non_null;
+                    let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
+                    out[pos + 1] = raw ^ xor;
+                } else {
+                    out[pos] = null;
+                    out[pos + 1] = 0;
+                }
+                col_offset[i] += BOOL_ENCODED_SIZE;
+            }
         }
-        col_offset[i] += BOOL_ENCODED_SIZE;
     }
     Ok(())
 }
@@ -486,24 +540,36 @@ fn encode_primitive_typed<T: NativePType + RowEncode>(
     out: &mut [u8],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let slice: &[T] = arr.as_slice();
     let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
     let value_bytes = size_of::<T>();
-    for (i, &v) in slice.iter().enumerate() {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        if mask.value(i) {
-            out[pos] = non_null;
-            v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
-        } else {
-            out[pos] = null;
-            // Zero-fill the value bytes.
-            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
-                *b = 0;
+    let stride = encoded_size_for_fixed(byte_width_u32(value_bytes));
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+                col_offset[i] += stride;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if mask.value(i) {
+                    out[pos] = non_null;
+                    v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+                } else {
+                    out[pos] = null;
+                    // Zero-fill the value bytes.
+                    for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                        *b = 0;
+                    }
+                }
+                col_offset[i] += stride;
             }
         }
-        col_offset[i] += encoded_size_for_fixed(byte_width_u32(value_bytes));
     }
     Ok(())
 }

From 578495d9c6b72ee4936c7c9e94fd305e7bee41e4 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:28:18 +0000
Subject: [PATCH 05/19] vortex-row: skip zero-init of the output buffer

Every byte of the output range is written by some encoder: fixed-width null rows
write sentinel + explicit zero-fill, varlen encoders zero-pad their final partial
block, and struct/FSL null parent bodies are overwritten with the canonical null
encoding. The pre-zero-init memset is therefore redundant, so replace it with
`set_len`, saving a `total_len`-byte memset per call.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/encode.rs | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index e7a1569739a..e6c9f5e2443 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -131,10 +131,19 @@ fn execute_row_encode(
     let total_len =
         usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize");
 
-    // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder
-    // assume previously untouched bytes are zero, simplifying the null-row fill paths.
     let mut out_buf: BufferMut<u8> = BufferMut::with_capacity(total_len);
-    out_buf.push_n(0u8, total_len);
+    // Every encoder writes every byte in its row range: fixed-width values write
+    // sentinel + value (null rows write sentinel + explicit zero-fill); varlen blocks
+    // zero-pad their final partial block; struct/FSL fixed children are written for all
+    // rows then null parent rows are overwritten with the canonical null body. So the
+    // size-pass + encoder contract guarantees `[0, total_len)` is fully written before
+    // the buffer is read out, making the pre-zero-init redundant. Skipping it saves a
+    // `total_len`-byte memset per call (significant for varlen-heavy inputs, where
+    // `total_len` reaches multiple MB).
+    //
+    // SAFETY: `total_len` bytes of capacity were just reserved, and by the contract above
+    // every byte in that range is written before `out_buf` is frozen and read.
+    unsafe { out_buf.set_len(total_len) };
 
     // ===== Phase 3: per-row offsets =====
     // listview_offsets[i] is the absolute byte offset where row `i` begins.

From 6e401b97ba7bb7806ac852665f03ce2804ba7dc0 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:29:48 +0000
Subject: [PATCH 06/19] vortex-row: auto-vectorize pure-fixed offsets
 construction

Materialize the listview offsets buffer with `set_len` + a slice write instead of
per-row `push`. For the pure-fixed path, `iter_mut().enumerate()` lets LLVM
auto-vectorize `offsets[i] = i * fixed_per_row` (no per-element bounds or capacity
checks). `nrows` is validated to fit u32 at function entry, so the cast is exact.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/encode.rs | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index e6c9f5e2443..4862678d31d 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -150,22 +150,24 @@ fn execute_row_encode(
     // For pure-fixed: i * fixed_per_row.
     // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths.
     // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end.
-    let nrows_u32 =
-        u32::try_from(nrows).vortex_expect("nrows fits u32 (validated earlier in this function)");
     let mut listview_offsets: BufferMut<u32> = BufferMut::with_capacity(nrows);
+    // SAFETY: `nrows` of capacity reserved above; every index in `[0, nrows)` is written
+    // before the buffer is read out. `nrows` fits `u32` (validated at function entry), so
+    // `i as u32` is exact; the total size fits `u32` too, so `i * fixed_per_row` can't overflow.
+    unsafe { listview_offsets.set_len(nrows) };
+    let off = listview_offsets.as_mut_slice();
     match var_lengths.as_ref() {
         None => {
-            for row_idx in 0..nrows_u32 {
-                // Total bytes already fit in u32, so row_idx * fixed_per_row also does.
-                listview_offsets.push(row_idx * fixed_per_row);
+            // Pure-fixed: offsets[i] = i * fixed_per_row. `iter_mut().enumerate()` elides
+            // per-element bounds checks, so LLVM auto-vectorizes this multiply.
+            for (i, slot) in off.iter_mut().enumerate() {
+                *slot = (i as u32) * fixed_per_row;
             }
         }
         Some(v) => {
             let mut acc: u32 = 0;
-            for (row_idx, &l) in (0..nrows_u32).zip(v.iter()) {
-                // The arithmetic below cannot overflow because we already verified the
-                // total fits in u32.
-                listview_offsets.push(row_idx * fixed_per_row + acc);
+            for (i, &l) in v.iter().enumerate() {
+                off[i] = (i as u32) * fixed_per_row + acc;
                 acc += l;
             }
         }

From bd781d7f6460fd61984360d18223cc2e4f9730bb Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:30:10 +0000
Subject: [PATCH 07/19] vortex-row: auto-vectorize mixed-path offsets
 construction

Write the mixed (fixed + varlen) offsets through `iter_mut().zip` with wrapping
arithmetic, mirroring the pure-fixed path: this elides per-element bounds checks so
the `i * fixed_per_row` multiply auto-vectorizes while the varlen prefix sum stays a
cheap sequential accumulator. The total is validated to fit u32 upstream, so the
wrapping operations never actually wrap.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/encode.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 4862678d31d..a82f571af86 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -165,10 +165,14 @@ fn execute_row_encode(
             }
         }
         Some(v) => {
+            // Mixed: offsets[i] = i * fixed_per_row + var_prefix[i], where var_prefix is the
+            // exclusive cumsum of varlen lengths. `iter_mut().zip` elides per-element bounds
+            // checks; the total was validated to fit u32 upstream so the wrapping arithmetic
+            // is exact (it never actually wraps).
             let mut acc: u32 = 0;
-            for (i, &l) in v.iter().enumerate() {
-                off[i] = (i as u32) * fixed_per_row + acc;
-                acc += l;
+            for (i, (slot, &l)) in off.iter_mut().zip(v.iter()).enumerate() {
+                *slot = (i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc);
+                acc = acc.wrapping_add(l);
             }
         }
     }

From bb39136307dce1c6a362d3582f93eb1fc5f9595d Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:32:06 +0000
Subject: [PATCH 08/19] vortex-row: rewrite varlen 32-byte block encoder with
 copy_nonoverlapping

The varlen body writer was a per-byte XOR loop. Split it into an ascending fast
path (`copy_nonoverlapping` of each 32-byte block plus a single stamped continuation
byte, then a partial final block) and a descending path that XORs a u64 at a time via
`xor_copy_block` for a vectorizable inner loop. The emitted bytes are identical to the
previous implementation for every length and direction (full-block counts and final
length byte match exactly); only the write strategy changes.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/codec.rs | 99 ++++++++++++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 22 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index ee9fd4578c6..daf7c4efd48 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -926,33 +926,88 @@ fn encode_extension(
 /// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with
 /// continuation/length markers. Returns the number of bytes written. Empty values are
 /// encoded by the caller as a single sentinel byte and never reach this function.
+///
+/// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block
+/// plus one stamped continuation byte. For the descending path it reads a u64 at a time and
+/// XORs with `0xFF`, giving LLVM a vectorizable inner loop.
 fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
     debug_assert!(!bytes.is_empty());
-    let xor = if descending { 0xFFu8 } else { 0x00 };
-    let mut written = 0usize;
-    let mut remaining = bytes;
-    while remaining.len() > VARLEN_BLOCK_SIZE {
-        // Full block, continuation marker 0xFF (then XORed if descending).
-        let block = &remaining[..VARLEN_BLOCK_SIZE];
-        for (i, &b) in block.iter().enumerate() {
-            out[written + i] = b ^ xor;
+    let len = bytes.len();
+    let full_blocks = len / VARLEN_BLOCK_SIZE;
+    let partial = len % VARLEN_BLOCK_SIZE;
+    let (full_to_write, partial_block_len) = if partial == 0 {
+        // Length is an exact multiple of 32: emit (full_blocks - 1) full blocks with the
+        // 0xFF continuation marker, then a final block whose continuation byte is 32.
+        (full_blocks - 1, VARLEN_BLOCK_SIZE)
+    } else {
+        (full_blocks, partial)
+    };
+    let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
+    assert!(len != 0 && out.len() >= total, "varlen body slot too small");
+
+    // SAFETY: the assert above guarantees `bytes` is non-empty and `out` holds at least
+    // `total` bytes (callers size the slot via `encoded_size_for_non_empty_varlen`, which is
+    // `1 + total`; the extra byte is the leading sentinel the caller wrote outside `out`).
+    // `bytes` is valid for `len` reads, and every pointer advance below stays within
+    // `[0, total)` for `dst` and `[0, len)` for `src`.
+    unsafe {
+        let mut src = bytes.as_ptr();
+        let mut dst = out.as_mut_ptr();
+
+        if !descending {
+            // Ascending fast path: each full block is a 32-byte memcpy + a single 0xFF stamp.
+            for _ in 0..full_to_write {
+                std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0xFF;
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: copy the partial data, zero-pad the tail, write the length byte.
+            std::ptr::copy_nonoverlapping(src, dst, partial_block_len);
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8;
+        } else {
+            // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable
+            // inner loop; the tail handles the partial block byte-wise.
+            for _ in 0..full_to_write {
+                xor_copy_block(src, dst);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            for i in 0..partial_block_len {
+                *dst.add(i) = *src.add(i) ^ 0xFF;
+            }
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0xFF, // 0x00 XOR 0xFF
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF;
         }
-        out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ xor;
-        written += VARLEN_BLOCK_TOTAL;
-        remaining = &remaining[VARLEN_BLOCK_SIZE..];
-    }
-    // Final partial block: pad with zeros, last byte = remaining.len() (1..=32).
-    let n = remaining.len();
-    for (i, &b) in remaining.iter().enumerate() {
-        out[written + i] = b ^ xor;
     }
-    for j in n..VARLEN_BLOCK_SIZE {
-        out[written + j] = xor;
+    u32::try_from(total).vortex_expect("encoded varlen byte length must fit in u32")
+}
+
+/// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the
+/// four u64-wide iterations into SIMD on x86.
+///
+/// # Safety
+/// `src` must be valid for 32 reads, `dst` valid for 32 writes, and the regions must not
+/// overlap.
+#[inline(always)]
+unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) {
+    // Four u64 lanes of 8 bytes each = 32 bytes total.
+    for i in 0..4 {
+        let off = i * 8;
+        // SAFETY: the caller guarantees src/dst are valid for the full 32-byte block.
+        let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) };
+        unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) };
     }
-    out[written + VARLEN_BLOCK_SIZE] =
-        u8::try_from(n).vortex_expect("final varlen block length must fit in u8") ^ xor;
-    written += VARLEN_BLOCK_TOTAL;
-    u32::try_from(written).vortex_expect("encoded varlen byte length must fit in u32")
 }
 
 /// Internal trait for encoding a fixed-width native value into byte slots.

From ee049ae050375f71b12d39f346bc730b565714d1 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:34:09 +0000
Subject: [PATCH 09/19] vortex-row: walk VarBinView rows directly in the
 encoder hot loop

Replace the `with_iterator` traversal in `encode_varbinview` with a direct walk over
the view array: cache the data-buffer slices once, then for each row read the bytes
straight from the inlined view slot or the referenced buffer at `offset..offset+len`.
This drops the iterator's per-row option/bounds machinery. Validity is resolved once
via `resolve_validity`, keeping the no-nulls path branch-free on validity.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/codec.rs | 69 +++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index daf7c4efd48..5056ab3e6f6 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -26,7 +26,6 @@
 
 use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
-use vortex_array::accessor::ArrayAccessor;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::DecimalArray;
 use vortex_array::arrays::ExtensionArray;
@@ -642,35 +641,71 @@ fn encode_varbinview(
     row_offsets: &[u32],
     col_offset: &mut [u32],
     out: &mut [u8],
-    _ctx: &mut ExecutionCtx,
+    ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
     let null_byte = varlen_null_sentinel(field);
     let empty_byte = varlen_empty_sentinel(field);
     let non_empty_byte = varlen_non_empty_sentinel(field);
+    let descending = field.descending;
+
+    let views = arr.views();
+    // Cache the data-buffer slices once. Inlined views (len <= 12) carry their bytes inline,
+    // so they never touch `buffers`; referenced views index into the pre-validated buffer at
+    // `offset..offset + len`. Walking views directly avoids the per-row bounds and branch work
+    // of `with_iterator`.
+    let buffers: smallvec::SmallVec<[&[u8]; 4]> = (0..arr.data_buffers().len())
+        .map(|i| arr.buffer(i).as_slice())
+        .collect();
 
-    // `with_iterator` yields `Some(bytes)` for non-null rows and `None` for null rows,
-    // so the iterator alone fully describes validity — no separate mask lookup needed.
-    arr.with_iterator(|iter| {
-        for (i, maybe) in iter.enumerate() {
-            let pos = (row_offsets[i] + col_offset[i]) as usize;
-            match maybe {
-                None => {
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, view) in views.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                let len = view.len() as usize;
+                if len == 0 {
+                    out[pos] = empty_byte;
+                    col_offset[i] += VARLEN_EMPTY_SIZE;
+                    continue;
+                }
+                let bytes: &[u8] = if view.is_inlined() {
+                    view.as_inlined().value()
+                } else {
+                    let r = view.as_view();
+                    let off = r.offset as usize;
+                    &buffers[r.buffer_index as usize][off..off + len]
+                };
+                out[pos] = non_empty_byte;
+                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
+                col_offset[i] += 1 + written;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            for (i, view) in views.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if !mask.value(i) {
                     out[pos] = null_byte;
                     col_offset[i] += VARLEN_NULL_SIZE;
+                    continue;
                 }
-                Some([]) => {
+                let len = view.len() as usize;
+                if len == 0 {
                     out[pos] = empty_byte;
                     col_offset[i] += VARLEN_EMPTY_SIZE;
+                    continue;
                 }
-                Some(bytes) => {
-                    out[pos] = non_empty_byte;
-                    let written =
-                        encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], field.descending);
-                    col_offset[i] += 1 + written;
-                }
+                let bytes: &[u8] = if view.is_inlined() {
+                    view.as_inlined().value()
+                } else {
+                    let r = view.as_view();
+                    let off = r.offset as usize;
+                    &buffers[r.buffer_index as usize][off..off + len]
+                };
+                out[pos] = non_empty_byte;
+                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
+                col_offset[i] += 1 + written;
             }
         }
-    });
+    }
     Ok(())
 }
 

From 65a24f9ed92cadf2e8f9653d94cfe19b6bc909e2 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:37:41 +0000
Subject: [PATCH 10/19] vortex-row: satisfy cast-truncation lints in ported hot
 paths

The auto-vectorized offset loops and the varlen block writer used raw `as` casts
that trip this crate's `cast_possible_truncation` lint. Iterate a `u32` counter
instead of casting `usize` per element, and use `u8`/`u32` `try_from` for the
varlen final-block length byte and total byte count. No behavior change.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/codec.rs  |  9 ++++++---
 vortex-row/src/encode.rs | 14 ++++++++------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 5056ab3e6f6..92f3bc13a0e 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -979,6 +979,9 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
     };
     let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
     debug_assert!(out.len() >= total);
+    // The final block's continuation byte encodes its content length (1..=32).
+    let len_byte =
+        u8::try_from(partial_block_len).vortex_expect("varlen final block length (1..=32) fits u8");
 
     // SAFETY: `out` has at least `total` bytes — the caller sizes every varlen slot via
     // `encoded_size_for_non_empty_varlen` (which equals `1 + total`, the extra byte being the
@@ -1004,7 +1007,7 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
                 0,
                 VARLEN_BLOCK_SIZE - partial_block_len,
             );
-            *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8;
+            *dst.add(VARLEN_BLOCK_SIZE) = len_byte;
         } else {
             // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable
             // inner loop; the tail handles the partial block byte-wise.
@@ -1022,10 +1025,10 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
                 0xFF, // 0x00 XOR 0xFF
                 VARLEN_BLOCK_SIZE - partial_block_len,
             );
-            *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF;
+            *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF;
         }
     }
-    total as u32
+    u32::try_from(total).vortex_expect("encoded varlen byte length fits u32")
 }
 
 /// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index a82f571af86..f789382c4a1 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -158,10 +158,12 @@ fn execute_row_encode(
     let off = listview_offsets.as_mut_slice();
     match var_lengths.as_ref() {
         None => {
-            // Pure-fixed: offsets[i] = i * fixed_per_row. `iter_mut().enumerate()` elides
-            // per-element bounds checks, so LLVM auto-vectorizes this multiply.
-            for (i, slot) in off.iter_mut().enumerate() {
-                *slot = (i as u32) * fixed_per_row;
+            // Pure-fixed: offsets[i] = i * fixed_per_row. Zipping against a `u32` counter
+            // elides per-element bounds checks (and avoids a per-element `usize as u32`
+            // cast), so LLVM auto-vectorizes this multiply. `nrows` fits u32, so the counter
+            // never overflows.
+            for (slot, i) in off.iter_mut().zip(0u32..) {
+                *slot = i * fixed_per_row;
             }
         }
         Some(v) => {
@@ -170,8 +172,8 @@ fn execute_row_encode(
             // checks; the total was validated to fit u32 upstream so the wrapping arithmetic
             // is exact (it never actually wraps).
             let mut acc: u32 = 0;
-            for (i, (slot, &l)) in off.iter_mut().zip(v.iter()).enumerate() {
-                *slot = (i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc);
+            for ((slot, &l), i) in off.iter_mut().zip(v.iter()).zip(0u32..) {
+                *slot = i.wrapping_mul(fixed_per_row).wrapping_add(acc);
                 acc = acc.wrapping_add(l);
             }
         }

From 2711504490e8e73fe3004184aca20882adfd2f0d Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 18:57:45 +0000
Subject: [PATCH 11/19] vortex-row: arithmetic-write fast path for
 fixed-before-varlen columns

Classify each column in the size pass (`ColKind` + `first_varlen_idx`): a fixed-width
column with no varlen column before it has a constant within-row offset, so its write
position is pure arithmetic (`i * fixed_per_row + prefix + var_prefix[i]`) with no
per-row cursor. Route those columns through `field_encode_fixed_arithmetic`; the cursor
path is seeded to start at the first varlen column. Primitive columns in the pure-fixed
case use a `chunks_exact_mut` hot loop (matching arrow-row's not-null path); all other
fixed types reuse the cursor encoder at the computed offsets, so output is byte-identical.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/codec.rs  | 115 +++++++++++++++++++++++++++++++++++++++
 vortex-row/src/encode.rs |  93 +++++++++++++++++++++++++------
 vortex-row/src/size.rs   |  42 +++++++++++++-
 3 files changed, 229 insertions(+), 21 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 92f3bc13a0e..d0cb32ce13d 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -269,6 +269,59 @@ pub(crate) fn field_size(
     Ok(())
 }
 
+/// Encode a fixed-width column at arithmetic offsets, without reading or writing any per-row
+/// cursor.
+///
+/// For row `i`, the column's bytes are written starting at `i * row_stride + col_prefix
+/// (+ var_prefix[i])`, where `var_prefix` is the exclusive prefix sum of the varlen
+/// contributions (`None` when the row layout has no variable-length columns). This is the
+/// fast path for fixed-width columns that appear before any varlen column, so their
+/// within-row position is a constant offset rather than a running cursor.
+///
+/// For primitive columns in the pure-fixed case it uses a `chunks_exact_mut` hot loop that
+/// removes the per-row offset/cursor indirection (matching `arrow-row`'s `encode_not_null`).
+/// Everything else (including primitives when `var_prefix` is `Some`) reuses [`field_encode`]
+/// at the materialized offsets, so the bytes written are byte-identical to the cursor path.
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn field_encode_fixed_arithmetic(
+    canonical: &Canonical,
+    field: RowSortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    nrows: usize,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    if var_prefix.is_none()
+        && let Canonical::Primitive(arr) = canonical
+    {
+        return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx);
+    }
+
+    // General path: materialize this column's per-row start offsets and reuse the cursor
+    // encoder with zero-initialized cursors, so every row is written at its arithmetic
+    // offset with the exact same bytes the cursor path would produce.
+    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
+    let mut base = col_prefix;
+    match var_prefix {
+        None => {
+            for _ in 0..nrows {
+                offsets.push(base);
+                base = base.wrapping_add(row_stride);
+            }
+        }
+        Some(vp) => {
+            for &p in vp.iter().take(nrows) {
+                offsets.push(base.wrapping_add(p));
+                base = base.wrapping_add(row_stride);
+            }
+        }
+    }
+    let mut cursors = vec![0u32; nrows];
+    field_encode(canonical, field, &offsets, &mut cursors, out, ctx)
+}
+
 /// Encode each row's bytes for the given canonical view into `out`, writing starting at
 /// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of
 /// bytes written.
@@ -958,6 +1011,68 @@ fn encode_extension(
     field_encode(&storage, field, row_offsets, col_offset, out, ctx)
 }
 
+/// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a
+/// constant within-row offset, iterating the output in `row_stride`-sized chunks so the
+/// compiler can drop the per-row offset/cursor indirection.
+fn encode_primitive_arith(
+    arr: &PrimitiveArray,
+    field: RowSortField,
+    col_prefix: u32,
+    row_stride: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_arith_typed::<T>(arr, field, col_prefix, row_stride, out, ctx)?;
+    });
+    Ok(())
+}
+
+fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: RowSortField,
+    col_prefix: u32,
+    row_stride: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let slice: &[T] = arr.as_slice();
+    let non_null = field.non_null_sentinel();
+    let value_bytes = size_of::<T>();
+    let slot_size = 1 + value_bytes;
+    let stride = row_stride as usize;
+    let prefix = col_prefix as usize;
+    let descending = field.descending;
+
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            // Hot path: each row's slot is a fixed window inside its `stride`-sized chunk,
+            // so the inner write vectorizes the same way as `arrow-row`'s not-null path.
+            for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) {
+                let slot = &mut chunk[prefix..prefix + slot_size];
+                slot[0] = non_null;
+                v.encode_to(&mut slot[1..], descending);
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() {
+                let slot = &mut chunk[prefix..prefix + slot_size];
+                if mask.value(i) {
+                    slot[0] = non_null;
+                    v.encode_to(&mut slot[1..], descending);
+                } else {
+                    slot[0] = null;
+                    for b in &mut slot[1..] {
+                        *b = 0;
+                    }
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
 /// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with
 /// continuation/length markers. Returns the number of bytes written. Empty values are
 /// encoded by the caller as a single sentinel byte and never reach this function.
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index f789382c4a1..46a4be778d4 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -34,6 +34,7 @@ use crate::codec;
 use crate::options::RowEncodingOptions;
 use crate::options::deserialize_row_encoding_options;
 use crate::options::serialize_row_encoding_options;
+use crate::size::ColKind;
 use crate::size::compute_sizes;
 
 /// Variadic scalar function that encodes N input columns into a single `List<u8>`
@@ -112,6 +113,8 @@ fn execute_row_encode(
     let crate::size::SizePassResult {
         fixed_per_row,
         var_lengths,
+        col_kinds,
+        first_varlen_idx,
         columns,
     } = compute_sizes(options, args, ctx)?;
 
@@ -149,53 +152,107 @@ fn execute_row_encode(
     // listview_offsets[i] is the absolute byte offset where row `i` begins.
     // For pure-fixed: i * fixed_per_row.
     // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths.
+    //
+    // When fixed-before-varlen columns coexist with a varlen column, we additionally build
+    // `var_prefix_for_arith[i] = exclusive cumsum of var_lengths[..i]` and hand it to the
+    // arithmetic encoders so they can compute per-row write positions without a cursor.
+    let need_arith_prefix = first_varlen_idx.is_some()
+        && col_kinds.iter().any(|k| {
+            matches!(
+                k,
+                ColKind::Fixed {
+                    before_varlen: true,
+                    ..
+                }
+            )
+        });
+
     // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end.
     let mut listview_offsets: BufferMut<u32> = BufferMut::with_capacity(nrows);
     // SAFETY: `nrows` of capacity reserved above; every index in `[0, nrows)` is written
     // before the buffer is read out. `nrows` was validated to fit `u32` at function entry,
-    // so `i as u32` below is exact and the multiplications can't overflow.
+    // so the `0u32..` counters below are exact and the multiplications can't overflow.
     unsafe { listview_offsets.set_len(nrows) };
     let off = listview_offsets.as_mut_slice();
+    let mut var_prefix_for_arith: Option<Vec<u32>> = None;
     match var_lengths.as_ref() {
         None => {
             // Pure-fixed: offsets[i] = i * fixed_per_row. Zipping against a `u32` counter
-            // elides per-element bounds checks (and avoids a per-element `usize as u32`
-            // cast), so LLVM auto-vectorizes this multiply. `nrows` fits u32, so the counter
-            // never overflows.
+            // elides per-element bounds checks, so LLVM auto-vectorizes this multiply.
             for (slot, i) in off.iter_mut().zip(0u32..) {
                 *slot = i * fixed_per_row;
             }
         }
         Some(v) => {
             // Mixed: offsets[i] = i * fixed_per_row + var_prefix[i], where var_prefix is the
-            // exclusive cumsum of varlen lengths. `iter_mut().zip` elides per-element bounds
-            // checks; the total was validated to fit u32 upstream so the wrapping arithmetic
-            // is exact (it never actually wraps).
+            // exclusive cumsum of varlen lengths. The total was validated to fit u32 upstream
+            // so the wrapping arithmetic is exact (it never actually wraps).
+            let mut vp: Option<Vec<u32>> = need_arith_prefix.then(|| Vec::with_capacity(nrows));
             let mut acc: u32 = 0;
             for ((slot, &l), i) in off.iter_mut().zip(v.iter()).zip(0u32..) {
+                if let Some(p) = vp.as_mut() {
+                    p.push(acc);
+                }
                 *slot = i.wrapping_mul(fixed_per_row).wrapping_add(acc);
                 acc = acc.wrapping_add(l);
             }
+            var_prefix_for_arith = vp;
         }
     }
     let listview_offsets_slice: &[u32] = listview_offsets.as_slice();
 
     // Per-row write cursor (also doubles as the ListView `sizes` slot when done). We build
     // it as a BufferMut so we can hand it directly to the output PrimitiveArray.
+    //
+    // The cursor path begins at the first cursor-path column. Fixed-before-varlen columns
+    // are written by the arithmetic path and do not touch the cursor, so the cursor is
+    // pre-seeded with the within-row offset of the first varlen column (its `fixed_prefix`).
+    // When there are no varlen columns at all, every column takes the arithmetic path and
+    // none of them touches the cursors; seeding with `fixed_per_row` then leaves the
+    // cursors already correct as per-row sizes.
+    let initial_cursor: u32 = match first_varlen_idx {
+        Some(idx) => match col_kinds[idx] {
+            ColKind::Variable { fixed_prefix } => fixed_prefix,
+            ColKind::Fixed { .. } => unreachable!("first_varlen_idx points at a varlen column"),
+        },
+        None => fixed_per_row,
+    };
     let mut row_cursors: BufferMut<u32> = BufferMut::with_capacity(nrows);
-    row_cursors.push_n(0u32, nrows);
+    row_cursors.push_n(initial_cursor, nrows);
 
-    // ===== Phase 4: encode columns via the cursor path =====
-    // Each column was canonicalized once during the size pass; reuse that canonical form.
+    // ===== Phase 4: encode columns =====
+    // Fixed-before-varlen columns take the arithmetic-write path (constant within-row
+    // offset, no cursor mutation). Fixed-after-varlen and varlen columns take the cursor
+    // path. Each column was canonicalized once during the size pass; reuse that form.
     for (i, canonical) in columns.iter().enumerate() {
-        codec::field_encode(
-            canonical,
-            options.fields[i],
-            listview_offsets_slice,
-            row_cursors.as_mut_slice(),
-            &mut out_buf,
-            ctx,
-        )?;
+        match col_kinds[i] {
+            ColKind::Fixed {
+                prefix,
+                before_varlen: true,
+                ..
+            } => {
+                codec::field_encode_fixed_arithmetic(
+                    canonical,
+                    options.fields[i],
+                    prefix,
+                    fixed_per_row,
+                    var_prefix_for_arith.as_deref(),
+                    nrows,
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+            ColKind::Fixed { .. } | ColKind::Variable { .. } => {
+                codec::field_encode(
+                    canonical,
+                    options.fields[i],
+                    listview_offsets_slice,
+                    row_cursors.as_mut_slice(),
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+        }
     }
 
     // ===== Phase 5: build ListView output =====
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index 26269081ce7..860fe3c2a2c 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -36,6 +36,24 @@ use crate::options::RowEncodingOptions;
 use crate::options::deserialize_row_encoding_options;
 use crate::options::serialize_row_encoding_options;
 
+/// Classification of a single input column for the size pass.
+///
+/// Tracks each column's within-row byte offset (the constant prefix from all preceding
+/// fixed-width columns) and, for fixed columns, whether any variable-length column has
+/// appeared yet — the encode pass uses this to choose between the arithmetic-write fast
+/// path (no varlen before this column, so the within-row position is constant per row) and
+/// the cursor-write path.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum ColKind {
+    /// Fixed-width column. `prefix` is the summed width of all preceding fixed-width columns.
+    /// When `before_varlen` is true no variable-length column precedes this one, so `prefix`
+    /// is this column's exact within-row byte offset, constant for every row.
+    Fixed { prefix: u32, before_varlen: bool },
+    /// Column has variable per-row width. `fixed_prefix` is the sum of widths of all
+    /// preceding fixed columns; the contribution of earlier varlen columns is added per row.
+    Variable { fixed_prefix: u32 },
+}
+
 /// Result of the size pass: enough information for both [`RowSize::execute`] and the
 /// downstream [`RowEncode`](super::encode::RowEncode) pipeline.
 ///
@@ -45,6 +63,8 @@ use crate::options::serialize_row_encoding_options;
 pub(crate) struct SizePassResult {
     pub fixed_per_row: u32,
     pub var_lengths: Option<Vec<u32>>,
+    pub col_kinds: Vec<ColKind>,
+    pub first_varlen_idx: Option<usize>,
     pub columns: Vec<Canonical>,
 }
 
@@ -77,8 +97,11 @@ pub(crate) fn compute_sizes(
     let nrows = args.row_count();
 
     let mut columns: Vec<Canonical> = Vec::with_capacity(n_inputs);
+    let mut col_kinds: Vec<ColKind> = Vec::with_capacity(n_inputs);
     let mut fixed_per_row: u32 = 0;
     let mut var_lengths: Option<Vec<u32>> = None;
+    let mut first_varlen_idx: Option<usize> = None;
+    let mut running_fixed_prefix: u32 = 0;
 
     for i in 0..n_inputs {
         let col = args.get(i)?;
@@ -95,13 +118,24 @@ pub(crate) fn compute_sizes(
         let canonical = col.execute::<Canonical>(ctx)?;
         match width {
             RowWidth::Fixed(w) => {
-                fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(|| {
-                    vortex_error::vortex_err!("per-row fixed width overflows u32 at column {}", i)
-                })?;
+                col_kinds.push(ColKind::Fixed {
+                    prefix: running_fixed_prefix,
+                    before_varlen: first_varlen_idx.is_none(),
+                });
+                let overflow =
+                    || vortex_error::vortex_err!("per-row fixed width overflows u32 at column {i}");
+                fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(overflow)?;
+                running_fixed_prefix = running_fixed_prefix.checked_add(w).ok_or_else(overflow)?;
             }
             RowWidth::Variable => {
+                if first_varlen_idx.is_none() {
+                    first_varlen_idx = Some(i);
+                }
                 let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]);
                 codec::field_size(&canonical, options.fields[i], v, ctx)?;
+                col_kinds.push(ColKind::Variable {
+                    fixed_prefix: running_fixed_prefix,
+                });
             }
         }
         columns.push(canonical);
@@ -110,6 +144,8 @@ pub(crate) fn compute_sizes(
     Ok(SizePassResult {
         fixed_per_row,
         var_lengths,
+        col_kinds,
+        first_varlen_idx,
         columns,
     })
 }

From 2fc07fa1accf9541d469c757b10a960c3e1f488f Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Fri, 5 Jun 2026 08:59:19 +0000
Subject: [PATCH 12/19] ci(vortex-row): fold row_encode benchmarks into
 CodSpeed shard 8

Run the vortex-row row_encode benchmarks as part of the existing 'Storage formats'
shard rather than adding a dedicated ninth shard.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .github/workflows/codspeed.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index b741aaaf41d..81b37e510b8 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -34,8 +34,7 @@ jobs:
           - { shard: 5, name: "Encodings 2", packages: "vortex-decimal-byte-parts vortex-fastlanes vortex-fsst", features: "--features _test-harness" }
           - { shard: 6, name: "Encodings 3", packages: "vortex-pco vortex-runend vortex-sequence" }
           - { shard: 7, name: "Encodings 4", packages: "vortex-sparse vortex-zigzag vortex-zstd" }
-          - { shard: 8, name: "Storage formats", packages: "vortex-flatbuffers vortex-proto vortex-btrblocks" }
-          - { shard: 9, name: "Row encoding", packages: "vortex-row" }
+          - { shard: 8, name: "Storage formats & row encoding", packages: "vortex-flatbuffers vortex-proto vortex-btrblocks vortex-row" }
     name: "Benchmark with Codspeed (Shard #${{ matrix.shard }})"
     timeout-minutes: 30
     runs-on: >-

From b97b7e4f66c5e01868d989f066bc2d587a9c8d4d Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 21:58:52 +0000
Subject: [PATCH 13/19] vortex-row: benchmark FSST row-encoding
 (unpack-then-convert vs phases)

FSST is not order-preserving, so row keys must be the decompressed bytes; the only
strategy today is decompress to a canonical VarBinView then row-encode it. This bench
measures that path and its two phases (decompress-only, and row-encode of an
already-decompressed column) on compressible multi-block strings, to quantify the
opportunity for a future fused FSST row-encode kernel: the phases are additive
(decompress ~46%, row-encode ~54%), and the row-encode phase re-reads/re-writes the
decompressed bytes a fused kernel could emit once.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                            |   1 +
 vortex-row/Cargo.toml                 |   5 ++
 vortex-row/benches/fsst_row_encode.rs | 120 ++++++++++++++++++++++++++
 3 files changed, 126 insertions(+)
 create mode 100644 vortex-row/benches/fsst_row_encode.rs

diff --git a/Cargo.lock b/Cargo.lock
index 967f0a18a09..bf24dafe3dd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9997,6 +9997,7 @@ dependencies = [
  "vortex-array",
  "vortex-buffer",
  "vortex-error",
+ "vortex-fsst",
  "vortex-mask",
  "vortex-session",
 ]
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index 9222c7d6a43..e58a48f16e7 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -35,7 +35,12 @@ mimalloc = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["_test-harness"] }
+vortex-fsst = { workspace = true }
 
 [[bench]]
 name = "row_encode"
 harness = false
+
+[[bench]]
+name = "fsst_row_encode"
+harness = false
diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs
new file mode 100644
index 00000000000..750e23faa1e
--- /dev/null
+++ b/vortex-row/benches/fsst_row_encode.rs
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![expect(clippy::unwrap_used)]
+
+//! Row-encoding an FSST-compressed string column: the only realizable strategy is
+//! "unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it),
+//! because FSST is **not order-preserving** — its 1-byte codes are assigned by compression
+//! gain, not by value, so the compressed bytes cannot be compared lexicographically. A
+//! hypothetical "direct" kernel could only *fuse* decompression with row-key emission; it
+//! still has to expand every symbol.
+//!
+//! These benchmarks measure the full path and its two phases so the fusion opportunity is
+//! quantifiable:
+//!   * `fsst_unpack_then_convert` — decompress + row-encode (the status quo).
+//!   * `fsst_decompress_only`     — decompress alone (the irreducible floor: a direct kernel
+//!     must still produce these bytes).
+//!   * `plain_row_encode_only`    — row-encode an already-decompressed `VarBinView` (the part
+//!     a fused kernel would overlap with decompression; its writes into the intermediate
+//!     buffer + views are what fusion removes).
+
+use divan::counter::BytesCount;
+use mimalloc::MiMalloc;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use vortex_array::ArrayRef;
+use vortex_array::Canonical;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::VarBinArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
+use vortex_fsst::fsst_compress;
+use vortex_fsst::fsst_train_compressor;
+use vortex_row::RowEncoder;
+
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+
+const N: usize = 100_000;
+const AVG_LEN: usize = 64;
+const UNIQUE_CHARS: u8 = 8;
+
+/// Generate compressible, mostly multi-block (32..=96 byte) strings over a small alphabet so
+/// FSST finds a strong symbol table — the regime where a direct kernel would matter most.
+fn generate_strings() -> (VarBinArray, u64) {
+    let mut rng = StdRng::seed_from_u64(0);
+    let mut strings = Vec::with_capacity(N);
+    let mut total_bytes: u64 = 0;
+    for _ in 0..N {
+        let len = AVG_LEN * rng.random_range(50..=150) / 100;
+        total_bytes += len as u64;
+        let s = (0..len)
+            .map(|_| rng.random_range(b'a'..(b'a' + UNIQUE_CHARS)) as char)
+            .collect::<String>()
+            .into_bytes();
+        strings.push(Some(s.into_boxed_slice()));
+    }
+    let arr = VarBinArray::from_iter(strings, DType::Binary(Nullability::NonNullable));
+    (arr, total_bytes)
+}
+
+fn build_fsst() -> (ArrayRef, u64) {
+    let (arr, total_bytes) = generate_strings();
+    let compressor = fsst_train_compressor(&arr);
+    let len = arr.len();
+    let dtype = arr.dtype().clone();
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let fsst = fsst_compress(arr, len, &dtype, &compressor, &mut ctx).into_array();
+    (fsst, total_bytes)
+}
+
+fn decompress(fsst: &ArrayRef) -> ArrayRef {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    fsst.clone()
+        .execute::<Canonical>(&mut ctx)
+        .unwrap()
+        .into_array()
+}
+
+fn main() {
+    divan::main();
+}
+
+/// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it.
+#[divan::bench]
+fn fsst_unpack_then_convert(bencher: divan::Bencher) {
+    let (fsst, total_bytes) = build_fsst();
+    let encoder = RowEncoder::default();
+    bencher.counter(BytesCount::new(total_bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let decoded = fsst.clone().execute::<Canonical>(&mut ctx).unwrap().into_array();
+        encoder.encode(&[decoded], &mut ctx).unwrap()
+    });
+}
+
+/// Irreducible floor: FSST decompression alone (a direct kernel must still produce these
+/// bytes, since the sort key *is* the decompressed bytes).
+#[divan::bench]
+fn fsst_decompress_only(bencher: divan::Bencher) {
+    let (fsst, total_bytes) = build_fsst();
+    bencher
+        .counter(BytesCount::new(total_bytes))
+        .bench_local(|| decompress(&fsst));
+}
+
+/// Row-encode an already-decompressed `VarBinView`. The writes into the decompressed buffer +
+/// views that precede this step are what a fused direct kernel would eliminate.
+#[divan::bench]
+fn plain_row_encode_only(bencher: divan::Bencher) {
+    let (fsst, total_bytes) = build_fsst();
+    let decoded = decompress(&fsst);
+    let encoder = RowEncoder::default();
+    bencher.counter(BytesCount::new(total_bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        encoder.encode(std::slice::from_ref(&decoded), &mut ctx).unwrap()
+    });
+}

From d6f1f4ecb11c0c24566aa43f5aa4993e5fb17c64 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 22:04:16 +0000
Subject: [PATCH 14/19] vortex-row: rustfmt the fsst row-encode benchmark

Apply nightly rustfmt formatting to the FSST benchmark added in the previous commit.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/benches/fsst_row_encode.rs | 28 ++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs
index 750e23faa1e..58c9eed9843 100644
--- a/vortex-row/benches/fsst_row_encode.rs
+++ b/vortex-row/benches/fsst_row_encode.rs
@@ -89,11 +89,17 @@ fn main() {
 fn fsst_unpack_then_convert(bencher: divan::Bencher) {
     let (fsst, total_bytes) = build_fsst();
     let encoder = RowEncoder::default();
-    bencher.counter(BytesCount::new(total_bytes)).bench_local(|| {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
-        let decoded = fsst.clone().execute::<Canonical>(&mut ctx).unwrap().into_array();
-        encoder.encode(&[decoded], &mut ctx).unwrap()
-    });
+    bencher
+        .counter(BytesCount::new(total_bytes))
+        .bench_local(|| {
+            let mut ctx = LEGACY_SESSION.create_execution_ctx();
+            let decoded = fsst
+                .clone()
+                .execute::<Canonical>(&mut ctx)
+                .unwrap()
+                .into_array();
+            encoder.encode(&[decoded], &mut ctx).unwrap()
+        });
 }
 
 /// Irreducible floor: FSST decompression alone (a direct kernel must still produce these
@@ -113,8 +119,12 @@ fn plain_row_encode_only(bencher: divan::Bencher) {
     let (fsst, total_bytes) = build_fsst();
     let decoded = decompress(&fsst);
     let encoder = RowEncoder::default();
-    bencher.counter(BytesCount::new(total_bytes)).bench_local(|| {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
-        encoder.encode(std::slice::from_ref(&decoded), &mut ctx).unwrap()
-    });
+    bencher
+        .counter(BytesCount::new(total_bytes))
+        .bench_local(|| {
+            let mut ctx = LEGACY_SESSION.create_execution_ctx();
+            encoder
+                .encode(std::slice::from_ref(&decoded), &mut ctx)
+                .unwrap()
+        });
 }

From 43915bd9c915e314c899be0c7aca3f260f4a132e Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 22:12:10 +0000
Subject: [PATCH 15/19] vortex-row: add fused fast-path FSST row-encode
 benchmark

Adds `fsst_fast_fused`: bulk-decompresses the FSST code heap straight into a
contiguous buffer (no intermediate VarBinViewArray) and block-encodes rows directly
into the row-key ListView using the stored uncompressed_lengths (free size pass), with
the same no-zero-init / no-extra-copy techniques as the row encoder. Lets us compare
the fused path head-to-head against decode-then-convert.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/benches/fsst_row_encode.rs | 135 +++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs
index 58c9eed9843..e7ee9cf221e 100644
--- a/vortex-row/benches/fsst_row_encode.rs
+++ b/vortex-row/benches/fsst_row_encode.rs
@@ -1,7 +1,11 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-#![expect(clippy::unwrap_used)]
+#![expect(
+    clippy::unwrap_used,
+    clippy::expect_used,
+    clippy::cast_possible_truncation
+)]
 
 //! Row-encoding an FSST-compressed string column: the only realizable strategy is
 //! "unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it),
@@ -29,9 +33,17 @@ use vortex_array::Canonical;
 use vortex_array::IntoArray;
 use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::VarBinArray;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_buffer::ByteBufferMut;
+use vortex_fsst::FSST;
+use vortex_fsst::FSSTArrayExt;
 use vortex_fsst::fsst_compress;
 use vortex_fsst::fsst_train_compressor;
 use vortex_row::RowEncoder;
@@ -80,6 +92,117 @@ fn decompress(fsst: &ArrayRef) -> ArrayRef {
         .into_array()
 }
 
+const VARLEN_BLOCK: usize = 32;
+const VARLEN_BLOCK_TOTAL: usize = 33;
+// Sentinel for a non-empty varlen value (ascending, non-null) — value is irrelevant to timing.
+const NON_EMPTY_SENTINEL: u8 = 0x02;
+
+/// Encoded row-key length for a value of `len` decompressed bytes: the sentinel byte alone
+/// when empty, else the sentinel plus `ceil(len/32)` 32-byte blocks, each with a marker byte.
+fn encoded_len(len: usize) -> u32 {
+    if len == 0 {
+        1
+    } else {
+        1 + (len.div_ceil(VARLEN_BLOCK) as u32) * VARLEN_BLOCK_TOTAL as u32
+    }
+}
+
+/// Block-encode non-empty `bytes` (ascending) into `out` in vortex-row's varlen body format.
+fn block_encode(bytes: &[u8], out: &mut [u8]) {
+    let len = bytes.len();
+    let full = len / VARLEN_BLOCK;
+    let partial = len % VARLEN_BLOCK;
+    let (full_to_write, partial_len) = if partial == 0 {
+        (full - 1, VARLEN_BLOCK)
+    } else {
+        (full, partial)
+    };
+    let mut src = 0;
+    let mut dst = 0;
+    for _ in 0..full_to_write {
+        out[dst..dst + VARLEN_BLOCK].copy_from_slice(&bytes[src..src + VARLEN_BLOCK]);
+        out[dst + VARLEN_BLOCK] = 0xFF;
+        src += VARLEN_BLOCK;
+        dst += VARLEN_BLOCK_TOTAL;
+    }
+    out[dst..dst + partial_len].copy_from_slice(&bytes[src..src + partial_len]);
+    for b in &mut out[dst + partial_len..dst + VARLEN_BLOCK] {
+        *b = 0;
+    }
+    out[dst + VARLEN_BLOCK] = partial_len as u8;
+}
+
+/// Fused FSST → row-key kernel: bulk-decompress the code heap into one contiguous buffer (no
+/// intermediate `VarBinViewArray`), then block-encode each row straight into the row-key
+/// `ListView<u8>` using the stored `uncompressed_lengths` for boundaries (no size-pass walk).
+fn fast_fused(fsst: &ArrayRef) -> ArrayRef {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let view = fsst.as_opt::<FSST>().expect("FSST array");
+
+    // Per-row decompressed lengths are already stored — the size pass is free.
+    let lens_arr = view
+        .uncompressed_lengths()
+        .clone()
+        .execute::<PrimitiveArray>(&mut ctx)
+        .unwrap();
+    let lens: Vec<usize> = match_each_integer_ptype!(lens_arr.ptype(), |P| {
+        lens_arr
+            .as_slice::<P>()
+            .iter()
+            .map(|x| *x as usize)
+            .collect()
+    });
+
+    // Bulk-decompress the whole code heap once into a contiguous buffer (no VarBinView).
+    let heap = view.codes_bytes();
+    let total: usize = lens.iter().sum();
+    let decompressor = view.decompressor();
+    let mut decompressed = ByteBufferMut::with_capacity(total + 7);
+    let n = decompressor.decompress_into(heap.as_slice(), decompressed.spare_capacity_mut());
+    unsafe { decompressed.set_len(n) };
+    let bytes = decompressed.as_slice();
+
+    // Size + offsets for the row-key ListView (lengths are free, no view walk).
+    let nrows = lens.len();
+    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
+    let mut sizes: Vec<u32> = Vec::with_capacity(nrows);
+    let mut acc: u32 = 0;
+    for &l in &lens {
+        offsets.push(acc);
+        let sz = encoded_len(l);
+        sizes.push(sz);
+        acc += sz;
+    }
+
+    // Block-encode every row directly into the elements buffer. No zero-init (every byte is
+    // written: sentinel + block body with zero-padded final block) and no Vec→Buffer copy.
+    let mut out = ByteBufferMut::with_capacity(acc as usize);
+    unsafe { out.set_len(acc as usize) };
+    let out_slice = out.as_mut_slice();
+    let mut src = 0usize;
+    for (i, &l) in lens.iter().enumerate() {
+        let pos = offsets[i] as usize;
+        out_slice[pos] = NON_EMPTY_SENTINEL;
+        if l != 0 {
+            block_encode(&bytes[src..src + l], &mut out_slice[pos + 1..]);
+        }
+        src += l;
+    }
+
+    let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable);
+    let offsets_arr =
+        PrimitiveArray::new(Buffer::<u32>::copy_from(&offsets), Validity::NonNullable);
+    let sizes_arr = PrimitiveArray::new(Buffer::<u32>::copy_from(&sizes), Validity::NonNullable);
+    ListViewArray::try_new(
+        elements.into_array(),
+        offsets_arr.into_array(),
+        sizes_arr.into_array(),
+        Validity::NonNullable,
+    )
+    .unwrap()
+    .into_array()
+}
+
 fn main() {
     divan::main();
 }
@@ -102,6 +225,16 @@ fn fsst_unpack_then_convert(bencher: divan::Bencher) {
         });
 }
 
+/// Fused fast path: bulk-decompress directly into the row-key block format, skipping the
+/// intermediate `VarBinViewArray` and the generic row-encoder (size pass is free).
+#[divan::bench]
+fn fsst_fast_fused(bencher: divan::Bencher) {
+    let (fsst, total_bytes) = build_fsst();
+    bencher
+        .counter(BytesCount::new(total_bytes))
+        .bench_local(|| fast_fused(&fsst));
+}
+
 /// Irreducible floor: FSST decompression alone (a direct kernel must still produce these
 /// bytes, since the sort key *is* the decompressed bytes).
 #[divan::bench]

From 2c54e62931dd27bef50e1de35dbe2fa6742f8200 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Fri, 5 Jun 2026 08:02:46 +0000
Subject: [PATCH 16/19] vortex-row: add scatter-right fused FSST row-encode
 benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `fsst_fast_scatter`: keeps FSST's fast contiguous bulk decompressor but runs it
into a cache-resident scratch one row-batch at a time, scattering each row into block
form from cache so the decompressed bytes never round-trip through main memory. A
one-time assert_arrays_eq! check confirms it produces byte-identical row keys to the
straightforward fused path.

Result: fast_scatter is on par with fast_fused (no speedup) — the decompressed buffer is
already consumed cache-warm in the simple fused path, so avoiding the round-trip saves
nothing; the workload is CPU-bound on FSST symbol decode plus block-copy.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/benches/fsst_row_encode.rs | 129 +++++++++++++++++++++++++-
 1 file changed, 128 insertions(+), 1 deletion(-)

diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs
index e7ee9cf221e..083788ff6ab 100644
--- a/vortex-row/benches/fsst_row_encode.rs
+++ b/vortex-row/benches/fsst_row_encode.rs
@@ -4,7 +4,8 @@
 #![expect(
     clippy::unwrap_used,
     clippy::expect_used,
-    clippy::cast_possible_truncation
+    clippy::cast_possible_truncation,
+    clippy::many_single_char_names
 )]
 
 //! Row-encoding an FSST-compressed string column: the only realizable strategy is
@@ -36,6 +37,8 @@ use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::VarBinArray;
+use vortex_array::arrays::varbin::VarBinArrayExt;
+use vortex_array::assert_arrays_eq;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
 use vortex_array::match_each_integer_ptype;
@@ -203,10 +206,134 @@ fn fast_fused(fsst: &ArrayRef) -> ArrayRef {
     .into_array()
 }
 
+/// "Scatter right": keep FSST's fast contiguous bulk decompressor, but run it into a
+/// cache-resident scratch one row-batch at a time, then scatter each row into block form from
+/// cache. The decompressed bytes never round-trip through main memory — unlike `fast_fused`,
+/// which materializes the whole 6.4 MB decompressed buffer and reads it back to block-encode.
+fn fast_scatter(fsst: &ArrayRef) -> ArrayRef {
+    // Scratch sized to stay resident in L1/L2; each batch decompresses up to this many bytes.
+    const SCRATCH: usize = 16 * 1024;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let view = fsst.as_opt::<FSST>().expect("FSST array");
+
+    let lens_arr = view
+        .uncompressed_lengths()
+        .clone()
+        .execute::<PrimitiveArray>(&mut ctx)
+        .unwrap();
+    let lens: Vec<usize> = match_each_integer_ptype!(lens_arr.ptype(), |P| {
+        lens_arr
+            .as_slice::<P>()
+            .iter()
+            .map(|x| *x as usize)
+            .collect()
+    });
+    let nrows = lens.len();
+
+    // Per-row compressed code offsets (relative to the sliced heap start).
+    let codes = view.codes();
+    let heap = codes.sliced_bytes();
+    let code_off_arr = codes
+        .offsets()
+        .clone()
+        .execute::<PrimitiveArray>(&mut ctx)
+        .unwrap();
+    let base = match_each_integer_ptype!(code_off_arr.ptype(), |P| {
+        code_off_arr.as_slice::<P>()[0] as usize
+    });
+    let code_off: Vec<usize> = match_each_integer_ptype!(code_off_arr.ptype(), |P| {
+        code_off_arr
+            .as_slice::<P>()
+            .iter()
+            .map(|x| *x as usize - base)
+            .collect()
+    });
+
+    // Output sizing (free from stored lengths).
+    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
+    let mut sizes: Vec<u32> = Vec::with_capacity(nrows);
+    let mut acc: u32 = 0;
+    let mut max_row = 0usize;
+    for &l in &lens {
+        offsets.push(acc);
+        let sz = encoded_len(l);
+        sizes.push(sz);
+        acc += sz;
+        max_row = max_row.max(l);
+    }
+    let mut out = ByteBufferMut::with_capacity(acc as usize);
+    unsafe { out.set_len(acc as usize) };
+    let out_slice = out.as_mut_slice();
+
+    let decompressor = view.decompressor();
+    let scratch_cap = SCRATCH.max(max_row) + 8;
+    let mut scratch = ByteBufferMut::with_capacity(scratch_cap);
+
+    let mut r = 0usize;
+    while r < nrows {
+        // Grow a batch until it would overflow the scratch (always at least one row).
+        let bs = r;
+        let mut batch_bytes = 0usize;
+        while r < nrows && (r == bs || batch_bytes + lens[r] <= SCRATCH) {
+            batch_bytes += lens[r];
+            r += 1;
+        }
+        let be = r;
+
+        // Decompress this batch's codes in one fast call into the cache-resident scratch.
+        let cslice = &heap.as_slice()[code_off[bs]..code_off[be]];
+        let n = decompressor.decompress_into(cslice, scratch.spare_capacity_mut());
+        unsafe { scratch.set_len(n) };
+        let sbytes = scratch.as_slice();
+
+        // Scatter each row from cache into block form.
+        let mut local = 0usize;
+        for i in bs..be {
+            let l = lens[i];
+            let pos = offsets[i] as usize;
+            out_slice[pos] = NON_EMPTY_SENTINEL;
+            if l != 0 {
+                block_encode(&sbytes[local..local + l], &mut out_slice[pos + 1..]);
+            }
+            local += l;
+        }
+        unsafe { scratch.set_len(0) };
+    }
+
+    let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable);
+    let offsets_arr =
+        PrimitiveArray::new(Buffer::<u32>::copy_from(&offsets), Validity::NonNullable);
+    let sizes_arr = PrimitiveArray::new(Buffer::<u32>::copy_from(&sizes), Validity::NonNullable);
+    ListViewArray::try_new(
+        elements.into_array(),
+        offsets_arr.into_array(),
+        sizes_arr.into_array(),
+        Validity::NonNullable,
+    )
+    .unwrap()
+    .into_array()
+}
+
 fn main() {
+    // Correctness: the batched cache-resident scatter must produce identical row keys to the
+    // straightforward fused path.
+    {
+        let (fsst, _) = build_fsst();
+        assert_arrays_eq!(fast_scatter(&fsst), fast_fused(&fsst));
+    }
     divan::main();
 }
 
+/// "Scatter right" fused path: cache-resident batched decompress + scatter into block form.
+#[divan::bench]
+fn fsst_fast_scatter(bencher: divan::Bencher) {
+    let (fsst, total_bytes) = build_fsst();
+    bencher
+        .counter(BytesCount::new(total_bytes))
+        .bench_local(|| fast_scatter(&fsst));
+}
+
 /// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it.
 #[divan::bench]
 fn fsst_unpack_then_convert(bencher: divan::Bencher) {

From b3411f1b2d08d7a1b7cfbc036b23feeeb4c50399 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Fri, 5 Jun 2026 11:32:57 +0100
Subject: [PATCH 17/19] fix

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 docs/specs/index.md        |   1 +
 docs/specs/row-encoding.md | 539 +++++++++++++++++++++++++++++++++++++
 vortex-row/README.md       |  17 ++
 vortex-row/src/codec.rs    |  40 +--
 vortex-row/src/encode.rs   |   2 +-
 vortex-row/src/encoder.rs  |  23 ++
 vortex-row/src/lib.rs      |   6 +-
 vortex-row/src/size.rs     |   2 +-
 vortex-row/src/tests.rs    |  40 +++
 9 files changed, 638 insertions(+), 32 deletions(-)
 create mode 100644 docs/specs/row-encoding.md
 create mode 100644 vortex-row/README.md

diff --git a/docs/specs/index.md b/docs/specs/index.md
index 547cd99f182..045ccb0710d 100644
--- a/docs/specs/index.md
+++ b/docs/specs/index.md
@@ -11,4 +11,5 @@ file-format
 ipc-format
 dtype-format
 scalar-format
+row-encoding
 ```
diff --git a/docs/specs/row-encoding.md b/docs/specs/row-encoding.md
new file mode 100644
index 00000000000..8fc3288f82b
--- /dev/null
+++ b/docs/specs/row-encoding.md
@@ -0,0 +1,539 @@
+# Row Encoding Byte Sort Specification
+
+This document describes the byte-sortable row encoding implemented by the `vortex-row`
+crate. The encoding converts one or more columnar arrays into a `ListView<u8>` array. Each
+output row is a byte string, and lexicographic byte comparison of those byte strings matches
+logical tuple comparison of the input values under the configured row sort options.
+
+This is a schema-aware row-key format. The bytes do not contain type tags, field names, or
+sort options. Two encoded rows are comparable only when they were produced with the same
+input schema and the same per-column `RowSortField` settings.
+
+The row encoding is not the Vortex file format or scalar IPC format. It is an internal
+comparison representation used for sort keys and row-key operations.
+
+:::{warning}
+The row encoding format is experimental. Its byte layout, supported type set, and edge-case
+semantics may change between Vortex releases. Do not persist these bytes or depend on them as
+a stable interchange format.
+:::
+
+## Order Property
+
+For a fixed schema with columns `c0, c1, ..., cn` and per-column sort fields
+`f0, f1, ..., fn`, row encoding provides this property:
+
+```text
+encode(row_a) < encode(row_b)
+```
+
+if and only if tuple comparison says:
+
+```text
+(row_a.c0, row_a.c1, ..., row_a.cn) < (row_b.c0, row_b.c1, ..., row_b.cn)
+```
+
+using the requested ascending or descending direction and requested null placement for each
+column.
+
+The property is built from two rules:
+
+1. Each supported scalar or nested value is encoded so its bytes sort in the same order as
+   the value.
+2. Fields are concatenated from left to right, so lexicographic byte comparison naturally
+   performs tuple comparison.
+
+## Notation
+
+This document uses the following notation:
+
+- `||` means byte concatenation.
+- `BE(x)` means the fixed-width big-endian bytes of `x`.
+- `!b` means `b XOR 0xFF`.
+- `!bytes` means bitwise complement of every byte in `bytes`.
+- `zero(n)` means `n` zero bytes.
+- `ff(n)` means `n` bytes of `0xFF`.
+- `width(T)` means the native byte width of fixed-width type `T`.
+
+`BE(x)` always emits exactly the byte width of the value being encoded, with the most
+significant byte first. It is not length-prefixed and it does not drop leading zero or
+leading `0xFF` bytes. The host machine's native endianness is irrelevant; encoders produce
+these bytes explicitly.
+
+For example:
+
+| Value and type | `BE(value)` |
+| --- | --- |
+| `1_u8` | `01` |
+| `258_u16` | `01 02` |
+| `258_u32` | `00 00 01 02` |
+| `-5_i32`, before the signed sign-bit transform | `FF FF FF FB` |
+| `ordered = 0x80000000_u32` | `80 00 00 00` |
+
+## Field Options
+
+Each input column has a `RowSortField`:
+
+```text
+RowSortField {
+    descending: bool,
+    nulls_first: bool,
+}
+```
+
+`descending` reverses the order of non-null values. `nulls_first` is independent of
+`descending`, so nulls can sort before or after non-nulls in either direction.
+
+## Sentinel Summary
+
+Sentinels are single bytes that classify nullness and, for variable-width values, whether a
+value is empty or non-empty. They are chosen so byte comparison can decide those categories
+before comparing any value bytes.
+
+| Encoding family | Case | Ascending, nulls first | Descending, nulls first | Ascending, nulls last | Descending, nulls last |
+| --- | --- | --- | --- | --- | --- |
+| Fixed-width | Null | `0x00` | `0x00` | `0x02` | `0x02` |
+| Fixed-width | Non-null | `0x01` | `0x01` | `0x01` | `0x01` |
+| Variable-width | Null | `0x00` | `0x00` | `0xFF` | `0xFF` |
+| Variable-width | Empty | `0x01` | `0xFE` | `0x01` | `0xFE` |
+| Variable-width | Non-empty | `0x02` | `0xFD` | `0x02` | `0xFD` |
+
+Fixed-width sentinels are used by null, boolean, primitive, decimal, struct, and fixed-size
+list values. Variable-width sentinels are used by UTF-8 and binary values.
+
+## Fixed-Width Sentinels
+
+Every fixed-width value starts with a one-byte sentinel:
+
+| Case | Sentinel |
+| --- | --- |
+| Null, `nulls_first = true` | `0x00` |
+| Non-null | `0x01` |
+| Null, `nulls_first = false` | `0x02` |
+
+The sentinel is not inverted for descending order. Only the non-null value bytes are
+inverted. This keeps null placement independent from sort direction.
+
+For fixed-width nulls, the sentinel is followed by zero-filled value bytes. This gives fixed
+types a constant encoded width for every row.
+
+## Variable-Width Sentinels
+
+UTF-8 and binary values start with one sentinel byte that distinguishes three cases: null,
+empty, and non-empty. The separate empty and non-empty sentinels are important: they ensure
+the first byte decides null, empty, or non-empty before later columns can affect comparison.
+
+| Case | Ascending | Descending |
+| --- | --- | --- |
+| Null, `nulls_first = true` | `0x00` | `0x00` |
+| Empty | `0x01` | `0xFE` |
+| Non-empty | `0x02` | `0xFD` |
+| Null, `nulls_first = false` | `0xFF` | `0xFF` |
+
+The null sentinel is not inverted by descending order. Empty and non-empty sentinels are
+inverted so non-null value order is reversed while null placement stays fixed.
+
+## Null
+
+Values of type `Null` have no body; the encoding is the sentinel byte alone:
+
+```text
+fixed_null_sentinel
+```
+
+The sentinel is `0x00` for nulls-first and `0x02` for nulls-last.
+
+## Boolean
+
+Booleans are fixed-width and use one value byte:
+
+```text
+sentinel || value_byte
+```
+
+For ascending order:
+
+| Value | Value byte |
+| --- | --- |
+| `false` | `0x01` |
+| `true` | `0x02` |
+
+For descending order, the value byte is inverted:
+
+| Value | Value byte |
+| --- | --- |
+| `true` | `0xFD` |
+| `false` | `0xFE` |
+
+Null booleans encode as:
+
+```text
+null_sentinel || 0x00
+```
+
+## Unsigned Integers
+
+Supported unsigned primitive types are `u8`, `u16`, `u32`, and `u64`.
+
+Ascending encoding:
+
+```text
+0x01 || BE(value)
+```
+
+Descending encoding:
+
+```text
+0x01 || !BE(value)
+```
+
+Big-endian byte order makes lexicographic byte order match numeric order for fixed-width
+unsigned integers. Bitwise complement reverses that order for descending fields.
+
+Null unsigned integers encode as:
+
+```text
+null_sentinel || zero(width(T))
+```
+
+## Signed Integers
+
+Supported signed primitive types are `i8`, `i16`, `i32`, and `i64`. The same signed
+integer transform is also used for `i128` decimal storage.
+
+Signed integers first flip the sign bit of their big-endian two's-complement
+representation:
+
+```text
+ordered = BE(value)
+ordered[0] = ordered[0] XOR 0x80
+```
+
+Ascending encoding:
+
+```text
+0x01 || ordered
+```
+
+Descending encoding:
+
+```text
+0x01 || !ordered
+```
+
+Flipping the sign bit maps the signed numeric range into unsigned byte order:
+
+```text
+negative values -> 0x00..0x7F prefix range
+non-negative values -> 0x80..0xFF prefix range
+```
+
+Null signed integers encode as:
+
+```text
+null_sentinel || zero(width(T))
+```
+
+## Floating Point
+
+Supported floating primitive types are `f16`, `f32`, and `f64`.
+
+The encoder treats the IEEE bit pattern as an unsigned integer and applies a sign-aware
+transform before writing big-endian bytes.
+
+For a floating value with raw bits `bits`:
+
+```text
+if sign_bit(bits) == 0:
+    ordered = bits XOR sign_bit_mask
+else:
+    ordered = bits XOR all_ones
+```
+
+Ascending encoding:
+
+```text
+0x01 || BE(ordered)
+```
+
+Descending encoding:
+
+```text
+0x01 || !BE(ordered)
+```
+
+This produces a total-order-style byte ordering where negative values sort before positive
+values, and `-0.0` sorts before `+0.0`. NaN values are ordered by their raw bit patterns
+under the same transform; they are not canonicalized by row encoding.
+
+Null floats encode as:
+
+```text
+null_sentinel || zero(width(T))
+```
+
+## Decimal
+
+Decimals are encoded as their scaled signed integer storage value. The storage type is the
+smallest signed integer type that can represent the column's decimal precision:
+
+| Precision | Storage |
+| --- | --- |
+| `1..=2` | `i8` |
+| `3..=4` | `i16` |
+| `5..=9` | `i32` |
+| `10..=18` | `i64` |
+| `19..=38` | `i128` |
+
+The storage integer is encoded with the signed integer encoding described above. Decimal
+columns have one precision and scale, so ordering the scaled integer storage values matches
+ordering the decimal values in that column.
+
+`Decimal256` is not supported by row encoding.
+
+## UTF-8 and Binary
+
+UTF-8 and binary values use the variable-width sentinels described above.
+
+Null:
+
+```text
+varlen_null_sentinel
+```
+
+Empty:
+
+```text
+varlen_empty_sentinel
+```
+
+Non-empty:
+
+```text
+varlen_non_empty_sentinel || varlen_body(bytes)
+```
+
+For UTF-8, `bytes` are the UTF-8 bytes of the string. For binary, `bytes` are the raw binary
+bytes. The byte ordering is therefore UTF-8 byte lexicographic order for strings and raw byte
+lexicographic order for binary.
+
+### Variable-Length Body
+
+Non-empty variable-length values are encoded in blocks. Each block contains 32 data bytes
+followed by one marker byte:
+
+```text
+data[0..32] || marker
+```
+
+For ascending order:
+
+- Every non-final full block uses marker `0xFF`.
+- The final block is padded with zeros to 32 data bytes.
+- The final marker is the number of real data bytes in the final block, in `1..=32`.
+
+For descending order:
+
+- Every data byte is inverted.
+- Every non-final full-block marker is `0x00`, the inverse of `0xFF`.
+- The final block is padded with `0xFF`, the inverse of ascending zero padding.
+- The final marker is inverted: `final_len XOR 0xFF`.
+
+If the input length is exactly a multiple of 32, the final block has marker `32`, and earlier
+blocks, if any, use the continuation marker.
+
+This block structure preserves prefix order. For example, in ascending order a shorter value
+that is a prefix of a longer value reaches its final marker before the longer value reaches
+the continuation marker. Since final length markers in `1..=32` are less than `0xFF`, the
+shorter prefix sorts first. Descending order inverts the same bytes and reverses that result.
+
+## Struct
+
+A struct is encoded as:
+
+```text
+struct_sentinel || field_0 || field_1 || ... || field_n
+```
+
+The outer sentinel is the fixed-width sentinel:
+
+- `0x01` for a non-null struct
+- `0x00` or `0x02` for a null struct, depending on null placement
+
+For a non-null struct, each field is encoded recursively in schema order using the same
+`RowSortField` as the parent struct column.
+
+For a null struct, the body is canonicalized so two null parent rows produce byte-equal
+output even if their physical child arrays contain different values:
+
+- Fixed-width children contribute their fixed-width null encoding.
+- Variable-width children contribute exactly one child null sentinel byte.
+
+A struct has fixed row width only when all of its fields have fixed row width. If any child
+is variable-width, the struct is variable-width.
+
+## Fixed-Size List
+
+A fixed-size list with `N` elements is encoded as:
+
+```text
+list_sentinel || element_0 || element_1 || ... || element_N-1
+```
+
+The outer sentinel is the fixed-width sentinel:
+
+- `0x01` for a non-null list
+- `0x00` or `0x02` for a null list, depending on null placement
+
+For a non-null fixed-size list, elements are encoded recursively in element order using the
+same `RowSortField` as the parent list column.
+
+For a null fixed-size list, the body is canonicalized:
+
+- Fixed-width elements contribute their fixed-width null encoding, repeated `N` times.
+- Variable-width elements contribute one child null sentinel byte per element.
+
+A fixed-size list has fixed row width only when its element type has fixed row width.
+
+## Nested Values
+
+Nested structs and fixed-size lists apply the same rules recursively. Each nullable parent
+adds its own outer sentinel. Null parents canonicalize their child body before comparison can
+observe underlying child values.
+
+## Unsupported Types
+
+The current row encoder rejects types for which it does not define byte-sort semantics:
+
+| Type | Reason |
+| --- | --- |
+| Variable-size `List` | No row encoding order is defined. |
+| `Variant` | No row encoding order is defined. |
+| `Union` | No row encoding order is defined. |
+| `Extension` | No row encoding order is defined. |
+| `Decimal256` | Encoding is not implemented. |
+
+The absence of these encodings is intentional. Adding one requires defining both the logical
+ordering and the exact byte representation that preserves that ordering.
+
+Temporal extensions could be added later by normalizing them to storage arrays at the
+row-encoder boundary, once the supported temporal ordering contract is made explicit.
+
+## Size and Output Layout
+
+The encoded output is a `ListView<u8>`:
+
+```text
+elements: contiguous u8 buffer containing all row bytes
+offsets:  per-row start offset into elements
+sizes:    per-row byte length
+```
+
+Rows are not self-describing without their `sizes`. A variable-width field can make one row
+longer than another, and the enclosing `ListView` supplies the row boundary.
+
+The encoder computes sizes before writing bytes:
+
+- Fixed-width columns contribute a constant width per row.
+- Variable-width columns contribute data-dependent widths per row.
+- The final `sizes` array is also used as the per-row write cursor during encoding.
+
+## Why Concatenation Works
+
+For each supported field type, the field encoder is an order embedding from logical values to
+byte strings:
+
+```text
+a < b  <=>  encode_field(a) < encode_field(b)
+a = b  <=>  encode_field(a) = encode_field(b)
+```
+
+When two rows are compared lexicographically, the first differing byte belongs to the first
+field whose encoded value differs. All preceding fields have byte-equal encodings and
+therefore equal logical values. The result is the same as tuple comparison.
+
+Variable-width fields preserve this property because their encodings are self-delimiting for
+comparison:
+
+- Null, empty, and non-empty values differ at the first byte.
+- Non-empty values use block markers to decide prefix cases before the next field can be
+  compared.
+- Row boundaries are supplied by `ListView` sizes.
+
+Descending order works because complementing every byte of an equal-length order-preserving
+value encoding reverses its order. The variable-width encoding also complements its sentinels,
+body bytes, padding, and markers for non-null values, so the same reversal applies to strings
+and binary values. Null sentinels are excluded from that reversal so null placement remains
+controlled solely by `nulls_first`.
+
+## Example Row
+
+This example shows one row that contains every supported encoding family. All columns use
+ascending order with nulls first.
+
+Schema:
+
+```text
+(
+    null_col: Null,
+    bool_col: Bool,
+    uint_col: U16,
+    int_col: I16,
+    float_col: F32,
+    decimal_col: Decimal(precision = 9, scale = 2),
+    utf8_col: Utf8,
+    binary_col: Binary,
+    struct_col: Struct { x: I8, y: Utf8 },
+    fsl_col: FixedSizeList<U8, 3>,
+)
+```
+
+Values:
+
+```text
+(
+    null,
+    true,
+    258_u16,
+    -5_i16,
+    1.5_f32,
+    123.45_decimal,     // stored as 12345_i32
+    "a",
+    DE AD BE EF,
+    { x: 1_i8, y: "" },
+    [1_u8, 2_u8, 3_u8],
+)
+```
+
+Encoded columns:
+
+| Column | Encoded bytes |
+| --- | --- |
+| `null_col` | `00` |
+| `bool_col` | `01 02` |
+| `uint_col` | `01 01 02` |
+| `int_col` | `01 7F FB` |
+| `float_col` | `01 BF C0 00 00` |
+| `decimal_col` | `01 80 00 30 39` |
+| `utf8_col` | `02 61 zero(31) 01` |
+| `binary_col` | `02 DE AD BE EF zero(28) 04` |
+| `struct_col` | `01 01 81 01` |
+| `fsl_col` | `01 01 01 01 02 01 03` |
+
+The full row key is the concatenation of those byte strings in schema order:
+
+```text
+00
+|| 01 02
+|| 01 01 02
+|| 01 7F FB
+|| 01 BF C0 00 00
+|| 01 80 00 30 39
+|| 02 61 zero(31) 01
+|| 02 DE AD BE EF zero(28) 04
+|| 01 01 81 01
+|| 01 01 01 01 02 01 03
+```
+
+Primitive examples here use one representative width per primitive family. Other widths use
+the same transform and emit exactly `width(T)` value bytes.
diff --git a/vortex-row/README.md b/vortex-row/README.md
new file mode 100644
index 00000000000..e0c574eca43
--- /dev/null
+++ b/vortex-row/README.md
@@ -0,0 +1,17 @@
+# vortex-row
+
+`vortex-row` provides an experimental row-oriented byte encoder for Vortex arrays. It
+produces byte strings that can be compared lexicographically to sort rows according to the
+configured column ordering.
+
+Only supported Vortex logical types are accepted. Extension types are rejected until their
+logical sort semantics are defined.
+
+## Experimental Format
+
+The row encoding byte layout is experimental. Its exact bytes, supported type set, and
+edge-case semantics may change between Vortex releases.
+
+Do not persist row-encoded bytes or use them as a stable interchange format. They are intended
+for internal sort-key and row-key operations where the encoder version, schema, and sort
+options are controlled together.
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index d0cb32ce13d..4848a750e52 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -28,13 +28,11 @@ use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::DecimalArray;
-use vortex_array::arrays::ExtensionArray;
 use vortex_array::arrays::FixedSizeListArray;
 use vortex_array::arrays::NullArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
-use vortex_array::arrays::extension::ExtensionArrayExt;
 use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
 use vortex_array::arrays::struct_::StructArrayExt;
 use vortex_array::dtype::DType;
@@ -227,11 +225,11 @@ pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
                 "row encoding does not support variable-size List arrays (no well-defined ordering)"
             )
         }
-        DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()),
         DType::Variant(_) => {
             vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
         }
         DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"),
+        dtype => vortex_bail!("row encoding does not support dtype: {dtype:?}"),
     }
 }
 
@@ -257,7 +255,6 @@ pub(crate) fn field_size(
         Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
         Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
         Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
-        Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?,
         Canonical::List(_) => vortex_bail!(
             "row encoding does not support canonical List arrays: {:?}",
             canonical.dtype()
@@ -265,6 +262,12 @@ pub(crate) fn field_size(
         Canonical::Variant(_) => {
             vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
         }
+        unsupported => {
+            vortex_bail!(
+                "row encoding does not support canonical array: {:?}",
+                unsupported.dtype()
+            )
+        }
     }
     Ok(())
 }
@@ -344,7 +347,6 @@ pub(crate) fn field_encode(
         Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
         Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?,
         Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?,
         Canonical::List(_) => vortex_bail!(
             "row encoding does not support canonical List arrays: {:?}",
             canonical.dtype()
@@ -352,6 +354,12 @@ pub(crate) fn field_encode(
         Canonical::Variant(_) => {
             vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
         }
+        unsupported => {
+            vortex_bail!(
+                "row encoding does not support canonical array: {:?}",
+                unsupported.dtype()
+            )
+        }
     }
     Ok(())
 }
@@ -504,16 +512,6 @@ fn add_size_fsl(
     Ok(())
 }
 
-fn add_size_extension(
-    arr: &ExtensionArray,
-    field: RowSortField,
-    sizes: &mut [u32],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?;
-    field_size(&storage, field, sizes, ctx)
-}
-
 fn encode_null(
     arr: &NullArray,
     field: RowSortField,
@@ -999,18 +997,6 @@ fn encode_variable_child(
     Ok(())
 }
 
-fn encode_extension(
-    arr: &ExtensionArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?;
-    field_encode(&storage, field, row_offsets, col_offset, out, ctx)
-}
-
 /// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a
 /// constant within-row offset, iterating the output in `row_stride`-sized chunks so the
 /// compiler can drop the per-row offset/cursor indirection.
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 46a4be778d4..04feec89415 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -41,7 +41,7 @@ use crate::size::compute_sizes;
 /// [`ListViewArray`] where row `i` contains the row-encoded bytes for column values
 /// `cols[0][i], cols[1][i], ...` concatenated left-to-right.
 ///
-/// This scalar function is public for session registration and encoding extension work.
+/// This scalar function is public for session registration and row-encoding work.
 /// Most callers should use [`RowEncoder`](crate::RowEncoder) rather than invoking the scalar
 /// function directly.
 #[derive(Clone, Debug)]
diff --git a/vortex-row/src/encoder.rs b/vortex-row/src/encoder.rs
index 15eeda6d2f1..7bcd3e05627 100644
--- a/vortex-row/src/encoder.rs
+++ b/vortex-row/src/encoder.rs
@@ -6,6 +6,7 @@
 use vortex_array::ArrayRef;
 use vortex_array::ExecutionCtx;
 use vortex_array::arrays::ListViewArray;
+use vortex_array::dtype::DType;
 use vortex_array::scalar_fn::ScalarFnVTable;
 use vortex_array::scalar_fn::VecExecutionArgs;
 use vortex_error::VortexResult;
@@ -85,6 +86,7 @@ impl RowEncoder {
         };
         let nrows = cols[0].len();
         for (i, col) in cols.iter().enumerate() {
+            reject_extension_dtype(col.dtype())?;
             if col.len() != nrows {
                 vortex_bail!(
                     "RowEncoder: column {} has length {} but expected {}",
@@ -98,6 +100,27 @@ impl RowEncoder {
     }
 }
 
+fn reject_extension_dtype(dtype: &DType) -> VortexResult<()> {
+    match dtype {
+        DType::Extension(ext_dtype) => {
+            vortex_bail!(
+                "row encoding does not support Extension arrays yet: {}",
+                ext_dtype.id()
+            )
+        }
+        DType::Struct(fields, _) => {
+            for field_dtype in fields.fields() {
+                reject_extension_dtype(&field_dtype)?; // recurse into every struct field
+            }
+        }
+        DType::FixedSizeList(elem, ..) | DType::List(elem, _) => {
+            reject_extension_dtype(elem)?; // recurse into the list element dtype
+        }
+        _ => {} // every other dtype passes this check without further inspection
+    }
+    Ok(()) // no extension dtype found at this level or in the nested dtypes checked above
+}
+
 /// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes
 /// are lexicographically comparable in the same order as a tuple comparison of the input
 /// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`].
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index d921e2998e3..b36121f0da2 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -23,9 +23,9 @@
 //! into the per-row slots from left to right.
 //!
 //! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to
-//! 128 bits, UTF-8 and binary values, structs, fixed-size lists, and extensions whose storage
-//! type is supported. Variant, union, and variable-size list arrays are rejected because this
-//! crate does not define an ordering for them.
+//! 128 bits, UTF-8 and binary values, structs, and fixed-size lists. Extension, variant,
+//! union, and variable-size list arrays are rejected because this crate does not define an
+//! ordering for them.
 
 mod codec;
 mod encode;
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index 860fe3c2a2c..9112379a6f4 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -162,7 +162,7 @@ pub(crate) fn compute_sizes(
 ///
 /// The total per-row byte size is `fixed + var`.
 ///
-/// This scalar function is public for session registration and encoding extension work.
+/// This scalar function is public for session registration and row-encoding work.
 /// Most callers should use [`RowEncoder::row_sizes`](crate::RowEncoder::row_sizes) rather
 /// than invoking the scalar function directly.
 #[derive(Clone, Debug)]
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
index 62e0e4cfb98..5c85c911154 100644
--- a/vortex-row/src/tests.rs
+++ b/vortex-row/src/tests.rs
@@ -10,10 +10,15 @@ use vortex_array::IntoArray;
 use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ExtensionArray;
 use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
 use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_array::dtype::Nullability;
+use vortex_array::extension::datetime::Date;
+use vortex_array::extension::datetime::TimeUnit;
 use vortex_error::VortexResult;
 
 use crate::RowEncoder;
@@ -88,6 +93,41 @@ fn primitive_u32_sort_order() -> VortexResult<()> {
     Ok(())
 }
 
+#[test]
+fn reject_temporal_extension_dtype_early() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let storage = PrimitiveArray::from_iter([2i32, -1, 0, 7]).into_array();
+    let ext_dtype = Date::new(TimeUnit::Days, Nullability::NonNullable).erased();
+    let col = ExtensionArray::new(ext_dtype, storage).into_array();
+    // A top-level extension column must make `convert_columns` fail with the expected error.
+    let err = convert_columns(&[col], &[RowSortField::ascending()], &mut ctx)
+        .expect_err("temporal extensions should be rejected");
+    assert!(
+        err.to_string().contains("Extension arrays yet"),
+        "expected error mentioning unsupported Extension arrays, got: {err}"
+    );
+    Ok(())
+}
+
+#[test]
+fn reject_nested_temporal_extension_dtype_early() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let storage = PrimitiveArray::from_iter([2i32, -1, 0, 7]).into_array();
+    let ext_dtype = Date::new(TimeUnit::Days, Nullability::NonNullable).erased();
+    let date_col = ExtensionArray::new(ext_dtype, storage).into_array();
+    let tag_col = VarBinViewArray::from_iter_str(["d", "b", "c", "a"]).into_array();
+    let struct_col =
+        StructArray::from_fields(&[("date", date_col), ("tag", tag_col)])?.into_array();
+    // A struct column with an extension-typed field must be rejected by `convert_columns`.
+    let err = convert_columns(&[struct_col], &[RowSortField::ascending()], &mut ctx)
+        .expect_err("nested temporal extensions should be rejected");
+    assert!(
+        err.to_string().contains("Extension arrays yet"),
+        "expected error mentioning unsupported Extension arrays, got: {err}"
+    );
+    Ok(())
+}
+
 #[test]
 fn primitive_f64_sort_order() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();

From a213cdd675cf80595b0076da9349c9fc7fb50943 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Fri, 5 Jun 2026 11:34:54 +0100
Subject: [PATCH 18/19] chore(vortex-row): remove FSST row-encode bench and its vortex-fsst dependency

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                            |   1 -
 vortex-row/Cargo.toml                 |   5 -
 vortex-row/benches/fsst_row_encode.rs | 390 --------------------------
 3 files changed, 396 deletions(-)
 delete mode 100644 vortex-row/benches/fsst_row_encode.rs

diff --git a/Cargo.lock b/Cargo.lock
index bf24dafe3dd..967f0a18a09 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9997,7 +9997,6 @@ dependencies = [
  "vortex-array",
  "vortex-buffer",
  "vortex-error",
- "vortex-fsst",
  "vortex-mask",
  "vortex-session",
 ]
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index e58a48f16e7..9222c7d6a43 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -35,12 +35,7 @@ mimalloc = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["_test-harness"] }
-vortex-fsst = { workspace = true }
 
 [[bench]]
 name = "row_encode"
 harness = false
-
-[[bench]]
-name = "fsst_row_encode"
-harness = false
diff --git a/vortex-row/benches/fsst_row_encode.rs b/vortex-row/benches/fsst_row_encode.rs
deleted file mode 100644
index 083788ff6ab..00000000000
--- a/vortex-row/benches/fsst_row_encode.rs
+++ /dev/null
@@ -1,390 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-#![expect(
-    clippy::unwrap_used,
-    clippy::expect_used,
-    clippy::cast_possible_truncation,
-    clippy::many_single_char_names
-)]
-
-//! Row-encoding an FSST-compressed string column: the only realizable strategy is
-//! "unpack then convert" (decompress FSST to a canonical `VarBinView`, then row-encode it),
-//! because FSST is **not order-preserving** — its 1-byte codes are assigned by compression
-//! gain, not by value, so the compressed bytes cannot be compared lexicographically. A
-//! hypothetical "direct" kernel could only *fuse* decompression with row-key emission; it
-//! still has to expand every symbol.
-//!
-//! These benchmarks measure the full path and its two phases so the fusion opportunity is
-//! quantifiable:
-//!   * `fsst_unpack_then_convert` — decompress + row-encode (the status quo).
-//!   * `fsst_decompress_only`     — decompress alone (the irreducible floor: a direct kernel
-//!     must still produce these bytes).
-//!   * `plain_row_encode_only`    — row-encode an already-decompressed `VarBinView` (the part
-//!     a fused kernel would overlap with decompression; its writes into the intermediate
-//!     buffer + views are what fusion removes).
-
-use divan::counter::BytesCount;
-use mimalloc::MiMalloc;
-use rand::RngExt;
-use rand::SeedableRng;
-use rand::rngs::StdRng;
-use vortex_array::ArrayRef;
-use vortex_array::Canonical;
-use vortex_array::IntoArray;
-use vortex_array::LEGACY_SESSION;
-use vortex_array::VortexSessionExecute;
-use vortex_array::arrays::ListViewArray;
-use vortex_array::arrays::PrimitiveArray;
-use vortex_array::arrays::VarBinArray;
-use vortex_array::arrays::varbin::VarBinArrayExt;
-use vortex_array::assert_arrays_eq;
-use vortex_array::dtype::DType;
-use vortex_array::dtype::Nullability;
-use vortex_array::match_each_integer_ptype;
-use vortex_array::validity::Validity;
-use vortex_buffer::Buffer;
-use vortex_buffer::ByteBufferMut;
-use vortex_fsst::FSST;
-use vortex_fsst::FSSTArrayExt;
-use vortex_fsst::fsst_compress;
-use vortex_fsst::fsst_train_compressor;
-use vortex_row::RowEncoder;
-
-#[global_allocator]
-static GLOBAL: MiMalloc = MiMalloc;
-
-const N: usize = 100_000;
-const AVG_LEN: usize = 64;
-const UNIQUE_CHARS: u8 = 8;
-
-/// Generate compressible, multi-block (>32 byte) strings over a small alphabet so FSST finds
-/// a strong symbol table — the regime where a direct kernel would matter most.
-fn generate_strings() -> (VarBinArray, u64) {
-    let mut rng = StdRng::seed_from_u64(0);
-    let mut strings = Vec::with_capacity(N);
-    let mut total_bytes: u64 = 0;
-    for _ in 0..N {
-        let len = AVG_LEN * rng.random_range(50..=150) / 100;
-        total_bytes += len as u64;
-        let s = (0..len)
-            .map(|_| rng.random_range(b'a'..(b'a' + UNIQUE_CHARS)) as char)
-            .collect::<String>()
-            .into_bytes();
-        strings.push(Some(s.into_boxed_slice()));
-    }
-    let arr = VarBinArray::from_iter(strings, DType::Binary(Nullability::NonNullable));
-    (arr, total_bytes)
-}
-
-fn build_fsst() -> (ArrayRef, u64) {
-    let (arr, total_bytes) = generate_strings();
-    let compressor = fsst_train_compressor(&arr);
-    let len = arr.len();
-    let dtype = arr.dtype().clone();
-    let mut ctx = LEGACY_SESSION.create_execution_ctx();
-    let fsst = fsst_compress(arr, len, &dtype, &compressor, &mut ctx).into_array();
-    (fsst, total_bytes)
-}
-
-fn decompress(fsst: &ArrayRef) -> ArrayRef {
-    let mut ctx = LEGACY_SESSION.create_execution_ctx();
-    fsst.clone()
-        .execute::<Canonical>(&mut ctx)
-        .unwrap()
-        .into_array()
-}
-
-const VARLEN_BLOCK: usize = 32;
-const VARLEN_BLOCK_TOTAL: usize = 33;
-// Sentinel for a non-empty varlen value (ascending, non-null) — value is irrelevant to timing.
-const NON_EMPTY_SENTINEL: u8 = 0x02;
-
-/// Encoded row-key length for a non-empty value of `len` decompressed bytes: a leading
-/// sentinel plus `ceil(len/32)` 32-byte blocks, each followed by a continuation/length byte.
-fn encoded_len(len: usize) -> u32 {
-    if len == 0 {
-        1
-    } else {
-        1 + (len.div_ceil(VARLEN_BLOCK) as u32) * VARLEN_BLOCK_TOTAL as u32
-    }
-}
-
-/// Block-encode `bytes` (ascending) into `out`, matching vortex-row's varlen body format.
-fn block_encode(bytes: &[u8], out: &mut [u8]) {
-    let len = bytes.len();
-    let full = len / VARLEN_BLOCK;
-    let partial = len % VARLEN_BLOCK;
-    let (full_to_write, partial_len) = if partial == 0 {
-        (full - 1, VARLEN_BLOCK)
-    } else {
-        (full, partial)
-    };
-    let mut src = 0;
-    let mut dst = 0;
-    for _ in 0..full_to_write {
-        out[dst..dst + VARLEN_BLOCK].copy_from_slice(&bytes[src..src + VARLEN_BLOCK]);
-        out[dst + VARLEN_BLOCK] = 0xFF;
-        src += VARLEN_BLOCK;
-        dst += VARLEN_BLOCK_TOTAL;
-    }
-    out[dst..dst + partial_len].copy_from_slice(&bytes[src..src + partial_len]);
-    for b in &mut out[dst + partial_len..dst + VARLEN_BLOCK] {
-        *b = 0;
-    }
-    out[dst + VARLEN_BLOCK] = partial_len as u8;
-}
-
-/// Fused FSST → row-key kernel: bulk-decompress the code heap into one contiguous buffer (no
-/// intermediate `VarBinViewArray`), then block-encode each row straight into the row-key
-/// `ListView<u8>` using the stored `uncompressed_lengths` for boundaries (no size-pass walk).
-fn fast_fused(fsst: &ArrayRef) -> ArrayRef {
-    let mut ctx = LEGACY_SESSION.create_execution_ctx();
-    let view = fsst.as_opt::<FSST>().expect("FSST array");
-
-    // Per-row decompressed lengths are already stored — the size pass is free.
-    let lens_arr = view
-        .uncompressed_lengths()
-        .clone()
-        .execute::<PrimitiveArray>(&mut ctx)
-        .unwrap();
-    let lens: Vec<usize> = match_each_integer_ptype!(lens_arr.ptype(), |P| {
-        lens_arr
-            .as_slice::<P>()
-            .iter()
-            .map(|x| *x as usize)
-            .collect()
-    });
-
-    // Bulk-decompress the whole code heap once into a contiguous buffer (no VarBinView).
-    let heap = view.codes_bytes();
-    let total: usize = lens.iter().sum();
-    let decompressor = view.decompressor();
-    let mut decompressed = ByteBufferMut::with_capacity(total + 7);
-    let n = decompressor.decompress_into(heap.as_slice(), decompressed.spare_capacity_mut());
-    unsafe { decompressed.set_len(n) };
-    let bytes = decompressed.as_slice();
-
-    // Size + offsets for the row-key ListView (lengths are free, no view walk).
-    let nrows = lens.len();
-    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
-    let mut sizes: Vec<u32> = Vec::with_capacity(nrows);
-    let mut acc: u32 = 0;
-    for &l in &lens {
-        offsets.push(acc);
-        let sz = encoded_len(l);
-        sizes.push(sz);
-        acc += sz;
-    }
-
-    // Block-encode every row directly into the elements buffer. No zero-init (every byte is
-    // written: sentinel + block body with zero-padded final block) and no Vec→Buffer copy.
-    let mut out = ByteBufferMut::with_capacity(acc as usize);
-    unsafe { out.set_len(acc as usize) };
-    let out_slice = out.as_mut_slice();
-    let mut src = 0usize;
-    for (i, &l) in lens.iter().enumerate() {
-        let pos = offsets[i] as usize;
-        out_slice[pos] = NON_EMPTY_SENTINEL;
-        if l != 0 {
-            block_encode(&bytes[src..src + l], &mut out_slice[pos + 1..]);
-        }
-        src += l;
-    }
-
-    let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable);
-    let offsets_arr =
-        PrimitiveArray::new(Buffer::<u32>::copy_from(&offsets), Validity::NonNullable);
-    let sizes_arr = PrimitiveArray::new(Buffer::<u32>::copy_from(&sizes), Validity::NonNullable);
-    ListViewArray::try_new(
-        elements.into_array(),
-        offsets_arr.into_array(),
-        sizes_arr.into_array(),
-        Validity::NonNullable,
-    )
-    .unwrap()
-    .into_array()
-}
-
-/// "Scatter right": keep FSST's fast contiguous bulk decompressor, but run it into a
-/// cache-resident scratch one row-batch at a time, then scatter each row into block form from
-/// cache. The decompressed bytes never round-trip through main memory — unlike `fast_fused`,
-/// which materializes the whole 6.4 MB decompressed buffer and reads it back to block-encode.
-fn fast_scatter(fsst: &ArrayRef) -> ArrayRef {
-    // Scratch sized to stay resident in L1/L2; each batch decompresses up to this many bytes.
-    const SCRATCH: usize = 16 * 1024;
-
-    let mut ctx = LEGACY_SESSION.create_execution_ctx();
-    let view = fsst.as_opt::<FSST>().expect("FSST array");
-
-    let lens_arr = view
-        .uncompressed_lengths()
-        .clone()
-        .execute::<PrimitiveArray>(&mut ctx)
-        .unwrap();
-    let lens: Vec<usize> = match_each_integer_ptype!(lens_arr.ptype(), |P| {
-        lens_arr
-            .as_slice::<P>()
-            .iter()
-            .map(|x| *x as usize)
-            .collect()
-    });
-    let nrows = lens.len();
-
-    // Per-row compressed code offsets (relative to the sliced heap start).
-    let codes = view.codes();
-    let heap = codes.sliced_bytes();
-    let code_off_arr = codes
-        .offsets()
-        .clone()
-        .execute::<PrimitiveArray>(&mut ctx)
-        .unwrap();
-    let base = match_each_integer_ptype!(code_off_arr.ptype(), |P| {
-        code_off_arr.as_slice::<P>()[0] as usize
-    });
-    let code_off: Vec<usize> = match_each_integer_ptype!(code_off_arr.ptype(), |P| {
-        code_off_arr
-            .as_slice::<P>()
-            .iter()
-            .map(|x| *x as usize - base)
-            .collect()
-    });
-
-    // Output sizing (free from stored lengths).
-    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
-    let mut sizes: Vec<u32> = Vec::with_capacity(nrows);
-    let mut acc: u32 = 0;
-    let mut max_row = 0usize;
-    for &l in &lens {
-        offsets.push(acc);
-        let sz = encoded_len(l);
-        sizes.push(sz);
-        acc += sz;
-        max_row = max_row.max(l);
-    }
-    let mut out = ByteBufferMut::with_capacity(acc as usize);
-    unsafe { out.set_len(acc as usize) };
-    let out_slice = out.as_mut_slice();
-
-    let decompressor = view.decompressor();
-    let scratch_cap = SCRATCH.max(max_row) + 8;
-    let mut scratch = ByteBufferMut::with_capacity(scratch_cap);
-
-    let mut r = 0usize;
-    while r < nrows {
-        // Grow a batch until it would overflow the scratch (always at least one row).
-        let bs = r;
-        let mut batch_bytes = 0usize;
-        while r < nrows && (r == bs || batch_bytes + lens[r] <= SCRATCH) {
-            batch_bytes += lens[r];
-            r += 1;
-        }
-        let be = r;
-
-        // Decompress this batch's codes in one fast call into the cache-resident scratch.
-        let cslice = &heap.as_slice()[code_off[bs]..code_off[be]];
-        let n = decompressor.decompress_into(cslice, scratch.spare_capacity_mut());
-        unsafe { scratch.set_len(n) };
-        let sbytes = scratch.as_slice();
-
-        // Scatter each row from cache into block form.
-        let mut local = 0usize;
-        for i in bs..be {
-            let l = lens[i];
-            let pos = offsets[i] as usize;
-            out_slice[pos] = NON_EMPTY_SENTINEL;
-            if l != 0 {
-                block_encode(&sbytes[local..local + l], &mut out_slice[pos + 1..]);
-            }
-            local += l;
-        }
-        unsafe { scratch.set_len(0) };
-    }
-
-    let elements = PrimitiveArray::new(out.freeze(), Validity::NonNullable);
-    let offsets_arr =
-        PrimitiveArray::new(Buffer::<u32>::copy_from(&offsets), Validity::NonNullable);
-    let sizes_arr = PrimitiveArray::new(Buffer::<u32>::copy_from(&sizes), Validity::NonNullable);
-    ListViewArray::try_new(
-        elements.into_array(),
-        offsets_arr.into_array(),
-        sizes_arr.into_array(),
-        Validity::NonNullable,
-    )
-    .unwrap()
-    .into_array()
-}
-
-fn main() {
-    // Correctness: the batched cache-resident scatter must produce identical row keys to the
-    // straightforward fused path.
-    {
-        let (fsst, _) = build_fsst();
-        assert_arrays_eq!(fast_scatter(&fsst), fast_fused(&fsst));
-    }
-    divan::main();
-}
-
-/// "Scatter right" fused path: cache-resident batched decompress + scatter into block form.
-#[divan::bench]
-fn fsst_fast_scatter(bencher: divan::Bencher) {
-    let (fsst, total_bytes) = build_fsst();
-    bencher
-        .counter(BytesCount::new(total_bytes))
-        .bench_local(|| fast_scatter(&fsst));
-}
-
-/// Status quo: decompress FSST to a canonical `VarBinView`, then row-encode it.
-#[divan::bench]
-fn fsst_unpack_then_convert(bencher: divan::Bencher) {
-    let (fsst, total_bytes) = build_fsst();
-    let encoder = RowEncoder::default();
-    bencher
-        .counter(BytesCount::new(total_bytes))
-        .bench_local(|| {
-            let mut ctx = LEGACY_SESSION.create_execution_ctx();
-            let decoded = fsst
-                .clone()
-                .execute::<Canonical>(&mut ctx)
-                .unwrap()
-                .into_array();
-            encoder.encode(&[decoded], &mut ctx).unwrap()
-        });
-}
-
-/// Fused fast path: bulk-decompress directly into the row-key block format, skipping the
-/// intermediate `VarBinViewArray` and the generic row-encoder (size pass is free).
-#[divan::bench]
-fn fsst_fast_fused(bencher: divan::Bencher) {
-    let (fsst, total_bytes) = build_fsst();
-    bencher
-        .counter(BytesCount::new(total_bytes))
-        .bench_local(|| fast_fused(&fsst));
-}
-
-/// Irreducible floor: FSST decompression alone (a direct kernel must still produce these
-/// bytes, since the sort key *is* the decompressed bytes).
-#[divan::bench]
-fn fsst_decompress_only(bencher: divan::Bencher) {
-    let (fsst, total_bytes) = build_fsst();
-    bencher
-        .counter(BytesCount::new(total_bytes))
-        .bench_local(|| decompress(&fsst));
-}
-
-/// Row-encode an already-decompressed `VarBinView`. The writes into the decompressed buffer +
-/// views that precede this step are what a fused direct kernel would eliminate.
-#[divan::bench]
-fn plain_row_encode_only(bencher: divan::Bencher) {
-    let (fsst, total_bytes) = build_fsst();
-    let decoded = decompress(&fsst);
-    let encoder = RowEncoder::default();
-    bencher
-        .counter(BytesCount::new(total_bytes))
-        .bench_local(|| {
-            let mut ctx = LEGACY_SESSION.create_execution_ctx();
-            encoder
-                .encode(std::slice::from_ref(&decoded), &mut ctx)
-                .unwrap()
-        });
-}

From 9e2f1432a84adf2a1668483256c43f4ca4f8c76b Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Fri, 5 Jun 2026 11:46:17 +0100
Subject: [PATCH 19/19] add row encoder order fuzzer

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                     |   1 +
 fuzz/Cargo.toml                |   9 +
 fuzz/fuzz_targets/row_order.rs |  18 ++
 fuzz/src/lib.rs                |   3 +
 fuzz/src/row_order.rs          | 354 +++++++++++++++++++++++++++++++++
 5 files changed, 385 insertions(+)
 create mode 100644 fuzz/fuzz_targets/row_order.rs
 create mode 100644 fuzz/src/row_order.rs

diff --git a/Cargo.lock b/Cargo.lock
index 967f0a18a09..1a54b9285e1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9745,6 +9745,7 @@ dependencies = [
  "vortex-fsst",
  "vortex-io",
  "vortex-mask",
+ "vortex-row",
  "vortex-runend",
  "vortex-session",
  "vortex-utils",
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index e2d05b706f9..c52b9d3b1cc 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -39,6 +39,7 @@ vortex-error = { workspace = true }
 vortex-fsst = { workspace = true }
 vortex-io = { workspace = true }
 vortex-mask = { workspace = true }
+vortex-row = { workspace = true }
 vortex-runend = { workspace = true, features = ["arbitrary"] }
 vortex-session = { workspace = true }
 vortex-utils = { workspace = true }
@@ -97,6 +98,14 @@ path = "fuzz_targets/fsst_like.rs"
 test = false
 required-features = ["native"]
 
+[[bin]]
+bench = false
+doc = false
+name = "row_order"
+path = "fuzz_targets/row_order.rs"
+test = false
+required-features = ["native"]
+
 [[bin]]
 bench = false
 doc = false
diff --git a/fuzz/fuzz_targets/row_order.rs b/fuzz/fuzz_targets/row_order.rs
new file mode 100644
index 00000000000..758774fe6c7
--- /dev/null
+++ b/fuzz/fuzz_targets/row_order.rs
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![no_main]
+
+use libfuzzer_sys::Corpus;
+use libfuzzer_sys::fuzz_target;
+use vortex_error::vortex_panic;
+use vortex_fuzz::FuzzRowOrder;
+use vortex_fuzz::run_row_order_fuzz;
+
+fuzz_target!(|fuzz: FuzzRowOrder| -> Corpus {
+    match run_row_order_fuzz(fuzz) {
+        Ok(true) => Corpus::Keep, // run returned `true`: keep the input in the corpus
+        Ok(false) => Corpus::Reject, // run returned `false`: keep the input out of the corpus
+        Err(e) => vortex_panic!("{e}"), // any error becomes a panic carrying its message
+    }
+});
diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs
index b0abf660045..43ff5a74629 100644
--- a/fuzz/src/lib.rs
+++ b/fuzz/src/lib.rs
@@ -7,6 +7,7 @@ mod array;
 pub mod compress;
 pub mod error;
 pub mod fsst_like;
+pub mod row_order;
 
 // File module only available for native builds (requires vortex-file which uses tokio)
 #[cfg(not(target_arch = "wasm32"))]
@@ -31,6 +32,8 @@ pub use fsst_like::run_fsst_like_fuzz;
 pub use gpu::FuzzCompressGpu;
 #[cfg(feature = "cuda")]
 pub use gpu::run_compress_gpu;
+pub use row_order::FuzzRowOrder;
+pub use row_order::run_row_order_fuzz;
 
 pub const FUZZ_ARRAY_MAX_LEN: usize = 2048;
 pub const FUZZ_FILE_ARRAY_MAX_LEN: usize = 16_384;
diff --git a/fuzz/src/row_order.rs b/fuzz/src/row_order.rs
new file mode 100644
index 00000000000..91fd962e7cf
--- /dev/null
+++ b/fuzz/src/row_order.rs
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::cmp::Ordering;
+use std::sync::Arc;
+
+use arbitrary::Arbitrary;
+use arbitrary::Result;
+use arbitrary::Unstructured;
+use vortex_array::ArrayRef;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::arbitrary::ArbitraryArray;
+use vortex_array::arrays::arbitrary::ArbitraryArrayConfig;
+use vortex_array::arrays::arbitrary::ArbitraryWith;
+use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::DecimalDType;
+use vortex_array::dtype::FieldName;
+use vortex_array::dtype::FieldNames;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::StructFields;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar::ScalarValue;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_error::vortex_err;
+use vortex_row::RowEncoder;
+use vortex_row::RowSortField;
+
+use crate::SESSION;
+use crate::error::Backtrace;
+use crate::error::VortexFuzzError;
+use crate::error::VortexFuzzResult;
+
+const MAX_COLUMNS: usize = 4; // upper bound on the number of columns per fuzz case
+const MAX_ROWS_PER_SIDE: usize = 32; // upper bound on rows in each of the two batches
+const MAX_NESTING_DEPTH: u8 = 2; // starting depth budget for nested dtypes
+const MAX_STRUCT_FIELDS: usize = 3; // upper bound on fields in a generated struct dtype
+const MAX_FIXED_SIZE_LIST_LEN: u32 = 3; // upper bound on a generated fixed-size list length
+
+#[derive(Debug)]
+pub struct FuzzRowOrder {
+    left_cols: Vec<ArrayRef>, // left batch, one array per column
+    right_cols: Vec<ArrayRef>, // right batch, generated with the same dtype per column
+    sort_fields: Vec<RowSortField>, // one ordering option per column
+}
+
+impl<'a> Arbitrary<'a> for FuzzRowOrder {
+    /// Draws 1..=`MAX_COLUMNS` columns; each column uses one dtype for both batches.
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        let n_cols = u.int_in_range(1..=MAX_COLUMNS)?;
+        let n_left = u.int_in_range(1..=MAX_ROWS_PER_SIDE)?;
+        let n_right = u.int_in_range(1..=MAX_ROWS_PER_SIDE)?;
+
+        let mut case = Self {
+            left_cols: Vec::with_capacity(n_cols),
+            right_cols: Vec::with_capacity(n_cols),
+            sort_fields: Vec::with_capacity(n_cols),
+        };
+        for _ in 0..n_cols {
+            // The left and right arrays of a column are generated from the same dtype.
+            let dtype = random_supported_dtype(u, MAX_NESTING_DEPTH)?;
+            case.left_cols.push(random_array(u, dtype.clone(), n_left)?);
+            case.right_cols.push(random_array(u, dtype, n_right)?);
+            let sort_field = RowSortField::new(u.arbitrary()?, u.arbitrary()?);
+            case.sort_fields.push(sort_field);
+        }
+
+        Ok(case)
+    }
+}
+
+#[expect(clippy::result_large_err)] // VortexFuzzError is a large Err type; accepted here
+pub fn run_row_order_fuzz(fuzz: FuzzRowOrder) -> VortexFuzzResult<bool> {
+    run_row_order_fuzz_inner(fuzz)
+        .map_err(|err| VortexFuzzError::VortexError(err, Backtrace::capture()))
+}
+
+/// Cross-checks the row encoding against a value-level comparison.
+///
+/// Both batches are encoded with one `RowEncoder`; for every (left row, right row)
+/// pair the byte-wise order of the encoded rows must equal the order produced by
+/// `compare_rows`. Returns `Ok(true)` when all pairs agree, an error otherwise.
+fn run_row_order_fuzz_inner(fuzz: FuzzRowOrder) -> VortexResult<bool> {
+    let (lhs, rhs, fields) = (&fuzz.left_cols, &fuzz.right_cols, &fuzz.sort_fields);
+
+    // Encode each side separately, sharing the encoder and execution context.
+    let mut ctx = SESSION.create_execution_ctx();
+    let encoder = RowEncoder::new(fields.iter().copied());
+    let lhs_encoded = encoder.encode(lhs, &mut ctx)?;
+    let lhs_rows = collect_row_bytes(&lhs_encoded, &mut ctx)?;
+    let rhs_encoded = encoder.encode(rhs, &mut ctx)?;
+    let rhs_rows = collect_row_bytes(&rhs_encoded, &mut ctx)?;
+
+    // Check the full cross product of left and right rows.
+    for (li, lhs_bytes) in lhs_rows.iter().enumerate() {
+        for (ri, rhs_bytes) in rhs_rows.iter().enumerate() {
+            let expected = compare_rows(lhs, li, rhs, ri, fields, &mut ctx)?;
+            let actual = lhs_bytes.cmp(rhs_bytes);
+            if expected == actual {
+                continue;
+            }
+            // Report everything needed to reproduce the disagreement.
+            vortex_bail!(
+                "row-order mismatch comparing left row {} to right row {}: \
+                 array order {:?}, row-byte order {:?}, dtypes {:?}, sort fields {:?}, \
+                 left bytes {:?}, right bytes {:?}",
+                li,
+                ri,
+                expected,
+                actual,
+                lhs.iter().map(|col| col.dtype()).collect::<Vec<_>>(),
+                fields,
+                lhs_bytes,
+                rhs_bytes
+            );
+        }
+    }
+
+    Ok(true)
+}
+
+fn collect_row_bytes(
+    encoded: &vortex_array::arrays::ListViewArray,
+    ctx: &mut vortex_array::ExecutionCtx,
+) -> VortexResult<Vec<Vec<u8>>> {
+    let mut rows = Vec::with_capacity(encoded.len()); // one owned byte vector per encoded row
+    for row_idx in 0..encoded.len() {
+        let elements = encoded.list_elements_at(row_idx)?;
+        let bytes = elements.execute::<PrimitiveArray>(ctx)?;
+        rows.push(bytes.as_slice::<u8>().to_vec());
+    }
+    Ok(rows)
+}
+
+/// Orders two rows lexicographically by column; the first unequal column decides.
+fn compare_rows(
+    left_cols: &[ArrayRef],
+    left_idx: usize,
+    right_cols: &[ArrayRef],
+    right_idx: usize,
+    sort_fields: &[RowSortField],
+    ctx: &mut vortex_array::ExecutionCtx,
+) -> VortexResult<Ordering> {
+    for ((lhs_col, rhs_col), &field) in left_cols.iter().zip(right_cols).zip(sort_fields) {
+        let lhs = lhs_col.execute_scalar(left_idx, ctx)?;
+        let rhs = rhs_col.execute_scalar(right_idx, ctx)?;
+        let ordering = compare_scalar(&lhs, &rhs, field)?;
+        if ordering != Ordering::Equal {
+            return Ok(ordering);
+        }
+    }
+    Ok(Ordering::Equal)
+}
+
+/// Compares two scalars whose dtypes match up to nullability, using `field`'s options.
+fn compare_scalar(left: &Scalar, right: &Scalar, field: RowSortField) -> VortexResult<Ordering> {
+    if left.dtype().eq_ignore_nullability(right.dtype()) {
+        return compare_scalar_values(left.dtype(), left.value(), right.value(), field);
+    }
+    vortex_bail!(
+        "cannot compare row scalars with different dtypes: {} vs {}",
+        left.dtype(),
+        right.dtype()
+    )
+}
+
+/// Orders two possibly-null values of type `dtype` under `field`'s options.
+fn compare_scalar_values(
+    dtype: &DType,
+    left: Option<&ScalarValue>,
+    right: Option<&ScalarValue>,
+    field: RowSortField,
+) -> VortexResult<Ordering> {
+    let Some((left, right)) = left.zip(right) else {
+        return Ok(compare_nulls(left.is_none(), right.is_none(), field));
+    };
+    match dtype {
+        DType::List(..) | DType::Variant(_) | DType::Union(_) | DType::Extension(_) => {
+            vortex_bail!("row-order fuzzer generated unsupported dtype: {dtype}")
+        }
+        DType::Null => Ok(Ordering::Equal),
+        DType::Struct(members, _) => compare_struct_values(members, left, right, field),
+        DType::FixedSizeList(elem, len, _) => {
+            compare_fixed_size_list_values(elem, *len, left, right, field)
+        }
+        _ => compare_leaf_values(dtype, left, right, field),
+    }
+}
+
+/// Ranks a null against a non-null value using only `field.nulls_first`.
+/// Two nulls, or two non-nulls, compare as `Ordering::Equal`.
+fn compare_nulls(left_is_null: bool, right_is_null: bool, field: RowSortField) -> Ordering {
+    if left_is_null == right_is_null {
+        return Ordering::Equal;
+    }
+    // Exactly one side is null from here on. Work out where a null on the left
+    // would sort, then mirror the answer if the null is really on the right.
+    let null_on_left = if field.nulls_first {
+        Ordering::Less
+    } else {
+        Ordering::Greater
+    };
+    if left_is_null {
+        null_on_left
+    } else {
+        null_on_left.reverse()
+    }
+}
+
+/// Compares two struct scalars field by field, in the order declared by `fields`.
+///
+/// Each child is compared with the parent's `field` options; the first unequal child decides.
+fn compare_struct_values(
+    fields: &StructFields,
+    left: &ScalarValue,
+    right: &ScalarValue,
+    field: RowSortField,
+) -> VortexResult<Ordering> {
+    let (ScalarValue::Tuple(lhs), ScalarValue::Tuple(rhs)) = (left, right) else {
+        vortex_bail!("struct dtype expected tuple scalar values");
+    };
+    // Both tuples must carry exactly one entry per declared field.
+    let nfields = fields.nfields();
+    if lhs.len() != nfields || rhs.len() != nfields {
+        vortex_bail!(
+            "struct scalar field count mismatch: expected {}, got {} and {}",
+            nfields,
+            lhs.len(),
+            rhs.len()
+        );
+    }
+
+    // Walk the declared field dtypes in lockstep with both tuples.
+    for ((child_dtype, lhs_child), rhs_child) in fields.fields().zip(lhs).zip(rhs) {
+        let ordering =
+            compare_scalar_values(&child_dtype, lhs_child.as_ref(), rhs_child.as_ref(), field)?;
+        if ordering != Ordering::Equal {
+            return Ok(ordering);
+        }
+    }
+
+    Ok(Ordering::Equal)
+}
+
+/// Compares two fixed-size list scalars element by element; the first difference decides.
+///
+/// Elements are compared with the parent's `field` options.
+fn compare_fixed_size_list_values(
+    element_dtype: &DType,
+    list_size: u32,
+    left: &ScalarValue,
+    right: &ScalarValue,
+    field: RowSortField,
+) -> VortexResult<Ordering> {
+    let (ScalarValue::Tuple(lhs), ScalarValue::Tuple(rhs)) = (left, right) else {
+        vortex_bail!("fixed-size list dtype expected tuple scalar values");
+    };
+    // Both tuples must hold exactly `list_size` elements.
+    let expected_len = list_size as usize;
+    if lhs.len() != expected_len || rhs.len() != expected_len {
+        vortex_bail!(
+            "fixed-size list scalar length mismatch: expected {}, got {} and {}",
+            expected_len,
+            lhs.len(),
+            rhs.len()
+        );
+    }
+
+    // Walk both element tuples in lockstep.
+    for (lhs_elem, rhs_elem) in lhs.iter().zip(rhs) {
+        let ordering =
+            compare_scalar_values(element_dtype, lhs_elem.as_ref(), rhs_elem.as_ref(), field)?;
+        if ordering != Ordering::Equal {
+            return Ok(ordering);
+        }
+    }
+
+    Ok(Ordering::Equal)
+}
+
+/// Compares two non-null leaf values by rebuilding them as `Scalar`s of type `dtype`.
+/// The natural order is reversed when `field.descending` is set.
+fn compare_leaf_values(
+    dtype: &DType,
+    left: &ScalarValue,
+    right: &ScalarValue,
+    field: RowSortField,
+) -> VortexResult<Ordering> {
+    let lhs = Scalar::try_new(dtype.clone(), Some(left.clone()))?;
+    let rhs = Scalar::try_new(dtype.clone(), Some(right.clone()))?;
+    let Some(natural) = lhs.partial_cmp(&rhs) else {
+        return Err(vortex_err!(
+            "scalar comparison returned None for matching row-order dtype {}",
+            dtype
+        ));
+    };
+    if field.descending {
+        return Ok(natural.reverse());
+    }
+    Ok(natural)
+}
+
+/// Draws an arbitrary array of the given `dtype`, requesting a length of exactly `len`.
+fn random_array(u: &mut Unstructured<'_>, dtype: DType, len: usize) -> Result<ArrayRef> {
+    // A one-value range pins the requested length.
+    let config = ArbitraryArrayConfig {
+        dtype: Some(dtype),
+        len: len..=len,
+    };
+    let generated = ArbitraryArray::arbitrary_with_config(u, &config)?;
+    Ok(generated.0)
+}
+
+fn random_supported_dtype(u: &mut Unstructured<'_>, depth: u8) -> Result<DType> {
+    let max_kind = if depth == 0 { 5 } else { 7 }; // kinds 6 and 7 recurse; barred at depth 0
+    Ok(match u.int_in_range(0..=max_kind)? {
+        0 => DType::Null,
+        1 => DType::Bool(u.arbitrary()?),
+        2 => DType::Primitive(PType::arbitrary(u)?, u.arbitrary()?),
+        3 => DType::Decimal(random_supported_decimal_dtype(u)?, u.arbitrary()?),
+        4 => DType::Utf8(u.arbitrary()?),
+        5 => DType::Binary(u.arbitrary()?),
+        6 => DType::Struct(
+            random_supported_struct_fields(u, depth - 1)?, // depth > 0 here, so no underflow
+            u.arbitrary()?,
+        ),
+        7 => DType::FixedSizeList(
+            Arc::new(random_supported_dtype(u, depth - 1)?), // depth > 0 here, so no underflow
+            u.int_in_range(0..=MAX_FIXED_SIZE_LIST_LEN)?, // zero-length lists are allowed
+            u.arbitrary()?,
+        ),
+        _ => unreachable!("dtype kind range is bounded"), // max_kind is never above 7
+    })
+}
+
+fn random_supported_decimal_dtype(u: &mut Unstructured<'_>) -> Result<DecimalDType> {
+    let precision = u.int_in_range(1..=38)?; // at most 38, so the `as i8` cast below is lossless
+    let scale = u.int_in_range(-18..=precision as i8)?; // scale is capped at the precision
+    Ok(DecimalDType::new(precision, scale))
+}
+
+/// Builds between 0 and `MAX_STRUCT_FIELDS` fields named `f0`, `f1`, ... with random dtypes.
+fn random_supported_struct_fields(u: &mut Unstructured<'_>, depth: u8) -> Result<StructFields> {
+    let field_count = u.int_in_range(0..=MAX_STRUCT_FIELDS)?;
+    let mut names = Vec::with_capacity(field_count);
+    let mut dtypes = Vec::with_capacity(field_count);
+    for idx in 0..field_count {
+        names.push(FieldName::from(format!("f{idx}")));
+        dtypes.push(random_supported_dtype(u, depth)?);
+    }
+    Ok(StructFields::new(FieldNames::from(names), dtypes))
+}