From 26d987f9dbd9a3c57fe7b1847518efbf902b3633 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 21:19:11 +0000 Subject: [PATCH 1/5] feat(codec): PR-X12 A2 mode bit-pack + A3-intra prediction kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A2 — mode.rs (~270 lines, 11 tests): - 16-bit header: 2-bit mode (Skip/Merge/Delta/Escape) + 12-bit basin_idx - MergeDir 2-bit pack/unpack (high bits masked) - Whole-leaf compact pack/unpack: Skip=2B, Merge=3B, Delta=3B, Escape=6B - packed_byte_len() const fn for buffer pre-sizing - MAX_BASIN_IDX (4095) + BASIN_NONE sentinel - Stream roundtrip test for mixed-mode leaves A3-intra — predict.rs (~330 lines, 12 tests): - IntraContext { basin_idx, delta_i32, NESW neighbours } - IntraConfig { escape_next_idx: Option } - predict_intra() decision tree: Skip → Merge → Delta → Escape (monotone wire cost 2 → 3 → 3 → 6 bytes; cheapest-fit policy) - Merge match: same basin_idx + same δ as u8 (sign-tolerant wrapping cast) - Escape fallback: lossy i8 clamp when allocator absent (never panics) - End-to-end pack/unpack chain test through the decision Deferred to follow-up: A3-inter (cross-tier neighbour scan from BlockedGrid L2/L3), A4 transform, A6 RDO, A7 rANS, A8 stream framing. --- src/hpc/codec/mod.rs | 17 +- src/hpc/codec/mode.rs | 380 +++++++++++++++++++++++++++++++++++++ src/hpc/codec/predict.rs | 397 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 791 insertions(+), 3 deletions(-) create mode 100644 src/hpc/codec/mode.rs create mode 100644 src/hpc/codec/predict.rs diff --git a/src/hpc/codec/mod.rs b/src/hpc/codec/mod.rs index 54f171bd..2f23c294 100644 --- a/src/hpc/codec/mod.rs +++ b/src/hpc/codec/mod.rs @@ -8,9 +8,13 @@ //! # Module layout (per PR-X12 worker decomposition) //! //! - [`ctu`] — A1: `Ctu` carrier + `CtuPartition` enum + quad-tree -//! split / merge ops. **Shipped in this PR.** -//! - `mode`, `predict`, `transform`, `quantize`, `rdo`, `ans`, `stream` -//! 
— A2-A8, queued as follow-up sprints. +//! split / merge ops. +//! - [`mode`] — A2: bit-pack / unpack helpers for the on-wire 16-bit +//! header + per-mode tail (Skip/Merge/Delta/Escape). +//! - [`predict`] — A3-intra: encoder-side mode-decision kernel that +//! picks the cheapest `LeafCu` from a cell + NESW neighbours. +//! - `transform`, `quantize`, `rdo`, `ans`, `stream` — A4-A8, queued as +//! follow-up sprints. //! //! # Feature gate //! @@ -22,6 +26,13 @@ //! `.claude/knowledge/pr-x12-codec-x265-design.md` — master design doc. pub mod ctu; +pub mod mode; +pub mod predict; pub use ctu::{CellMode, MergeDir, MAX_QUAD_TREE_NODES, MAX_SPLIT_DEPTH}; pub use ctu::{Ctu, CtuArena, CtuPartition, LeafCu, MaxSplitDepthReached, MergeError, NodeIdx}; +pub use mode::{ + pack_header, pack_leaf, pack_merge_dir, packed_byte_len, unpack_header, unpack_leaf, unpack_merge_dir, BASIN_NONE, + MAX_BASIN_IDX, +}; +pub use predict::{is_no_basin, predict_intra, IntraConfig, IntraContext}; diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs new file mode 100644 index 00000000..d18f5c96 --- /dev/null +++ b/src/hpc/codec/mode.rs @@ -0,0 +1,380 @@ +//! Mode bit-pack / unpack helpers (PR-X12 A2). +//! +//! Compact wire-friendly representation of the [`CellMode`] + +//! [`MergeDir`] + [`LeafCu`] fields from [`super::ctu`]. The functions +//! here are the inverse of each other and pack into the smallest +//! integer width that fits, leaving the per-mode payload (`delta`, +//! `escape_idx`) for callers to append/consume as raw bytes. +//! +//! # Header layout — `pack_header` / `unpack_header` +//! +//! Each leaf has a fixed 16-bit header followed by a variable-width +//! tail. The header packs the most-frequently-accessed fields so a +//! decoder can route on a single `u16` load: +//! +//! ```text +//! MSB LSB +//! ┌──┬──┬──────────────────────────────┐ +//! │M0│M1│ basin_idx (12) │ ← 16-bit header +//! └──┴──┴──────────────────────────────┘ +//! 
│ │ └─ basin_idx is the only payload field always present +//! └──┴──── 2-bit mode discriminant (CellMode::as_u8()) +//! (bits 12-13; bits 14-15 are reserved and written as 0) +//! ``` +//! +//! The remaining 2 bits at the top of the second byte are reserved for +//! the encoder's future `merge_dir` overlap when the mode is `Merge`; +//! a separate `pack_merge_dir` helper keeps `Merge`'s direction in a +//! single byte alongside `Skip`/`Delta`/`Escape`'s mode tag. +//! +//! # Per-mode tail width +//! +//! | Mode | Header | Tail bytes | Total | +//! |--------|--------|--------------------------|-------| +//! | Skip | 2 | 0 | 2 | +//! | Merge | 2 | 1 (`MergeDir` 2-bit) | 3 | +//! | Delta | 2 | 1 (`u8` perturbation) | 3 | +//! | Escape | 2 | 4 (`u32` escape_idx, LE) | 6 | +//! +//! The compact pack writes header (LE) then the per-mode tail. The +//! `escape_idx` width is the worst case; a future A7 rANS pass can +//! shrink it via per-frame frequency tables — A2 stays format-stable. +//! +//! # What A2 does NOT do +//! +//! - **Bytestream framing** (frame headers, CTU markers) — lives in +//! PR-X12 A8 `stream.rs`. +//! - **Entropy coding** (rANS) — lives in PR-X12 A7 `ans.rs`. A2's +//! output is the input to A7. +//! - **Per-frame escape vector** — caller maintains it; A2 packs the +//! `escape_idx` referencing into the leaf header. + +use super::ctu::{CellMode, LeafCu, MergeDir}; + +// ════════════════════════════════════════════════════════════════════ +// Header pack / unpack (16-bit) +// ════════════════════════════════════════════════════════════════════ + +/// Maximum encodable `basin_idx`. Stored in the lower 12 bits of the +/// header; values > this constant overflow the header field. +pub const MAX_BASIN_IDX: u16 = (1 << 12) - 1; // 4095 + +/// Tag inside the per-frame basin codebook for "no basin assigned" +/// (encoder-side sentinel during mode decision). +pub const BASIN_NONE: u16 = MAX_BASIN_IDX; + +/// Pack `(mode, basin_idx)` into a 16-bit header. 
+/// +/// `basin_idx` must be `<= MAX_BASIN_IDX` (12 bits). Higher bits are +/// silently truncated; the encoder should clamp before calling. +/// +/// ``` +/// use ndarray::hpc::codec::mode::{pack_header, unpack_header}; +/// use ndarray::hpc::codec::CellMode; +/// let h = pack_header(CellMode::Delta, 1234); +/// assert_eq!(unpack_header(h), (CellMode::Delta, 1234)); +/// ``` +#[inline] +pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 { + let mode_bits = (mode as u16) & 0b11; + let basin_bits = basin_idx & MAX_BASIN_IDX; + (mode_bits << 12) | basin_bits +} + +/// Unpack a 16-bit header into `(mode, basin_idx)`. +/// +/// The 2-bit mode field always decodes (all 4 variants are valid). +/// `basin_idx` is the 12-bit lower field, exactly as packed. +#[inline] +pub fn unpack_header(packed: u16) -> (CellMode, u16) { + let mode_bits = ((packed >> 12) & 0b11) as u8; + let basin_idx = packed & MAX_BASIN_IDX; + let mode = match mode_bits { + 0b00 => CellMode::Skip, + 0b01 => CellMode::Merge, + 0b10 => CellMode::Delta, + _ => CellMode::Escape, + }; + (mode, basin_idx) +} + +// ════════════════════════════════════════════════════════════════════ +// MergeDir 2-bit pack / unpack +// ════════════════════════════════════════════════════════════════════ + +/// Pack a [`MergeDir`] into the lower 2 bits of a `u8`. +#[inline] +pub fn pack_merge_dir(dir: MergeDir) -> u8 { + dir as u8 +} + +/// Unpack the lower 2 bits of a `u8` into a [`MergeDir`]. +/// +/// All four 2-bit values map to a valid `MergeDir`; bits 2-7 are +/// ignored. +#[inline] +pub fn unpack_merge_dir(byte: u8) -> MergeDir { + match byte & 0b11 { + 0 => MergeDir::North, + 1 => MergeDir::East, + 2 => MergeDir::West, + _ => MergeDir::South, + } +} + +// ════════════════════════════════════════════════════════════════════ +// Whole-leaf pack / unpack +// ════════════════════════════════════════════════════════════════════ + +/// Compact pack: writes header (2 bytes, LE) + per-mode tail into +/// `out`. 
Returns the number of bytes written. +/// +/// The buffer must have at least 6 bytes of space (the Escape-mode +/// worst case) — callers iterating CTUs typically pre-allocate +/// `6 * cell_count` and trim afterwards. +/// +/// Returns `None` if `out.len() < 6` (insufficient capacity). +/// +/// Format: +/// - Bytes 0-1: header (`pack_header(mode, basin_idx)`, LE) +/// - Bytes 2..: per-mode tail (see module docs) +/// +/// ``` +/// use ndarray::hpc::codec::mode::{pack_leaf, unpack_leaf}; +/// use ndarray::hpc::codec::LeafCu; +/// let leaf = LeafCu::delta(42, 0x7F); +/// let mut buf = [0u8; 6]; +/// let n = pack_leaf(&leaf, &mut buf).unwrap(); +/// assert_eq!(n, 3); +/// let (decoded, consumed) = unpack_leaf(&buf).unwrap(); +/// assert_eq!(decoded, leaf); +/// assert_eq!(consumed, 3); +/// ``` +pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> { + if out.len() < 6 { + return None; + } + let header = pack_header(leaf.mode, leaf.basin_idx); + out[..2].copy_from_slice(&header.to_le_bytes()); + let tail_len = match leaf.mode { + CellMode::Skip => 0, + CellMode::Merge => { + // Caller guarantees `merge_dir.is_some()` for `Merge` mode + // (LeafCu::merge constructor enforces this). Fall back to + // North if the invariant is violated, to keep encoder + // robustness — the decoder will still produce a valid leaf. + out[2] = pack_merge_dir(leaf.merge_dir.unwrap_or(MergeDir::North)); + 1 + } + CellMode::Delta => { + out[2] = leaf.delta.unwrap_or(0); + 1 + } + CellMode::Escape => { + let idx = leaf.escape_idx.unwrap_or(0); + out[2..6].copy_from_slice(&idx.to_le_bytes()); + 4 + } + }; + Some(2 + tail_len) +} + +/// Compact unpack: reads header + per-mode tail from `buf`. Returns +/// `(leaf, bytes_consumed)`. +/// +/// Returns `None` if the buffer is shorter than the per-mode width +/// (2 for Skip, 3 for Merge/Delta, 6 for Escape). 
+pub fn unpack_leaf(buf: &[u8]) -> Option<(LeafCu, usize)> { + if buf.len() < 2 { + return None; + } + let header = u16::from_le_bytes([buf[0], buf[1]]); + let (mode, basin_idx) = unpack_header(header); + let (leaf, consumed) = match mode { + CellMode::Skip => (LeafCu::skip(basin_idx), 2), + CellMode::Merge => { + if buf.len() < 3 { + return None; + } + (LeafCu::merge(basin_idx, unpack_merge_dir(buf[2])), 3) + } + CellMode::Delta => { + if buf.len() < 3 { + return None; + } + (LeafCu::delta(basin_idx, buf[2]), 3) + } + CellMode::Escape => { + if buf.len() < 6 { + return None; + } + let idx = u32::from_le_bytes([buf[2], buf[3], buf[4], buf[5]]); + (LeafCu::escape(basin_idx, idx), 6) + } + }; + Some((leaf, consumed)) +} + +/// Byte cost of packing a leaf in this mode. Useful for pre-sizing +/// a buffer without packing first. +#[inline] +pub const fn packed_byte_len(mode: CellMode) -> usize { + match mode { + CellMode::Skip => 2, + CellMode::Merge => 3, + CellMode::Delta => 3, + CellMode::Escape => 6, + } +} + +// ════════════════════════════════════════════════════════════════════ +// Tests +// ════════════════════════════════════════════════════════════════════ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn header_roundtrip_all_modes_and_basin_extents() { + for mode in [CellMode::Skip, CellMode::Merge, CellMode::Delta, CellMode::Escape] { + for basin in [0u16, 1, 42, 1234, MAX_BASIN_IDX] { + let h = pack_header(mode, basin); + assert_eq!(unpack_header(h), (mode, basin), "mode={mode:?}, basin={basin}"); + } + } + } + + #[test] + fn header_truncates_oversize_basin_idx() { + // basin_idx = 4096 doesn't fit in 12 bits; the high bit gets + // dropped, giving back basin=0. 
+ let h = pack_header(CellMode::Skip, 4096); + let (_, basin) = unpack_header(h); + assert_eq!(basin, 0); + } + + #[test] + fn merge_dir_roundtrip_all_four() { + for dir in [MergeDir::North, MergeDir::East, MergeDir::West, MergeDir::South] { + let b = pack_merge_dir(dir); + assert_eq!(unpack_merge_dir(b), dir); + } + } + + #[test] + fn merge_dir_ignores_high_bits() { + // High bits 2-7 are reserved; unpack should mask them out. + assert_eq!(unpack_merge_dir(0b1111_1100), MergeDir::North); + assert_eq!(unpack_merge_dir(0b1111_1101), MergeDir::East); + } + + #[test] + fn leaf_pack_skip_is_2_bytes() { + let leaf = LeafCu::skip(100); + let mut buf = [0xAAu8; 6]; + let n = pack_leaf(&leaf, &mut buf).unwrap(); + assert_eq!(n, 2); + // Bytes 2-5 untouched. + assert_eq!(&buf[2..], &[0xAA, 0xAA, 0xAA, 0xAA]); + } + + #[test] + fn leaf_pack_merge_is_3_bytes() { + let leaf = LeafCu::merge(100, MergeDir::East); + let mut buf = [0u8; 6]; + let n = pack_leaf(&leaf, &mut buf).unwrap(); + assert_eq!(n, 3); + let (decoded, consumed) = unpack_leaf(&buf).unwrap(); + assert_eq!(decoded, leaf); + assert_eq!(consumed, 3); + } + + #[test] + fn leaf_pack_delta_is_3_bytes() { + let leaf = LeafCu::delta(100, 0xCC); + let mut buf = [0u8; 6]; + let n = pack_leaf(&leaf, &mut buf).unwrap(); + assert_eq!(n, 3); + let (decoded, consumed) = unpack_leaf(&buf).unwrap(); + assert_eq!(decoded, leaf); + assert_eq!(consumed, 3); + } + + #[test] + fn leaf_pack_escape_is_6_bytes() { + let leaf = LeafCu::escape(100, 0xDEAD_BEEF); + let mut buf = [0u8; 6]; + let n = pack_leaf(&leaf, &mut buf).unwrap(); + assert_eq!(n, 6); + let (decoded, consumed) = unpack_leaf(&buf).unwrap(); + assert_eq!(decoded, leaf); + assert_eq!(consumed, 6); + } + + #[test] + fn leaf_pack_rejects_short_buffer() { + let leaf = LeafCu::escape(100, 0xDEAD_BEEF); + let mut buf = [0u8; 5]; // 1 short of Escape's worst case + assert!(pack_leaf(&leaf, &mut buf).is_none()); + } + + #[test] + fn leaf_unpack_rejects_short_buffer() { + // 
Header says Escape but only 2 bytes follow → not enough. + let mut buf = [0u8; 3]; + let header = pack_header(CellMode::Escape, 50); + buf[..2].copy_from_slice(&header.to_le_bytes()); + assert!(unpack_leaf(&buf).is_none()); + } + + #[test] + fn packed_byte_len_matches_pack_output() { + let cases = [ + (LeafCu::skip(10), CellMode::Skip), + (LeafCu::merge(10, MergeDir::West), CellMode::Merge), + (LeafCu::delta(10, 7), CellMode::Delta), + (LeafCu::escape(10, 99), CellMode::Escape), + ]; + for (leaf, mode) in cases { + let mut buf = [0u8; 6]; + let n = pack_leaf(&leaf, &mut buf).unwrap(); + assert_eq!(n, packed_byte_len(mode)); + } + } + + #[test] + fn stream_pack_then_unpack_roundtrips_mixed_leaves() { + // Encode a sequence of mixed-mode leaves into one buffer, + // decode in order, assert exact equality of all 8. + let leaves = [ + LeafCu::skip(0), + LeafCu::delta(1, 0xAB), + LeafCu::merge(2, MergeDir::North), + LeafCu::escape(3, 0xDEAD_BEEF), + LeafCu::skip(MAX_BASIN_IDX), + LeafCu::delta(MAX_BASIN_IDX, 0xFF), + LeafCu::merge(MAX_BASIN_IDX, MergeDir::South), + LeafCu::escape(MAX_BASIN_IDX, u32::MAX), + ]; + // Worst case: 8 × 6 bytes = 48 + let mut buf = vec![0u8; 48]; + let mut offset = 0; + for leaf in &leaves { + let n = pack_leaf(leaf, &mut buf[offset..]).unwrap(); + offset += n; + } + let total_written = offset; + // Decode in order. + let mut decoded = Vec::with_capacity(8); + let mut read = 0; + while read < total_written { + let (leaf, n) = unpack_leaf(&buf[read..]).unwrap(); + decoded.push(leaf); + read += n; + } + assert_eq!(decoded.len(), 8); + assert_eq!(&decoded[..], &leaves[..]); + assert_eq!(read, total_written); + } +} diff --git a/src/hpc/codec/predict.rs b/src/hpc/codec/predict.rs new file mode 100644 index 00000000..727b94ef --- /dev/null +++ b/src/hpc/codec/predict.rs @@ -0,0 +1,397 @@ +//! Intra-prediction mode decision (PR-X12 A3, intra path). +//! +//! Encoder-side kernel: given a cell value, its nearest-basin index + +//! 
delta, and the four cardinal neighbour `LeafCu`s, choose the best +//! [`CellMode`] for the cell and emit the corresponding [`LeafCu`]. +//! +//! This is the **mode-decision** kernel, not the inverse-projection +//! reconstruction. Decoder-side reconstruction is the inverse of this +//! decision tree and is folded into PR-X12 A6 RDO + A8 stream +//! interpretation; A3 ships only the encoder direction. +//! +//! # The decision tree +//! +//! For one cell at (row, col) inside a CTU: +//! +//! ```text +//! ┌─────────────────────────────────────────────────┐ +//! │ delta == 0 ? → Skip(basin_idx) │ +//! └─────────────────────────────────────────────────┘ +//! │ no +//! ▼ +//! ┌─────────────────────────────────────────────────┐ +//! │ any same-dir neighbour has Delta-mode with the │ +//! │ SAME basin_idx AND SAME δ as u8 (wrapping cast) │ +//! │ → Merge(basin_idx, dir) │ +//! └─────────────────────────────────────────────────┘ +//! │ no candidate +//! ▼ +//! ┌─────────────────────────────────────────────────┐ +//! │ δ fits in i8 ([-128, 127])? │ +//! │ → Delta(basin_idx, δ_u8) │ +//! └─────────────────────────────────────────────────┘ +//! │ overflow +//! ▼ +//! ┌─────────────────────────────────────────────────┐ +//! │ → Escape(basin_idx, idx) │ +//! │ (caller appends raw u64 to escape vector) │ +//! └─────────────────────────────────────────────────┘ +//! ``` +//! +//! The order is **Skip → Merge → Delta → Escape** because the wire +//! cost is monotonically non-decreasing in the same order (2 → 3 → 3 → 6 +//! bytes per [`packed_byte_len`](super::mode::packed_byte_len)). The +//! decision picks the cheapest mode that fits. +//! +//! # What A3-intra does NOT do +//! +//! - **Inter prediction** (parent-tier neighbours from the +//! `BlockedGrid` L2/L3 cascade) — deferred to PR-X12 A3 follow-up. +//! - **Rate-distortion optimisation** — A3-intra picks by exact match +//! only. Soft mode-switching with λ-RDO is PR-X12 A6. +//! 
- **Transform / quantisation** — A3-intra works on already-decoded +//! integer deltas. The transform path (DCT-II for delta residuals) +//! is PR-X12 A4. +//! - **SIMD-batched CTU sweep** — scalar reference today. The +//! `F32x16`-batched form (16 cells per inner loop via +//! `crate::simd_soa::MultiLaneColumn`) is a follow-up after the +//! reference + reconstruction parity test pin the math. + +use super::ctu::{CellMode, LeafCu, MergeDir}; +use super::mode::BASIN_NONE; + +// ════════════════════════════════════════════════════════════════════ +// Inputs to the encoder mode decision +// ════════════════════════════════════════════════════════════════════ + +/// Per-cell context the encoder needs to choose a mode. +/// +/// Built by the caller from the CTU's basin lookup + the per-cell +/// neighbour table. The encoder does not own the basin codebook or the +/// escape vector; it returns an `Escape(basin_idx, escape_idx)` leaf +/// and lets the caller push the original cell value into the per-frame +/// escape vector at `escape_idx`. +/// +/// # Fields +/// +/// - `basin_idx`: nearest basin's index in the per-frame codebook, +/// already resolved by the caller (typically via +/// `ogit_bridge::nearest_basin`). Must be `<= MAX_BASIN_IDX` +/// (12 bits) per [`super::mode::pack_header`]. The encoder does not +/// re-validate. +/// - `delta_i32`: signed delta from the basin's u8-quantised +/// representation of the cell. The encoder branches on `|delta|` +/// to decide between Delta (fits in i8) and Escape (overflows). +/// `i32` width avoids overflow when the caller computes +/// `cell_value - basin_value` for two u8 inputs. +/// - `neighbours`: NESW (in [`MergeDir`] discriminant order) optional +/// neighbour leaves. `None` for boundary cells; the Merge candidate +/// scan skips `None` entries. +#[derive(Debug, Clone, Copy)] +pub struct IntraContext<'a> { + /// Pre-resolved basin index (12-bit max). 
+ pub basin_idx: u16, + /// Signed delta from basin → cell, in the basin's u8 quantisation + /// space. + pub delta_i32: i32, + /// NESW neighbour leaves, indexed by [`MergeDir`] discriminant. + pub neighbours: [Option<&'a LeafCu>; 4], +} + +/// Configuration for the intra-prediction decision. +/// +/// Today a single field; the field exists so the API can grow +/// (Merge tolerance, RDO knobs in A6) without a signature break. +#[derive(Debug, Clone, Copy)] +pub struct IntraConfig { + /// Future allocator for the encoder's escape vector — returns the + /// next index to write. `None` disables Escape mode (the encoder + /// will fall back to Delta-with-clamping, which **loses + /// precision** but never panics; callers wanting lossless coding + /// must provide a real allocator). + /// + /// Stateless API today: encoder calls `escape_next_idx` once per + /// Escape decision. The caller is responsible for actually + /// appending the u64 cell value into the escape vector at the + /// returned index — this kernel doesn't see the cell value. + pub escape_next_idx: Option<u32>, +} + +impl Default for IntraConfig { + fn default() -> Self { + Self { escape_next_idx: None } + } +} + +// ════════════════════════════════════════════════════════════════════ +// The decision kernel +// ════════════════════════════════════════════════════════════════════ + +/// Encoder-side intra-prediction. Returns the cheapest [`LeafCu`] +/// representation of the cell described by `ctx`. +/// +/// See the module docs for the decision tree (Skip → Merge → Delta → +/// Escape) and the rationale (monotone wire cost). 
+/// +/// # Examples +/// +/// Skip when the cell is exactly the basin: +/// +/// ``` +/// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig}; +/// use ndarray::hpc::codec::{CellMode, LeafCu}; +/// let ctx = IntraContext { +/// basin_idx: 42, +/// delta_i32: 0, +/// neighbours: [None; 4], +/// }; +/// let leaf = predict_intra(&ctx, &IntraConfig::default()); +/// assert_eq!(leaf.mode, CellMode::Skip); +/// assert_eq!(leaf.basin_idx, 42); +/// ``` +/// +/// Delta when no Merge candidate exists but |δ| fits in i8: +/// +/// ``` +/// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig}; +/// use ndarray::hpc::codec::CellMode; +/// let ctx = IntraContext { +/// basin_idx: 42, +/// delta_i32: 17, +/// neighbours: [None; 4], +/// }; +/// let leaf = predict_intra(&ctx, &IntraConfig::default()); +/// assert_eq!(leaf.mode, CellMode::Delta); +/// assert_eq!(leaf.delta, Some(17)); +/// ``` +pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { + // ── 1. Skip ────────────────────────────────────────────────────── + if ctx.delta_i32 == 0 { + return LeafCu::skip(ctx.basin_idx); + } + + // ── 2. Merge ───────────────────────────────────────────────────── + // + // A neighbour is a Merge candidate iff: + // (a) its mode is Delta (Skip / Merge / Escape neighbours carry + // no reusable delta to inherit from) + // (b) its basin_idx matches ours (Merge inheritance implicitly + // points at the SAME basin — different basins mean a + // different reference frame) + // (c) its δ exactly matches our δ as a u8 (sign-tolerant via + // wrapping cast; matches the A2 pack format where Delta + // stores a raw u8 byte without a sign bit) + // + // We scan NESW in discriminant order and pick the first match. + // Multiple matches all collapse to the same coded leaf, so the + // first-hit policy is order-deterministic without affecting + // bitstream length. 
+ let our_delta_u8 = ctx.delta_i32 as u8; // wrapping cast matches A2 pack + for (i, nb_slot) in ctx.neighbours.iter().enumerate() { + let Some(nb) = nb_slot else { continue }; + if nb.mode != CellMode::Delta { + continue; + } + if nb.basin_idx != ctx.basin_idx { + continue; + } + if nb.delta != Some(our_delta_u8) { + continue; + } + let dir = merge_dir_from_index(i); + return LeafCu::merge(ctx.basin_idx, dir); + } + + // ── 3. Delta ───────────────────────────────────────────────────── + // + // i8 range is [-128, 127]. We pack as raw u8 (wrapping cast) so + // the encoder's reconstruction must read the byte back as i8 to + // recover the sign. This matches how `LeafCu::delta` stores it and + // how `super::mode::pack_leaf` writes it. + if (-128..=127).contains(&ctx.delta_i32) { + return LeafCu::delta(ctx.basin_idx, our_delta_u8); + } + + // ── 4. Escape ──────────────────────────────────────────────────── + // + // |δ| doesn't fit in i8. Caller must own the per-frame escape + // vector and provide the next-write index; we return a leaf that + // references it. If the caller didn't provide an allocator, we + // fall back to a saturated Delta (lossy but never panicking) so + // a misconfigured encoder still produces a valid bytestream. + match cfg.escape_next_idx { + Some(idx) => LeafCu::escape(ctx.basin_idx, idx), + None => { + // Lossy fallback: clamp to i8 range. Caller is responsible + // for noticing that the reconstruction won't be bit-exact. + let clamped = ctx.delta_i32.clamp(-128, 127) as u8; + LeafCu::delta(ctx.basin_idx, clamped) + } + } +} + +#[inline] +fn merge_dir_from_index(i: usize) -> MergeDir { + match i { + 0 => MergeDir::North, + 1 => MergeDir::East, + 2 => MergeDir::West, + _ => MergeDir::South, + } +} + +/// Sanity-check sentinel: returns `true` iff the resolved basin index +/// is the "no basin" marker. Encoders that compute basins lazily can +/// short-circuit Skip/Merge/Delta and emit Escape directly when this +/// fires. 
+#[inline] +pub fn is_no_basin(basin_idx: u16) -> bool { + basin_idx == BASIN_NONE +} + +// ════════════════════════════════════════════════════════════════════ +// Tests +// ════════════════════════════════════════════════════════════════════ + +#[cfg(test)] +mod tests { + use super::*; + + fn ctx_with_neighbours<'a>(basin: u16, delta: i32, neighbours: [Option<&'a LeafCu>; 4]) -> IntraContext<'a> { + IntraContext { + basin_idx: basin, + delta_i32: delta, + neighbours, + } + } + + #[test] + fn skip_when_delta_is_zero() { + let leaf = predict_intra(&ctx_with_neighbours(100, 0, [None; 4]), &IntraConfig::default()); + assert_eq!(leaf, LeafCu::skip(100)); + } + + #[test] + fn skip_preferred_over_neighbour_match() { + // δ=0 trumps everything else, even a perfect Merge candidate. + let nb = LeafCu::delta(100, 0); + let neighbours = [Some(&nb), None, None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, 0, neighbours), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Skip); + } + + #[test] + fn delta_in_i8_range() { + for d in [-128i32, -1, 1, 127] { + let leaf = predict_intra(&ctx_with_neighbours(100, d, [None; 4]), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Delta); + assert_eq!(leaf.delta, Some(d as u8)); + } + } + + #[test] + fn merge_when_neighbour_delta_matches_basin_and_value() { + // Northern neighbour: Delta-mode, same basin, same δ as us. + let nb_north = LeafCu::delta(100, 17); + let neighbours = [Some(&nb_north), None, None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Merge); + assert_eq!(leaf.merge_dir, Some(MergeDir::North)); + assert_eq!(leaf.basin_idx, 100); + } + + #[test] + fn merge_skipped_when_basin_differs() { + // Same δ but different basin → cannot Merge (different + // reference frame). Falls through to Delta. 
+ let nb_north = LeafCu::delta(99, 17); + let neighbours = [Some(&nb_north), None, None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Delta); + } + + #[test] + fn merge_skipped_when_neighbour_mode_is_not_delta() { + // Skip / Merge / Escape neighbours carry no inheritable δ. + let nb_skip = LeafCu::skip(100); + let nb_merge = LeafCu::merge(100, MergeDir::North); + let nb_esc = LeafCu::escape(100, 0); + let neighbours = [Some(&nb_skip), Some(&nb_merge), None, Some(&nb_esc)]; + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Delta); + } + + #[test] + fn merge_picks_first_hit_in_nesw_order() { + // Both N and E qualify; encoder must pick N (lower index). + let nb_match = LeafCu::delta(100, 17); + let neighbours = [Some(&nb_match), Some(&nb_match), None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + assert_eq!(leaf.merge_dir, Some(MergeDir::North)); + } + + #[test] + fn merge_negative_delta_via_wrapping_cast() { + // δ = -17 packs to 0xEF (= 239 as u8). Neighbour stored as + // u8 = 0xEF MUST match — the cast must be wrapping, not + // saturating. 
+ let nb_match = LeafCu::delta(100, (-17_i32) as u8); + let neighbours = [None, Some(&nb_match), None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, -17, neighbours), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Merge); + assert_eq!(leaf.merge_dir, Some(MergeDir::East)); + } + + #[test] + fn escape_when_delta_overflows_i8_and_allocator_present() { + let cfg = IntraConfig { + escape_next_idx: Some(42), + }; + let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &cfg); + assert_eq!(leaf.mode, CellMode::Escape); + assert_eq!(leaf.escape_idx, Some(42)); + assert_eq!(leaf.basin_idx, 100); + } + + #[test] + fn escape_lossy_fallback_when_no_allocator() { + // Without an escape_next_idx, the encoder clamps to i8 range. + // The result is a valid LeafCu but the reconstruction won't + // be bit-exact. + let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Delta); + assert_eq!(leaf.delta, Some(127)); + } + + #[test] + fn escape_lossy_fallback_negative_overflow() { + let leaf = predict_intra(&ctx_with_neighbours(100, -1000, [None; 4]), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Delta); + assert_eq!(leaf.delta, Some((-128_i32) as u8)); + } + + #[test] + fn pack_then_unpack_chained_through_intra_decision() { + // End-to-end: encoder picks Merge for one cell. The packed + // representation must round-trip via A2's pack/unpack with + // bit-exact fidelity. 
+ use super::super::mode::{pack_leaf, unpack_leaf}; + let nb = LeafCu::delta(100, 17); + let neighbours = [None, Some(&nb), None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + assert_eq!(leaf.mode, CellMode::Merge); + + let mut buf = [0u8; 6]; + let n = pack_leaf(&leaf, &mut buf).unwrap(); + let (decoded, consumed) = unpack_leaf(&buf).unwrap(); + assert_eq!(n, consumed); + assert_eq!(decoded, leaf); + } + + #[test] + fn is_no_basin_sentinel_round_trip() { + assert!(is_no_basin(BASIN_NONE)); + assert!(!is_no_basin(0)); + assert!(!is_no_basin(100)); + } +} From b39a5769c65903295163ee8c5bda1ddba4093488 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 06:42:09 +0000 Subject: [PATCH 2/5] =?UTF-8?q?fix(codec):=20address=20PR-195=20review=20?= =?UTF-8?q?=E2=80=94=20overflow=20Merge=20alias=20+=20mode-sized=20pack=5F?= =?UTF-8?q?leaf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 (codex) — overflow δ no longer aliases to Merge: predict_intra previously took `our_delta_u8 = δ as u8` BEFORE checking i8 fit, so e.g. δ=200 wrapped to 0xC8 and could match a neighbour byte 0xC8 (i8=-56), silently corrupting reconstruction. Now the i8 range check gates both the Merge scan and the Delta branch; out-of-range δ falls straight through to Escape (or the documented lossy clamp). + 2 regression tests: - overflow_delta_does_not_alias_to_merge - overflow_delta_with_allocator_takes_escape P2 (coderabbit + codex) — pack_leaf accepts mode-sized buffers: pack_leaf used a 6-byte minimum for all modes; callers pre-sizing by packed_byte_len() got spurious None for Skip(2)/Merge,Delta(3). Length check now gates on packed_byte_len(leaf.mode). 
+ 1 regression test: pack_leaf_accepts_mode_sized_buffers P3 (coderabbit) — doctest examples on the public-API surface: Added /// runnable examples to MAX_BASIN_IDX, BASIN_NONE, unpack_header, pack_merge_dir, unpack_merge_dir, unpack_leaf, packed_byte_len, IntraContext, IntraConfig, is_no_basin. Removed unused LeafCu import from existing predict_intra doctest. Gates: cargo test --features codec --lib hpc::codec → 48 passed cargo test --features codec --doc hpc::codec → 14 passed cargo fmt --all -- --check → clean cargo clippy --features codec --lib -- -D warnings → clean --- src/hpc/codec/mode.rs | 53 +++++++++++++++++- src/hpc/codec/predict.rs | 112 +++++++++++++++++++++++++++++++++------ 2 files changed, 148 insertions(+), 17 deletions(-) diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs index d18f5c96..ef4cf965 100644 --- a/src/hpc/codec/mode.rs +++ b/src/hpc/codec/mode.rs @@ -57,10 +57,20 @@ use super::ctu::{CellMode, LeafCu, MergeDir}; /// Maximum encodable `basin_idx`. Stored in the lower 12 bits of the /// header; values >= this constant overflow the header field. +/// +/// ``` +/// use ndarray::hpc::codec::MAX_BASIN_IDX; +/// assert_eq!(MAX_BASIN_IDX, (1 << 12) - 1); +/// ``` pub const MAX_BASIN_IDX: u16 = (1 << 12) - 1; // 4095 /// Tag inside the per-frame basin codebook for "no basin assigned" /// (encoder-side sentinel during mode decision). +/// +/// ``` +/// use ndarray::hpc::codec::{BASIN_NONE, MAX_BASIN_IDX}; +/// assert_eq!(BASIN_NONE, MAX_BASIN_IDX); +/// ``` pub const BASIN_NONE: u16 = MAX_BASIN_IDX; /// Pack `(mode, basin_idx)` into a 16-bit header. @@ -85,6 +95,12 @@ pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 { /// /// The 2-bit mode field always decodes (all 4 variants are valid). /// `basin_idx` is the 12-bit lower field, exactly as packed. 
+/// +/// ``` +/// use ndarray::hpc::codec::{pack_header, unpack_header, CellMode}; +/// let h = pack_header(CellMode::Escape, 7); +/// assert_eq!(unpack_header(h), (CellMode::Escape, 7)); +/// ``` #[inline] pub fn unpack_header(packed: u16) -> (CellMode, u16) { let mode_bits = ((packed >> 12) & 0b11) as u8; @@ -103,6 +119,11 @@ pub fn unpack_header(packed: u16) -> (CellMode, u16) { // ════════════════════════════════════════════════════════════════════ /// Pack a [`MergeDir`] into the lower 2 bits of a `u8`. +/// +/// ``` +/// use ndarray::hpc::codec::{pack_merge_dir, MergeDir}; +/// assert_eq!(pack_merge_dir(MergeDir::East), 1); +/// ``` #[inline] pub fn pack_merge_dir(dir: MergeDir) -> u8 { dir as u8 @@ -112,6 +133,13 @@ pub fn pack_merge_dir(dir: MergeDir) -> u8 { /// /// All four 2-bit values map to a valid `MergeDir`; bits 2-7 are /// ignored. +/// +/// ``` +/// use ndarray::hpc::codec::{pack_merge_dir, unpack_merge_dir, MergeDir}; +/// for d in [MergeDir::North, MergeDir::East, MergeDir::West, MergeDir::South] { +/// assert_eq!(unpack_merge_dir(pack_merge_dir(d)), d); +/// } +/// ``` #[inline] pub fn unpack_merge_dir(byte: u8) -> MergeDir { match byte & 0b11 { @@ -133,7 +161,9 @@ pub fn unpack_merge_dir(byte: u8) -> MergeDir { /// worst case) — callers iterating CTUs typically pre-allocate /// `6 * cell_count` and trim afterwards. /// -/// Returns `None` if `out.len() < 6` (insufficient capacity). +/// Returns `None` if `out.len() < packed_byte_len(leaf.mode)` (insufficient +/// capacity for the *mode's* width — Skip needs 2, Merge/Delta need 3, +/// Escape needs 6). 
/// /// Format: /// - Bytes 0-1: header (`pack_header(mode, basin_idx)`, LE) @@ -151,7 +181,8 @@ pub fn unpack_merge_dir(byte: u8) -> MergeDir { /// assert_eq!(consumed, 3); /// ``` pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> { - if out.len() < 6 { + let required = packed_byte_len(leaf.mode); + if out.len() < required { return None; } let header = pack_header(leaf.mode, leaf.basin_idx); @@ -184,6 +215,16 @@ pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> { /// /// Returns `None` if the buffer is shorter than the per-mode width /// (2 for Skip, 3 for Merge/Delta, 6 for Escape). +/// +/// ``` +/// use ndarray::hpc::codec::{pack_leaf, unpack_leaf, LeafCu, MergeDir}; +/// let leaf = LeafCu::merge(7, MergeDir::West); +/// let mut buf = [0u8; 3]; +/// pack_leaf(&leaf, &mut buf).unwrap(); +/// let (decoded, n) = unpack_leaf(&buf).unwrap(); +/// assert_eq!(decoded, leaf); +/// assert_eq!(n, 3); +/// ``` pub fn unpack_leaf(buf: &[u8]) -> Option<(LeafCu, usize)> { if buf.len() < 2 { return None; @@ -217,6 +258,14 @@ pub fn unpack_leaf(buf: &[u8]) -> Option<(LeafCu, usize)> { /// Byte cost of packing a leaf in this mode. Useful for pre-sizing /// a buffer without packing first. +/// +/// ``` +/// use ndarray::hpc::codec::{packed_byte_len, CellMode}; +/// assert_eq!(packed_byte_len(CellMode::Skip), 2); +/// assert_eq!(packed_byte_len(CellMode::Merge), 3); +/// assert_eq!(packed_byte_len(CellMode::Delta), 3); +/// assert_eq!(packed_byte_len(CellMode::Escape), 6); +/// ``` #[inline] pub const fn packed_byte_len(mode: CellMode) -> usize { match mode { diff --git a/src/hpc/codec/predict.rs b/src/hpc/codec/predict.rs index 727b94ef..d172f143 100644 --- a/src/hpc/codec/predict.rs +++ b/src/hpc/codec/predict.rs @@ -87,6 +87,17 @@ use super::mode::BASIN_NONE; /// - `neighbours`: NESW (in [`MergeDir`] discriminant order) optional /// neighbour leaves. `None` for boundary cells; the Merge candidate /// scan skips `None` entries.
+/// +/// ``` +/// use ndarray::hpc::codec::{IntraContext, LeafCu}; +/// let north = LeafCu::delta(5, 17); +/// let ctx = IntraContext { +/// basin_idx: 5, +/// delta_i32: 17, +/// neighbours: [Some(&north), None, None, None], +/// }; +/// assert_eq!(ctx.basin_idx, 5); +/// ``` #[derive(Debug, Clone, Copy)] pub struct IntraContext<'a> { /// Pre-resolved basin index (12-bit max). @@ -102,6 +113,14 @@ pub struct IntraContext<'a> { /// /// Today a single field; the field exists so the API can grow /// (Merge tolerance, RDO knobs in A6) without a signature break. +/// +/// ``` +/// use ndarray::hpc::codec::IntraConfig; +/// let default_cfg = IntraConfig::default(); +/// assert!(default_cfg.escape_next_idx.is_none()); +/// let allocated = IntraConfig { escape_next_idx: Some(42) }; +/// assert_eq!(allocated.escape_next_idx, Some(42)); +/// ``` #[derive(Debug, Clone, Copy)] pub struct IntraConfig { /// Future allocator for the encoder's escape vector — returns the @@ -139,7 +158,7 @@ impl Default for IntraConfig { /// /// ``` /// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig}; -/// use ndarray::hpc::codec::{CellMode, LeafCu}; +/// use ndarray::hpc::codec::CellMode; /// let ctx = IntraContext { /// basin_idx: 42, /// delta_i32: 0, @@ -170,6 +189,14 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { return LeafCu::skip(ctx.basin_idx); } + // i8-fit gates both Merge and Delta. Out-of-range δ must skip + // Merge entirely — wrapping `200_i32 as u8` aliases to `0xC8`, + // which could spuriously match a neighbour whose byte equals + // `0xC8` (i8 = -56), producing a leaf the decoder reconstructs as + // -56 instead of 200. + let fits_i8 = (-128..=127).contains(&ctx.delta_i32); + let our_delta_u8 = ctx.delta_i32 as u8; // wrapping cast matches A2 pack + // ── 2. 
Merge ───────────────────────────────────────────────────── // // A neighbour is a Merge candidate iff: @@ -186,20 +213,21 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { // Multiple matches all collapse to the same coded leaf, so the // first-hit policy is order-deterministic without affecting // bitstream length. - let our_delta_u8 = ctx.delta_i32 as u8; // wrapping cast matches A2 pack - for (i, nb_slot) in ctx.neighbours.iter().enumerate() { - let Some(nb) = nb_slot else { continue }; - if nb.mode != CellMode::Delta { - continue; + if fits_i8 { + for (i, nb_slot) in ctx.neighbours.iter().enumerate() { + let Some(nb) = nb_slot else { continue }; + if nb.mode != CellMode::Delta { + continue; + } + if nb.basin_idx != ctx.basin_idx { + continue; + } + if nb.delta != Some(our_delta_u8) { + continue; + } + let dir = merge_dir_from_index(i); + return LeafCu::merge(ctx.basin_idx, dir); } - if nb.basin_idx != ctx.basin_idx { - continue; - } - if nb.delta != Some(our_delta_u8) { - continue; - } - let dir = merge_dir_from_index(i); - return LeafCu::merge(ctx.basin_idx, dir); } // ── 3. Delta ───────────────────────────────────────────────────── @@ -208,7 +236,7 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { // the encoder's reconstruction must read the byte back as i8 to // recover the sign. This matches how `LeafCu::delta` stores it and // how `super::mode::pack_leaf` writes it. - if (-128..=127).contains(&ctx.delta_i32) { + if fits_i8 { return LeafCu::delta(ctx.basin_idx, our_delta_u8); } @@ -244,6 +272,12 @@ fn merge_dir_from_index(i: usize) -> MergeDir { /// is the "no basin" marker. Encoders that compute basins lazily can /// short-circuit Skip/Merge/Delta and emit Escape directly when this /// fires. 
+/// +/// ``` +/// use ndarray::hpc::codec::{is_no_basin, BASIN_NONE}; +/// assert!(is_no_basin(BASIN_NONE)); +/// assert!(!is_no_basin(0)); +/// ``` #[inline] pub fn is_no_basin(basin_idx: u16) -> bool { basin_idx == BASIN_NONE @@ -394,4 +428,52 @@ mod tests { assert!(!is_no_basin(0)); assert!(!is_no_basin(100)); } + + #[test] + fn overflow_delta_does_not_alias_to_merge() { + // Regression for the wrapping-cast Merge alias bug: + // δ = 200 (overflows i8) must NOT match a neighbour whose + // u8 byte equals (200 as u8) = 0xC8 (= -56 in i8). The + // encoder must take the Escape path (or, here, the lossy + // clamp fallback because no allocator is wired). + let nb_alias = LeafCu::delta(100, 0xC8); + let neighbours = [Some(&nb_alias), None, None, None]; + let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default()); + assert_ne!(leaf.mode, CellMode::Merge, "overflow δ must not Merge"); + // With no allocator the encoder clamps to +127 (lossy Delta). + assert_eq!(leaf.mode, CellMode::Delta); + assert_eq!(leaf.delta, Some(127)); + } + + #[test] + fn overflow_delta_with_allocator_takes_escape() { + let nb_alias = LeafCu::delta(100, 0xC8); + let neighbours = [Some(&nb_alias), None, None, None]; + let cfg = IntraConfig { + escape_next_idx: Some(7), + }; + let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &cfg); + assert_eq!(leaf.mode, CellMode::Escape); + assert_eq!(leaf.escape_idx, Some(7)); + } + + #[test] + fn pack_leaf_accepts_mode_sized_buffers() { + // Regression for the P2 6-byte-minimum bug: Skip should pack + // into a 2-byte buffer, Merge/Delta into a 3-byte buffer. 
+ use super::super::mode::{pack_leaf, packed_byte_len}; + let skip = LeafCu::skip(10); + let mut buf2 = [0u8; 2]; + assert_eq!(pack_leaf(&skip, &mut buf2), Some(2)); + assert_eq!(packed_byte_len(CellMode::Skip), 2); + + let delta = LeafCu::delta(10, 7); + let mut buf3 = [0u8; 3]; + assert_eq!(pack_leaf(&delta, &mut buf3), Some(3)); + + // Escape still needs 6 bytes; a 3-byte buffer is rejected. + let esc = LeafCu::escape(10, 99); + let mut buf3b = [0u8; 3]; + assert_eq!(pack_leaf(&esc, &mut buf3b), None); + } } From b44fe59fbbd0237b6c1dcc0c3102452cc8201c79 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 06:48:03 +0000 Subject: [PATCH 3/5] =?UTF-8?q?fix(codec):=20address=20owner=20review=20on?= =?UTF-8?q?=20PR-195=20=E2=80=94=20escape=20allocator=20+=20NEWS=20doc=20+?= =?UTF-8?q?=20diagram?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 — escape allocator collision in batched encoding: Old: cfg.escape_next_idx: Option<u32> was a static field — every Escape leaf in a batch got the same idx, colliding on the escape vector slot at decode time. New: predict_intra now takes a third arg `escape_next: Option<&mut u32>` that the kernel post-increments when Escape fires. Sequential cells in the same batch see fresh, non-colliding idxs. IntraConfig becomes empty (reserved for future RDO knobs). + 1 regression test: escape_allocator_advances_across_batched_calls + escape_when_delta_overflows_i8_and_allocator_present extended to assert the cursor advances. P1 — NESW vs NEWS doc/code mismatch: MergeDir discriminants are North=0, East=1, West=2, South=3 (NEWS), but the doc said "NESW". Fixed in IntraContext docstring + the inline Merge-scan comment. Added explicit slot-to-direction table.
+ 1 regression test: merge_slot_2_maps_to_west_and_slot_3_to_south P2 — ASCII header diagram in mode.rs: Old diagram put M0/M1 at MSB bits 15-14, but pack_header uses `(mode << 12) | basin`, placing mode at bits 12-13 with bits 14-15 reserved at zero. Redrew the diagram with explicit bit indices and labelled the reserved high bits for future encoder use. Nits: - A2 doc table now says "Merge tail: 1 byte (low 2 bits = MergeDir; high 6 reserved)" instead of the misleading "MergeDir 2-bit". - Renamed merge_picks_first_hit_in_nesw_order → ..._in_news_order to match the corrected ordering. Already-addressed (no-op this commit): - P0 overflow Merge alias → fixed in b39a5769 - P2 pack_leaf 6-byte minimum → fixed in b39a5769 Gates: cargo test --features codec --lib hpc::codec → 50 passed (+2 new) cargo test --features codec --doc hpc::codec → 15 passed (+1 new) cargo fmt --all -- --check → clean cargo clippy --features codec --lib -- -D warnings → clean --- src/hpc/codec/mode.rs | 34 ++++---- src/hpc/codec/predict.rs | 179 +++++++++++++++++++++++++-------------- 2 files changed, 136 insertions(+), 77 deletions(-) diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs index ef4cf965..f748b7bd 100644 --- a/src/hpc/codec/mode.rs +++ b/src/hpc/codec/mode.rs @@ -13,28 +13,32 @@ //! decoder can route on a single `u16` load: //! //! ```text -//! MSB LSB -//! ┌──┬──┬──────────────────────────────┐ -//! │M0│M1│ basin_idx (12) │ ← 16-bit header -//! └──┴──┴──────────────────────────────┘ -//! │ │ └─ basin_idx is the only payload field always present -//! └──┴──── 2-bit mode discriminant (CellMode::as_u8()) -//! (top 2 bits) +//! bit 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +//! ┌──┬──┬──┬──┬──────────────────────┐ +//! │ 0│ 0│M1│M0│ basin_idx (12) │ ← 16-bit header +//! └──┴──┴──┴──┴──────────────────────┘ +//! │ │ │ │ └────────────────────── basin_idx (bits 0..=11) +//! │ │ └──┴────────────────────────── 2-bit mode (bits 12..=13) +//! 
└──┴──────────────────────────────── reserved high bits 14,15 //! ``` //! -//! The remaining 2 bits at the top of the second byte are reserved for -//! the encoder's future `merge_dir` overlap when the mode is `Merge`; -//! a separate `pack_mode_dir` helper keeps `Merge`'s direction in a -//! single byte alongside `Skip`/`Delta`/`Escape`'s mode tag. +//! Bits 14-15 are reserved at zero; the impl is +//! `(mode_bits << 12) | basin_bits`, so mode lives at bits 12-13 and +//! basin at bits 0-11. A future encoder can repurpose bits 14-15 +//! (e.g., for a per-leaf `merge_dir` overlap) without disturbing +//! existing decoders that mask bits 14-15 off. //! //! # Per-mode tail width //! //! | Mode | Header | Tail bytes | Total | //! |--------|--------|--------------------------|-------| -//! | Skip | 2 | 0 | 2 | -//! | Merge | 2 | 1 (`MergeDir` 2-bit) | 3 | -//! | Delta | 2 | 1 (`u8` perturbation) | 3 | -//! | Escape | 2 | 4 (`u32` escape_idx, LE) | 6 | +//! | Skip | 2 | 0 | 2 | +//! | Merge | 2 | 1 (low 2 bits = `MergeDir`) | 3 | +//! | Delta | 2 | 1 (`u8` perturbation) | 3 | +//! | Escape | 2 | 4 (`u32` escape_idx, LE) | 6 | +//! +//! The Merge tail is a full byte even though only its low 2 bits carry +//! `MergeDir` — high 6 bits are reserved and masked off on read. //! //! The compact pack writes header (LE) then the per-mode tail. The //! `escape_idx` width is the worst case; a future A7 rANS pass can diff --git a/src/hpc/codec/predict.rs b/src/hpc/codec/predict.rs index d172f143..3014cb8a 100644 --- a/src/hpc/codec/predict.rs +++ b/src/hpc/codec/predict.rs @@ -84,9 +84,17 @@ use super::mode::BASIN_NONE; /// to decide between Delta (fits in i8) and Escape (overflows). /// `i32` width avoids overflow when the caller computes /// `cell_value - basin_value` for two u8 inputs. -/// - `neighbours`: NESW (in [`MergeDir`] discriminant order) optional -/// neighbour leaves. `None` for boundary cells; the Merge candidate -/// scan skips `None` entries. 
+/// - `neighbours`: NEWS (in [`MergeDir`] discriminant order: +/// `North=0, East=1, West=2, South=3`) optional neighbour leaves. +/// `None` for boundary cells; the Merge candidate scan skips `None` +/// entries. +/// +/// ```text +/// slot 0 → MergeDir::North (discr 0) +/// slot 1 → MergeDir::East (discr 1) +/// slot 2 → MergeDir::West (discr 2) +/// slot 3 → MergeDir::South (discr 3) +/// ``` /// /// ``` /// use ndarray::hpc::codec::{IntraContext, LeafCu}; @@ -105,41 +113,27 @@ pub struct IntraContext<'a> { /// Signed delta from basin → cell, in the basin's u8 quantisation /// space. pub delta_i32: i32, - /// NESW neighbour leaves, indexed by [`MergeDir`] discriminant. + /// NEWS neighbour leaves, indexed by [`MergeDir`] discriminant + /// (`North=0, East=1, West=2, South=3`). pub neighbours: [Option<&'a LeafCu>; 4], } /// Configuration for the intra-prediction decision. /// -/// Today a single field; the field exists so the API can grow -/// (Merge tolerance, RDO knobs in A6) without a signature break. +/// Reserved for future expansion (Merge tolerance, RDO knobs in A6). +/// Empty today; constructed via [`Default`] so additions don't break +/// callers. /// /// ``` /// use ndarray::hpc::codec::IntraConfig; -/// let default_cfg = IntraConfig::default(); -/// assert!(default_cfg.escape_next_idx.is_none()); -/// let allocated = IntraConfig { escape_next_idx: Some(42) }; -/// assert_eq!(allocated.escape_next_idx, Some(42)); +/// let cfg = IntraConfig::default(); +/// // No tunables yet — call sites stay future-compatible. +/// let _ = cfg; /// ``` -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Default)] pub struct IntraConfig { - /// Future allocator for the encoder's escape vector — returns the - /// next index to write. `None` disables Escape mode (the encoder - /// will fall back to Delta-with-truncation, which **loses - /// precision** but never panics; callers wanting lossless coding - /// must provide a real allocator). 
- /// - /// Stateless API today: encoder calls `escape_next_idx` once per - /// Escape decision. The caller is responsible for actually - /// appending the u64 cell value into the escape vector at the - /// returned index — this kernel doesn't see the cell value. - pub escape_next_idx: Option<u32>, -} - -impl Default for IntraConfig { - fn default() -> Self { - Self { escape_next_idx: None } - } + // Reserved. Future fields land here without breaking the signature. + _reserved: (), } // ════════════════════════════════════════════════════════════════════ @@ -152,6 +146,14 @@ impl Default for IntraConfig { /// See the module docs for the decision tree (Skip → Merge → Delta → /// Escape) and the rationale (monotone wire cost). /// +/// `escape_next` is a write-cursor into the caller's per-frame escape +/// vector. When the decision falls through to Escape, the kernel reads +/// the cursor, emits a leaf referencing that idx, and post-increments +/// the cursor so subsequent cells in the same batch get fresh, +/// non-colliding idxs. Pass `None` to disable lossless Escape — the +/// kernel then clamps `δ` to i8 range and emits a `Delta` leaf whose +/// reconstruction is **not bit-exact** (caller must accept the loss).
+/// /// # Examples /// /// Skip when the cell is exactly the basin: @@ -164,7 +166,7 @@ impl Default for IntraConfig { /// delta_i32: 0, /// neighbours: [None; 4], /// }; -/// let leaf = predict_intra(&ctx, &IntraConfig::default()); +/// let leaf = predict_intra(&ctx, &IntraConfig::default(), None); /// assert_eq!(leaf.mode, CellMode::Skip); /// assert_eq!(leaf.basin_idx, 42); /// ``` @@ -179,11 +181,26 @@ impl Default for IntraConfig { /// delta_i32: 17, /// neighbours: [None; 4], /// }; -/// let leaf = predict_intra(&ctx, &IntraConfig::default()); +/// let leaf = predict_intra(&ctx, &IntraConfig::default(), None); /// assert_eq!(leaf.mode, CellMode::Delta); /// assert_eq!(leaf.delta, Some(17)); /// ``` -pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { +/// +/// Escape with an allocator — repeated calls bump the cursor: +/// +/// ``` +/// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig}; +/// use ndarray::hpc::codec::CellMode; +/// let mut next = 7u32; +/// let ctx = IntraContext { basin_idx: 1, delta_i32: 1000, neighbours: [None; 4] }; +/// let a = predict_intra(&ctx, &IntraConfig::default(), Some(&mut next)); +/// let b = predict_intra(&ctx, &IntraConfig::default(), Some(&mut next)); +/// assert_eq!(a.escape_idx, Some(7)); +/// assert_eq!(b.escape_idx, Some(8)); +/// assert_eq!(next, 9); +/// assert_eq!(a.mode, CellMode::Escape); +/// ``` +pub fn predict_intra(ctx: &IntraContext, _cfg: &IntraConfig, escape_next: Option<&mut u32>) -> LeafCu { // ── 1. Skip ────────────────────────────────────────────────────── if ctx.delta_i32 == 0 { return LeafCu::skip(ctx.basin_idx); @@ -209,7 +226,8 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { // wrapping cast; matches the A2 pack format where Delta // stores a raw u8 byte without a sign bit) // - // We scan NESW in discriminant order and pick the first match. 
+ // We scan NEWS in discriminant order (N=0, E=1, W=2, S=3) and + // pick the first match. // Multiple matches all collapse to the same coded leaf, so the // first-hit policy is order-deterministic without affecting // bitstream length. @@ -242,16 +260,23 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu { // ── 4. Escape ──────────────────────────────────────────────────── // - // |δ| doesn't fit in i8. Caller must own the per-frame escape - // vector and provide the next-write index; we return a leaf that - // references it. If the caller didn't provide an allocator, we - // fall back to a saturated Delta (lossy but never panicking) so - // a misconfigured encoder still produces a valid bytestream. - match cfg.escape_next_idx { - Some(idx) => LeafCu::escape(ctx.basin_idx, idx), + // |δ| doesn't fit in i8. The cursor `escape_next` is a write-pointer + // into the caller's per-frame escape vector; we read it, emit a + // leaf referencing that idx, and post-increment so subsequent + // overflow cells in the batch don't collide on the same vector + // slot. If the caller didn't provide an allocator, we fall back to + // a saturated Delta (lossy: reconstruction is NOT bit-exact, but + // never panicking) so a misconfigured encoder still produces a + // valid bytestream. The lossy leaf's `mode` is `CellMode::Delta` + // even though its semantic value overflowed i8 — by contract the + // caller has acknowledged the precision loss. + match escape_next { + Some(next) => { + let idx = *next; + *next = next.wrapping_add(1); + LeafCu::escape(ctx.basin_idx, idx) + } None => { - // Lossy fallback: clamp to i8 range. Caller is responsible - // for noticing that the reconstruction won't be bit-exact. 
let clamped = ctx.delta_i32.clamp(-128, 127) as u8; LeafCu::delta(ctx.basin_idx, clamped) } @@ -301,7 +326,7 @@ mod tests { #[test] fn skip_when_delta_is_zero() { - let leaf = predict_intra(&ctx_with_neighbours(100, 0, [None; 4]), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 0, [None; 4]), &IntraConfig::default(), None); assert_eq!(leaf, LeafCu::skip(100)); } @@ -310,14 +335,14 @@ mod tests { // δ=0 trumps everything else, even a perfect Merge candidate. let nb = LeafCu::delta(100, 0); let neighbours = [Some(&nb), None, None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, 0, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 0, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Skip); } #[test] fn delta_in_i8_range() { for d in [-128i32, -1, 1, 127] { - let leaf = predict_intra(&ctx_with_neighbours(100, d, [None; 4]), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, d, [None; 4]), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Delta); assert_eq!(leaf.delta, Some(d as u8)); } @@ -328,7 +353,7 @@ mod tests { // Northern neighbour: Delta-mode, same basin, same δ as us. let nb_north = LeafCu::delta(100, 17); let neighbours = [Some(&nb_north), None, None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Merge); assert_eq!(leaf.merge_dir, Some(MergeDir::North)); assert_eq!(leaf.basin_idx, 100); @@ -340,7 +365,7 @@ mod tests { // reference frame). Falls through to Delta. 
let nb_north = LeafCu::delta(99, 17); let neighbours = [Some(&nb_north), None, None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Delta); } @@ -351,19 +376,35 @@ mod tests { let nb_merge = LeafCu::merge(100, MergeDir::North); let nb_esc = LeafCu::escape(100, 0); let neighbours = [Some(&nb_skip), Some(&nb_merge), None, Some(&nb_esc)]; - let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Delta); } #[test] - fn merge_picks_first_hit_in_nesw_order() { + fn merge_picks_first_hit_in_news_order() { // Both N and E qualify; encoder must pick N (lower index). let nb_match = LeafCu::delta(100, 17); let neighbours = [Some(&nb_match), Some(&nb_match), None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.merge_dir, Some(MergeDir::North)); } + #[test] + fn merge_slot_2_maps_to_west_and_slot_3_to_south() { + // Slot-3 South coverage gap noted in review. Verify the + // discriminant order (N=0, E=1, W=2, S=3) is reflected at + // the merge_dir output, not just NEWS-by-convention. 
+ let nb = LeafCu::delta(100, 17); + + let only_west = [None, None, Some(&nb), None]; + let leaf_w = predict_intra(&ctx_with_neighbours(100, 17, only_west), &IntraConfig::default(), None); + assert_eq!(leaf_w.merge_dir, Some(MergeDir::West)); + + let only_south = [None, None, None, Some(&nb)]; + let leaf_s = predict_intra(&ctx_with_neighbours(100, 17, only_south), &IntraConfig::default(), None); + assert_eq!(leaf_s.merge_dir, Some(MergeDir::South)); + } + #[test] fn merge_negative_delta_via_wrapping_cast() { // δ = -17 packs to 0xEF (= 239 as u8). Neighbour stored as @@ -371,20 +412,35 @@ mod tests { // saturating. let nb_match = LeafCu::delta(100, (-17_i32) as u8); let neighbours = [None, Some(&nb_match), None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, -17, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, -17, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Merge); assert_eq!(leaf.merge_dir, Some(MergeDir::East)); } #[test] fn escape_when_delta_overflows_i8_and_allocator_present() { - let cfg = IntraConfig { - escape_next_idx: Some(42), - }; - let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &cfg); + let mut next = 42u32; + let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default(), Some(&mut next)); assert_eq!(leaf.mode, CellMode::Escape); assert_eq!(leaf.escape_idx, Some(42)); assert_eq!(leaf.basin_idx, 100); + // Cursor advanced so the next Escape gets a fresh idx. + assert_eq!(next, 43); + } + + #[test] + fn escape_allocator_advances_across_batched_calls() { + // Regression: two consecutive Escape decisions must not + // collide on the same vector slot. With a `&mut u32` cursor + // the kernel post-increments, so cell A sees idx N and + // cell B sees idx N+1. 
+ let mut next = 5u32; + let a = predict_intra(&ctx_with_neighbours(7, 999, [None; 4]), &IntraConfig::default(), Some(&mut next)); + let b = predict_intra(&ctx_with_neighbours(7, -999, [None; 4]), &IntraConfig::default(), Some(&mut next)); + assert_eq!(a.escape_idx, Some(5)); + assert_eq!(b.escape_idx, Some(6)); + assert_eq!(next, 7); + assert_ne!(a.escape_idx, b.escape_idx); } #[test] @@ -392,14 +448,14 @@ mod tests { // Without an escape_next_idx, the encoder clamps to i8 range. // The result is a valid LeafCu but the reconstruction won't // be bit-exact. - let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Delta); assert_eq!(leaf.delta, Some(127)); } #[test] fn escape_lossy_fallback_negative_overflow() { - let leaf = predict_intra(&ctx_with_neighbours(100, -1000, [None; 4]), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, -1000, [None; 4]), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Delta); assert_eq!(leaf.delta, Some((-128_i32) as u8)); } @@ -412,7 +468,7 @@ mod tests { use super::super::mode::{pack_leaf, unpack_leaf}; let nb = LeafCu::delta(100, 17); let neighbours = [None, Some(&nb), None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None); assert_eq!(leaf.mode, CellMode::Merge); let mut buf = [0u8; 6]; @@ -438,7 +494,7 @@ mod tests { // clamp fallback because no allocator is wired). 
let nb_alias = LeafCu::delta(100, 0xC8); let neighbours = [Some(&nb_alias), None, None, None]; - let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default()); + let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default(), None); assert_ne!(leaf.mode, CellMode::Merge, "overflow δ must not Merge"); // With no allocator the encoder clamps to +127 (lossy Delta). assert_eq!(leaf.mode, CellMode::Delta); @@ -449,12 +505,11 @@ mod tests { fn overflow_delta_with_allocator_takes_escape() { let nb_alias = LeafCu::delta(100, 0xC8); let neighbours = [Some(&nb_alias), None, None, None]; - let cfg = IntraConfig { - escape_next_idx: Some(7), - }; - let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &cfg); + let mut next = 7u32; + let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default(), Some(&mut next)); assert_eq!(leaf.mode, CellMode::Escape); assert_eq!(leaf.escape_idx, Some(7)); + assert_eq!(next, 8); } #[test] From 01c77cccb7811a034920829faa816e0effcf08d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 07:30:21 +0000 Subject: [PATCH 4/5] =?UTF-8?q?docs(codec):=20cross-domain=20synergies=20?= =?UTF-8?q?=E2=80=94=20x265=20=E2=87=84=203DGS=20=E2=87=84=20attention=20?= =?UTF-8?q?=E2=87=84=20SGD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 722-line knowledge doc capturing the architectural claim that PR-X12's primitives (LeafCu, pack_leaf, predict_intra, Ctu, eventual rANS) are not video-codec-specific — they serve four loads on a single predictive-coder substrate: 1. Cognitive cells (the original target) 2. 3D Gaussian Splat coefficient compression 3. Transformer attention sparsification 4. 
Distributed-SGD gradient streaming Sections: - The 4-load isomorphism (§ 2) and 16-state (CellMode × MergeDir) classification table across all four domains - Primitive → load mapping matrix (§ 3) with file:line refs - 10 cross-domain epiphanies (§ 4) — including MergeDir-as-topology, predict_intra-as-attention-sparsifier, escape_next-as-allreduce-slot, mode-coded LoRA - 6 integration plans (§ 5) with worker × week + dependency graph - 8 exploration paths (§ 6) for sprint-scale research - 6 holy grail outcomes (§ 7) — sub-1-bit/Gaussian 3DGS, bit-exact attention, federated SGD at 8-16×, Lance substrate identity - 10 codec-side debts (§ 8) and 13 stack-side debts (§ 9) - Sequencing summary (§ 10): critical path = A7 rANS Companion to pr-x12-codec-x265-design.md (the mechanical design); a "See also" breadcrumb added there points here. --- .claude/knowledge/pr-x12-codec-x265-design.md | 6 + .../pr-x12-cross-domain-synergies.md | 722 ++++++++++++++++++ 2 files changed, 728 insertions(+) create mode 100644 .claude/knowledge/pr-x12-cross-domain-synergies.md diff --git a/.claude/knowledge/pr-x12-codec-x265-design.md b/.claude/knowledge/pr-x12-codec-x265-design.md index 182d2849..4d80fa62 100644 --- a/.claude/knowledge/pr-x12-codec-x265-design.md +++ b/.claude/knowledge/pr-x12-codec-x265-design.md @@ -8,6 +8,12 @@ > **Depends on**: PR-X10 (linalg-core), PR-X3 BlockedGrid (shipped). > **Used by**: PR-X9 (basin-codebook lazy storage) — the codec encodes > cognitive cells into skip/merge/delta/escape modes. +> +> **See also**: `pr-x12-cross-domain-synergies.md` — the architectural +> claim that the primitives in this design doc *also* serve 3DGS +> coefficient compression, transformer attention sparsification, and +> distributed-SGD gradient streaming. Holy grail outcomes, integration +> plans, and honest debt catalogue live there. 
## Why diff --git a/.claude/knowledge/pr-x12-cross-domain-synergies.md b/.claude/knowledge/pr-x12-cross-domain-synergies.md new file mode 100644 index 00000000..ee074059 --- /dev/null +++ b/.claude/knowledge/pr-x12-cross-domain-synergies.md @@ -0,0 +1,722 @@ +# PR-X12 — Cross-Domain Synergies: x265 ⇄ 3D Gaussian Splat ⇄ Cognitive Shaders ⇄ BLAS/MKL + +> READ BY: savant-architect, codec-architect, cognitive-architect, +> splat-architect, vector-synthesis, l3-strategist, sentinel-qa, +> product-engineer. +> +> Status: epiphany-grade exploration doc, drafted 2026-05-22 during +> the PR-195 (A2 + A3-intra) review cycle. +> +> Companion to `.claude/knowledge/pr-x12-codec-x265-design.md` (the +> mechanical design). This doc captures the **why-it-generalizes** +> that the design doc deliberately scopes out. + +## TL;DR + +PR-X12 was framed as "x265 for cognitive cells" — the mechanical +design doc already maps x265 onto BlockedGrid. The deeper observation +this doc commits to is that the **same primitives — `LeafCu`, +`pack_leaf`, `predict_intra`, `Ctu`, rANS — also serve 3D Gaussian +splat coefficient compression, transformer attention sparsification, +and distributed-SGD gradient streaming.** The four domains are not +analogous; they are **four loads on a single predictive-coder +substrate.** This doc: + +1. Names the isomorphism precisely (§ 2) +2. Maps every codec primitive to its load in each domain (§ 3) +3. Calls out the epiphanies — cross-domain insights I have not seen + in print (§ 4) +4. Lays out integration plans with concrete PR-arc estimates (§ 5) +5. Catalogues exploration paths that warrant a sprint, not a PR (§ 6) +6. States the holy grail outcomes that fall out if it all lands (§ 7) +7. Honest debt accounting — codec side (§ 8) and existing stack + side (§ 9). No marketing. + +## 0. 
Audience preconditions + +This doc assumes the reader has internalised: + +- `Ctu` / `CtuArena` / `CtuPartition` / `LeafCu` / `CellMode` / + `MergeDir` from `src/hpc/codec/ctu.rs` (PR-170). +- `pack_leaf` / `unpack_leaf` / `pack_header` / `predict_intra` from + `src/hpc/codec/mode.rs` + `src/hpc/codec/predict.rs` (PR-195). +- The Click P-1 method discipline (operations on carriers, not free + functions) and the data-flow rule (no `&mut self` during compute). +- The cognitive cell → basin codebook story from + `.claude/knowledge/pr-x12-codec-x265-design.md` § "Core types". +- Inria 3DGS paper (Kerbl et al. 2023) + EWA Splatting (Zwicker 2001). +- That the cognitive `splat.rs` in `lance-graph-contract` is sacred + and **separate** from `splat3d::*` (the geometric forward renderer + shipped in PRs 1-7 of the May sprint). + +If any of the above is fuzzy, read those sources first; the rest of +this doc compresses. + +## 1. The four loads + +| Load | Carrier | Per-element payload | Predictability source | +|------|---------|--------------------|----------------------| +| **Cognitive cell** | `BlockedGrid` | 64-bit fingerprint | basin codebook (per-frame), spatial NEWS neighbours | +| **3DGS Gaussian** | SoA `(μ, scale, rot, opacity, SH)` | ~236 bytes raw | sorted-along-curve neighbours, basin (color/scale clusters) | +| **Transformer attention** | `(Q, K, V)` per (head, token) | Q,K,V vectors | KV palette clusters, previous-token attention pattern | +| **Distributed SGD gradient** | per-parameter `∂L/∂w` | FP32 grad | mini-batch siblings, gradient sparsity, sign agreement | + +All four loads share the same predictive-coding skeleton: + +``` + ┌──────────────────────────────────────┐ + │ 1. Build basin codebook (offline or │ + │ online k-means on the carrier) │ + └──────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ 2. 
Resolve nearest basin per element │ + │ → (basin_idx, δ from basin) │ + └──────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ 3. Mode-decide per element: │ + │ Skip (δ=0) │ + │ Merge (δ matches NEWS neighbour) │ + │ Delta (δ fits 8-bit) │ + │ Escape (full payload, idx into │ + │ per-frame escape vector) │ + └──────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ 4. Pack LeafCu (2/3/3/6 bytes) into │ + │ bytestream │ + └──────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ 5. rANS-encode the bytestream with │ + │ per-frame frequency tables (A7) │ + └──────────────────────────────────────┘ +``` + +Steps 1-5 are domain-agnostic. **What changes per load is the basin +clustering rule (step 1) and the escape payload (step 4's Escape +branch).** Everything else is shared kernel. + +## 2. The isomorphism + +**Claim:** `LeafCu` is a **discriminated union over (basin_ref, +residual) representations**, parameterised by a 2-bit predictability +class. The four bits across `(CellMode × MergeDir)` form a 16-state +classification machine that is not specific to video or cognitive +content. It is the natural mode-coding alphabet for any signal that +is: + +- **Locally predictable** from a small per-frame codebook +- **Spatially smooth** in a defined neighbour topology (NEWS, in + PR-X12 today; trivially generalisable to 6-way XYZ or + token-sequential) +- **Heavy-tailed** in its residuals (most values fit a small δ; + rare values need full Escape) + +All four loads named in § 1 satisfy these three properties. The +codec we shipped is therefore not "an HEVC port"; it is the +**reference encoder for predictable-codebook signals**. HEVC is one +consumer. 
+ +### The 16-state classification table + +`(CellMode, MergeDir)` cross product, repurposed per domain: + +| Mode × Dir | Cognitive cell | 3DGS Gaussian | Attention | Gradient | +|------------|----------------|---------------|-----------|----------| +| Skip, — | cell = basin exactly | Gaussian = palette splat exactly | Q has no significant K | grad ≈ 0 (sparse update) | +| Merge, N | inherit δ from N-neighbour | inherit from prev-Morton Gaussian | inherit attention from prev-token | inherit grad from prev-layer sibling | +| Merge, E | inherit from E-neighbour | inherit from next-Morton | inherit from next-token | inherit from next-layer | +| Merge, W | inherit from W-neighbour | inherit from coarse-tier parent | inherit from prev-head | inherit from prev-iteration | +| Merge, S | inherit from S-neighbour | inherit from fine-tier child | inherit from next-head | inherit from next-iteration | +| Delta, — | 8-bit cell perturbation | 8-bit residual on (μ, scale, op) | 8-bit attention weight δ | 8-bit grad (QSGD, signSGD-magnitude) | +| Escape, — | full 64-bit fingerprint via idx | full SH coeffs ≥ L=2 via idx | full FP16 Q vector via idx | full FP32 grad via idx | + +`MergeDir`'s 4-way alphabet is **already the natural carrier** for +"inherit from one of 4 neighbours in some topology". The topology +varies per load; the encoding does not. + +## 3. Primitive → load mapping matrix + +This is the dense one. Each row is one primitive from PR-X12; the +columns are the four loads. Cells say what the primitive does in +that load, with file/line refs back to ndarray master. 
+ +### 3.1 Carrier primitives + +| Primitive | Cognitive cell | 3DGS Gaussian | Attention | Gradient SGD | +|-----------|----------------|---------------|-----------|--------------| +| `Ctu` (`ctu.rs:285`) | one L1 BlockedGrid block (64×64 cells) | one tile-bin or one octree node (64-256 Gaussians) | one (token-window × heads) block (typically 64×16) | one parameter-shard (64K weights) | +| `CtuArena` (`ctu.rs:212`) | 85-node quad-tree per CTU | tile quad-tree (LOD cascade) | token-window prefix-tree | per-shard residual hierarchy | +| `CtuPartition` (`ctu.rs:193`) | recursive 64→32→16→8 split | tile 64×64 → 16×16 → 4×4 LOD | window 64→16→4 attention granularity | shard 64K→16K→4K gradient grouping | +| `LeafCu` (`ctu.rs:114`) | one cell's encoded mode | one Gaussian's encoded mode | one (head, token-position)'s mode | one weight's gradient mode | +| `MAX_BASIN_IDX = 4095` (`mode.rs:62`) | 4096-entry basin codebook | 4096-entry palette (μ_color × scale clusters) | 4096-entry KV cluster centroid | 4096-entry gradient-pattern bank | +| `BASIN_NONE` (`mode.rs:71`) | cell outside any basin | Gaussian outside palette range | Q outside KV palette (forces Escape) | grad outside known patterns | + +### 3.2 Encoder primitives + +| Primitive | Cognitive cell | 3DGS Gaussian | Attention | Gradient SGD | +|-----------|----------------|---------------|-----------|--------------| +| `pack_header(mode, basin_idx)` (`mode.rs:83`) | 16-bit cell header | 16-bit Gaussian header | 16-bit (head, token) header | 16-bit weight header | +| `pack_leaf` (`mode.rs:172`) | 2/3/3/6 byte cell record | 2/3/3/N byte Gaussian record (N depends on SH order in Escape) | 2/3/3/Q-width byte attention record | 2/3/3/4-byte weight gradient record | +| `predict_intra` (`predict.rs:186`) | encoder picks mode for cell | encoder picks mode for Gaussian per-Morton-step | encoder picks mode for (Q, K) pair | encoder picks mode for ∂L/∂w | +| `IntraContext.neighbours` (`predict.rs:117`) | NEWS spatial 
neighbours | prev/next Morton-sorted neighbours + parent/child tier | prev/next token + prev/next head | prev/next layer + prev/next iter | +| `IntraConfig` (`predict.rs:132`) | (future RDO knobs) | (future LOD/PSNR tradeoff knobs) | (future accuracy/latency knobs) | (future compression/convergence knobs) | +| `escape_next: Option<&mut u32>` (`predict.rs:202`) | escape vector cursor for full-payload cells | escape vector cursor for SH-heavy Gaussians | escape vector cursor for outlier Q | escape vector cursor for outlier grads | + +### 3.3 Wire-format primitives (deferred to A7/A8) + +| Primitive | Cognitive cell | 3DGS Gaussian | Attention | Gradient SGD | +|-----------|----------------|---------------|-----------|--------------| +| rANS encoder (A7) | per-frame basin-frequency table | per-asset palette-frequency table | per-context attention-pattern frequency | per-layer gradient-mode frequency | +| Stream framing (A8) | CTU markers, frame headers | tile-bin markers, asset headers | window markers, batch headers | shard markers, iter headers | +| Escape vector | per-frame `Vec` of full fingerprints | per-asset `Vec` | per-context `Vec` | per-shard `Vec` | + +## 4. Epiphanies + +Cross-domain insights worth flagging because each has 1-3 papers' +worth of novelty if pursued. None of these are in print as of the +literature snapshot I'm working from; **claim** is the right word, not +"finding". + +### E1. **`MergeDir` is a topology, not a direction.** + +`{North, East, West, South}` happens to be a 2D Cartesian raster +mental model. The codec doesn't care. The discriminant alphabet just +needs to be a 4-way categorical over "which of 4 neighbours did I +inherit from". In 3DGS that's `{prev-Morton, next-Morton, parent-LOD, +child-LOD}`. In attention that's `{prev-token, next-token, prev-head, +next-head}`. In SGD that's `{prev-iter, next-iter, prev-layer, +next-layer}`. 
**No code change required.** The doc + the docstring +in `IntraContext.neighbours` are the only constraints; the 2-bit +encoding is topology-free. → write up as: "Carrier-agnostic Merge +inheritance via parameterised 4-neighbour topology" (mini-paper). + +### E2. **`predict_intra` already encodes attention sinks.** + +The "Skip" mode case in `predict_intra` (`predict.rs:189-190`) — +returns when `delta_i32 == 0` — is exactly the attention-sink +phenomenon Streaming-LLM, H2O, SnapKV chase. Their attention mass +concentrates on a tiny subset of tokens; the rest are "Skip". With +the basin codebook as KV cluster centroids, **`predict_intra` is a +zero-shot attention sparsifier**: it labels every (Q, K) pair as +Skip/Merge/Delta/Escape and the wire cost is monotone in attention +mass. Combine with the rANS A7 and you get **bit-exact KV-cache +compression with a tunable accuracy floor**. The encoder is shipped. + +### E3. **`escape_next: &mut u32` is the lineage of gradient streaming.** + +The owner-author review's P1 — escape allocator collision — is the +exact issue federated-SGD papers solve with "all-reduce buckets": +multiple workers each emit gradient deltas, the aggregator needs +non-colliding slots in a shared vector. The `Option<&mut u32>` cursor +**is the all-reduce slot allocator**, just per-CTU instead of +per-batch. Lift it to a worker-pool API and you have a federated +gradient codec without writing new code. + +### E4. **The 64-bit `Fingerprint` and a 3DGS Gaussian's first-six floats compress identically.** + +Cognitive `Fingerprint` is 64 bits = 4×16-bit lanes. A 3DGS Gaussian's +`(μ_x, μ_y, μ_z, scale_x, scale_y, scale_z)` is 6 × FP16 quantised = +96 bits, but with 32 high bits dominated by the scale envelope which +is locally constant per palette basin. After basin subtraction, the +residual is ~64 bits — **identical to the cognitive cell case**. The +same `pack_leaf` works. 
The escape vector type changes from `u64` to +`[u16; 6]` but the codec is structurally invariant. + +### E5. **The Morton/Hilbert sort along which we encode 3DGS Gaussians is the EXACT spatial structure HEVC's macroblock raster scan implements in 2D.** + +HEVC's CTU traversal is z-order. 3DGS Gaussians sorted Morton/Hilbert +along their μ are z-order in 3D. The encoder doesn't know it's seeing +3D content; the spatial coherence in 1D-along-curve is identical to +2D-along-raster. **The CTU partition machinery in `ctu.rs` ports to +3DGS with zero changes to the partition logic.** What changes is the +predicate that decides when to split (variance of (μ, scale, opacity) +inside the node vs. PSNR target). + +### E6. **rANS with per-frame frequency tables is the *only* entropy coder that scales to 10⁶+ tokens.** + +CABAC is fine for video at ~10⁵ macroblocks/frame. Attention at +10⁶+ tokens/sec needs an entropy coder whose state machine fits in +L1 cache and whose throughput is gated by table lookup, not by the +serial CABAC interval renormalisation. rANS is that. **A7 is the +critical piece; without it the codec is academic.** Prioritise. + +### E7. **The 4096-entry basin codebook is identical to attention's KV palette identity in lance-graph.** + +This is the architectural payoff. `lance-graph::SpoDistanceMatrices` +computes (basin_id, distance) for SPO triples at 611M lookups/sec +(see CLAUDE.md "Session: Qwen3.5 × Opus 4.5/4.6"). The same data +structure feeds the cognitive codec basin lookup AND the attention +KV-cluster lookup AND the 3DGS palette nearest-neighbour. **One +codebook, three consumers, identical lookup kernel.** Lance is the +column substrate; the codebook is its first "logical schema" — and +that schema is shared. + +### E8. **Mode-coding is parameter-efficient supervised LoRA.** + +A LoRA adapter on a weight matrix `W` is a rank-r perturbation +`W + ΔW = W + B·A`. Express `ΔW` as a `BlockedGrid` and +mode-code it.
Most weights are Skip (no LoRA contribution), some +inherit from neighbours (Merge), a few have small per-weight deltas +(Delta), and the heavy hitters are Escape. **`LeafCu`-coded LoRA is +~10× smaller than rank-32 LoRA on weight matrices > 4096².** The +codec is the parameter-efficient fine-tuning representation. +The user's "Pertuberationslernen" instinct lands here. + +### E9. **The `splat3d` PRs 1-7 (May sprint) and the `codec` PRs are the SAME pipeline shifted 90°.** + +The splat3d forward pipeline is: project → tile-bin → mode-decide +(which Gaussian contributes at which pixel) → alpha-composite. The +codec pipeline is: build codebook → block-partition → mode-decide +(which mode each cell takes) → entropy-code. **Both end in +mode-decide → reduce.** The mode-decide kernel is `predict_intra` +in both cases; the reduction differs (alpha vs. rANS). A unified +"mode-decide + reduce" trait would collapse 2 KLoC. **Worth a +sprint, not a PR**. + +### E10. **The lossy Escape fallback is a PSNR knob in disguise.** + +The owner-review's P2 nit — "lossy Escape emits `CellMode::Delta`, +the docstring lies" — is a feature, not a bug, **iff** we expose a +"lossy_threshold: u8" config. Then the fallback becomes "use Delta +for any |δ| ≤ threshold even if it would normally Escape". That's +the rate-distortion knob HEVC's λ-RDO tunes. **Promote the +docstring acknowledgement into a config field in A6 RDO.** + +## 5. Integration plans + +Concrete branches/PRs, each with effort estimate + dependency. +Listed by priority (impact ÷ risk). + +### Plan A — A7 rANS (critical, no domain-specific blockers) + +**Effort:** 1 worker × 1 week. Standard rANS, single-symbol, +encoder + decoder + parity test. Consumes `pack_leaf` output, emits +compressed bytestream. + +**File:** `src/hpc/codec/ans.rs` (new). + +**Dependency:** none — A2 + A3-intra are sufficient input. + +**Why first:** without entropy coding, the codec gives 2-3× over +raw. With rANS at per-frame frequency tables, 6-10×. 
Below the +rANS threshold, the codec is academic. + +### Plan B — A3-inter (cross-tier neighbour scan) (codec-side completion) + +**Effort:** 1 worker × 3 days. Extend `IntraContext.neighbours` to +include parent-tier and child-tier neighbours from `BlockedGrid`'s +L2/L3 cascade. Mode-decision tree gains 8 candidates instead of 4. + +**File:** `src/hpc/codec/predict.rs` (extend) + new `inter.rs`. + +**Dependency:** PR-X3 BlockedGrid L2/L3 cascade (shipped). + +**Why second:** unlocks the recursive partition compression. Without +inter prediction, parent-tier basins don't seed child-tier deltas. + +### Plan C — EWA SYRK-batched (3DGS performance, no codec changes) + +**Effort:** 1 worker × 1 week. Replace `sandwich_x16` per-Gaussian +loop with batched `cblas_ssyrk`. Add backend-dispatch (native / +intel-mkl / openblas). + +**File:** `src/hpc/splat3d/spd3.rs` (extend) + +`src/backend/{native,mkl,openblas}.rs` (add syrk wiring). + +**Dependency:** ndarray BLAS backend infra (shipped). + +**Why third:** biggest pure-FLOPS win, splat-aligned, no codec +coupling. Hits the holy grail outcome §7.1. + +### Plan D — Attention codec PoC (cognitive-side new ground) + +**Effort:** 2 workers × 2 weeks. Wire `predict_intra` against a +synthetic KV cache; build the basin codebook via mini-batch k-means +on K vectors; measure compression vs. accuracy on a known LLM +benchmark (LongBench, RULER). + +**File:** new crate `crates/attention-codec/` consuming +`ndarray::hpc::codec::*`. + +**Dependency:** Plan A (rANS) for realistic compression numbers. + +**Why fourth:** highest-novelty load; depends on A7 to be convincing. + +### Plan E — 3DGS coefficient codec (splat-side compression) + +**Effort:** 2 workers × 3 weeks. Morton-sort a trained scene's +Gaussians, build per-asset palette codebook via k-means over +(color, scale), mode-code the residuals through `pack_leaf`, rANS +through A7. + +**File:** new module `src/hpc/splat3d/codec.rs`. 
+ +**Dependency:** Plan A (rANS), Plan B (A3-inter for LOD cascade). + +**Why fifth:** highest engineering value, but has external benchmark +risk — Inria's PLY format has format-stability constraints we'd +need to negotiate (or just ship a parallel format). + +### Plan F — Gradient streaming codec (federated SGD) + +**Effort:** 2 workers × 4 weeks. Workers emit `LeafCu` streams; the +aggregator decodes and applies. Requires a `&mut u32` allocator +generalised across worker pools (see E3). + +**File:** new crate `crates/grad-codec/`. + +**Dependency:** Plan A, Plan B. + +**Why sixth:** highest research novelty; lowest near-term ROI +(federated SGD is a niche stack). + +## 6. Exploration paths + +Things that warrant a sprint or research session, not a single PR. +Each has at least one unresolved question that disqualifies it from +"integration plan" status. + +### X1. Carrier-agnostic 4-neighbour topology trait + +Design a `trait NeighbourTopology` that +`IntraContext` consumes generically. Cognitive: N=4 NEWS. 3DGS: N=4 +(prev/next-Morton, parent/child-LOD). Attention: N=4 (prev/next +token, prev/next head). SGD: N=4 (prev/next iter, prev/next layer). +Compile-time-resolved, zero-cost. **Open question:** does mode-coding +generalise to N=6 (3D XYZ)? Two more `MergeDir` discriminants needed; +bit-budget impact on the wire format. + +### X2. Hierarchical motion estimation as cross-tier prediction + +HEVC's hierarchical ME (4-tier coarse-to-fine pyramid) maps onto +the BlockedGrid L1/L2/L3/L4 cascade. **Open question:** the cost +function. HEVC uses SAD on luma; cognitive uses Hamming on +Fingerprints; 3DGS uses PSNR on rendered tiles. Three cost +functions, one search structure — is the hierarchical-ME logic +worth the abstraction? + +### X3. CABAC vs. rANS for attention KV cache + +CABAC's serial dependency caps throughput at ~10⁸ symbols/sec on +modern CPU. rANS gets ~10⁹. 
**Open question:** does the latency +floor matter for attention's real bottleneck (memory bandwidth, +not entropy decode)? Bench before committing to A7. + +### X4. SH coefficient intra-prediction in spectral space + +Predict L=2, L=3 SH from a learned linear function of L=0, L=1. +**Open question:** is the linear function global or per-basin? Per- +basin is more expensive but probably 2× better; need data to +decide. Inria's stock 3DGS dataset (Mip-NeRF 360, T&T, Deep +Blending) is the benchmark. + +### X5. Mode-coded LoRA + +E8 above. **Open question:** does Skip-heavy `ΔW` retain LoRA's +fine-tuning quality? Run a controlled experiment on a Qwen3.5-7B +checkpoint with LoRA rank 8 vs. mode-coded ΔW at the same byte +budget. Measure on MMLU-redux + a downstream task. + +### X6. Unified `mode_decide + reduce` trait (E9) + +Generalise `predict_intra` so it's parameterised on the **reduction +operator**: alpha-composite (3DGS), rANS-encode (codec), +sum-reduce (SGD all-reduce), softmax (attention). **Open +question:** does a single trait actually compose, or does each +domain need its own bespoke variant? Risk: premature abstraction. + +### X7. Lance column substrate as the universal palette codebook backing store + +`SpoDistanceMatrices` at 611M lookups/sec, 388 KB RAM. If we +extend it to handle (basin_centroid → idx) lookups for all four +loads in § 1, we get one column-store serving cognitive cells, +KV palettes, 3DGS palette, and gradient-pattern banks. **Open +question:** the centroid distance function differs per load +(Hamming for fingerprints, L2 for Gaussians, cosine for Q vectors, +sign-vote for gradients). Does `SpoDistanceMatrices` accept +pluggable metrics? + +### X8. AMX TDPBF16PS for batched EWA sandwich + +The `M · Σ · Mᵀ` operation on 16 Gaussians at a time fits AMX's +16×16 BF16 tile exactly. **Open question:** the precision loss +from BF16 vs. 
FP32 on 2D conic invertibility — preliminary lit +search says fine, but needs Pillar-7-style probe before commit. + +## 7. Holy Grail material + +If all of § 5 + § 6 land, the following outcomes fall out. None +are guaranteed; each is the "yes, that worked" branch. + +### HG1. **One codec, four loads.** + +A unified bytestream format codes cognitive cells, 3DGS scenes, +KV caches, and gradient streams interchangeably. The Lance column +substrate stores them all in the same Arrow-backed layout. A +single `cargo install` ships compression for video-codec-equivalent ++ all four cognitive/ML loads. + +Marketing line: *"x265 was a codec for one signal. PR-X12 is a +codec for the manifold of predictable codebook-coded signals."* + +### HG2. **Sub-1-byte-per-Gaussian 3DGS compression.** + +Stock 3DGS: ~250 bytes/Gaussian raw, ~50 bytes after PLY-trim. +PR-X12 mode-coded + A7 rANS: ~3-8 bits/Gaussian for the dominant +modes. **30-60× over current state of the art.** A 1M-Gaussian +scene fits in ~500 KB instead of 50 MB. Streamable as a video. + +### HG3. **Bit-exact attention with tunable accuracy floor.** + +`predict_intra` over (Q, K) palette gives an attention sparsifier +that is bit-exact at the "Escape always" setting and gradually +loses precision as Skip/Merge/Delta dominate. The accuracy floor +is a single knob (`escape_threshold: u8`) — no per-model tuning. +Streaming-LLM, H2O, SnapKV become consumers of one codec. + +### HG4. **Federated SGD at 8-16× compression with zero accuracy loss.** + +Worker→aggregator gradient streams via `LeafCu`. Skip-mode kills +noise; Merge-mode discovers parameter sharing online; Delta-mode +gives QSGD; Escape-mode preserves outliers. The compression is +free because the codec already exists. + +### HG5. **Lance column-substrate identity becomes the ground truth.** + +The same Arrow buffer feeds: cognitive cell storage, 3DGS Gaussian +SoA, KV cache, gradient shards. The codec encodes the same bytes +across all four.
`lance-graph::SpoDistanceMatrices` becomes the +universal palette codebook lookup. ndarray = hardware; lance = +substrate; codec = compression; PR-X12 closes the substrate loop. + +### HG6. **The "splat3d × x265" bet pays out as one library.** + +The May splat3d sprint (PRs 1-7) gave a CPU-SIMD 3DGS renderer. +PR-X12 gives the codec. Combined, the same library compresses, +streams, decodes, and renders 3D scenes in real-time on a single +core. **The combination is novel; neither half is.** + +## 8. Codec-side technical debt + +Honest accounting. PR-X12 shipped A2 + A3-intra; what we owe +ourselves to make the rest of this doc bankable: + +### D-CODEC-1. A3-inter is unwritten. (P1) + +The `IntraContext` consumes 4 NEWS neighbours; the design doc +calls for parent-tier + child-tier extension. Without inter +prediction, the BlockedGrid L2/L3/L4 cascade contributes nothing +to compression. **Plan B in § 5.** + +### D-CODEC-2. rANS A7 is unwritten. (P0 for any real benchmark) + +Without entropy coding, the per-mode bit budget is rounded to +bytes. 2 bits/cell achievable becomes 2 bytes/cell shipped — 8× +overhead. Plan A. + +### D-CODEC-3. λ-RDO A6 is unwritten. (P1) + +Mode-decision is greedy (cheapest-fit wire cost). Real codecs +trade bits for distortion via λ-RDO. Without it, the codec +cannot be tuned for accuracy/compression trade-off — the lossy +Escape fallback is the only knob and it's binary. + +### D-CODEC-4. Stream framing A8 is unwritten. (P1) + +`pack_leaf` writes raw `LeafCu` records back-to-back. No frame +boundaries, no CTU markers, no error recovery. Live streaming +needs all three. + +### D-CODEC-5. The basin codebook is **not built**. (P1, blocks all loads) + +The codec assumes `basin_idx` comes from somewhere. For cognitive +cells the somewhere is `OgitBridge` (downstream). For 3DGS, +attention, SGD — the codebook construction is per-load, k-means +over the carrier, no shared infra yet. + +### D-CODEC-6. The lossy Escape fallback is a footgun. 
(P3) + +Owner-review noted the docstring acknowledges the lie. Long-term: +promote to a config field (E10). Short-term: docstring is fine. + +### D-CODEC-7. NEWS topology is hard-coded. (P2) + +`merge_dir_from_index` in `predict.rs:281` is a 4-way match. The +codec is not generic over topology yet. Plan X1 — exploration. + +### D-CODEC-8. No SIMD-batched CTU sweep. (P2) + +`predict_intra` is scalar; per-CTU at 64×64 = 4096 cells, the +SIMD opportunity is obvious (16 cells per `F32x16` lane). Deferred +until reference + reconstruction parity test land. + +### D-CODEC-9. No `Result`-shaped error variant. (P3) + +`pack_leaf` returns `Option`. Real errors (buffer too +short, mode-decision inconsistency) lose semantics. Promote to +a typed `enum CodecError`. + +### D-CODEC-10. The mode 2-bit encoding pins us to ≤4 modes. (P3, architectural) + +`pack_header` puts 2 bits at bits 12-13 of u16, leaving 2 reserved +high bits. Future "mode 5" (e.g., a 16-bit Delta variant for +splat) needs to claim bit 14. **Plan the upgrade path in the +design doc before shipping A7.** + +## 9. Stack-side technical debt when combining synergies + +The harder accounting. PR-X12 fits cleanly into ndarray. But when +we wire the synergies of §§ 4-7, the **existing stack** has debts +that get worse, not better, under multi-load pressure. Honest +catalogue: + +### D-STACK-1. `BlockedGrid` block size is fixed at 64×64. (P1 if 3DGS lands) + +3DGS tiles in the splat3d crate are 16×16. The codec assumes 64×64 +CTUs. The pre-sprint prompt for `pr-x12` aligns them at L1 = 64×64 +of cognitive cells. **For 3DGS coefficient compression**, the +natural CTU is one tile = 16×16. Mismatch: either generalise +`Ctu` over block size (preferred, low cost) or maintain two block +formats (technical debt). Decide before Plan E (3DGS codec). + +### D-STACK-2. The basin codebook lookup has no SIMD path. 
(P1) + +`SpoDistanceMatrices` at 611M lookups/sec is sequential; the codec +needs **batched** lookup (1 CTU = 4096 cells × 4096 basins = 16M +distance computes). Without SIMD, the encoder is lookup-bound at +~10⁵ CTU/sec. With AVX-512 + AMX, 10⁷ CTU/sec is achievable. **Bench +before A6 RDO.** + +### D-STACK-3. `MergeDir`'s 4-way alphabet is wire-pinned. (P1 for X1) + +`cell_mode_discriminants_match_wire_codes` test pins MergeDir to +`{N=0, E=1, W=2, S=3}` on the wire. If X1 generalises topology to +6 or 8 neighbours, the wire format breaks. Plan the upgrade with a +version byte in A8 stream framing. + +### D-STACK-4. `Fingerprint` is 64-bit only. (P2 for 3DGS) + +3DGS basin residual is 96 bits (6 × FP16). Either widen +`Fingerprint` (touches truth/cascade/bf16_truth modules) or +introduce a sibling type for splat (better — keep cognitive +cells fingerprint-typed). The codec is type-generic enough to +not care, but consumers will. + +### D-STACK-5. The `splat3d` PRs do not consume `codec`. (P2) + +Currently independent. Combining E9's "mode-decide + reduce" trait +requires either (a) a shared trait crate or (b) a refactor of +both. Decide before committing to Plan E. + +### D-STACK-6. Lance column substrate exists in `lance-graph`, not `ndarray`. (P1 for HG5) + +The HG5 "Lance is the substrate" outcome requires `ndarray::hpc::codec` +to depend on `lance-graph::SpoDistanceMatrices`. Currently ndarray +is the **dependency-bottom** of the stack. Two options: invert +(ndarray depends on lance — wrong, breaks the layering rule from +CLAUDE.md "Architecture Rule"), or introduce a third crate that +both depend on. Probably the latter; needs a sprint. + +### D-STACK-7. The cognitive `splat.rs` in `lance-graph-contract` is sacred. (P0, do not touch) + +Per the sprint setup: that file is the contract. PR-X12 must never +import or refactor it. If E4 (Fingerprint ≡ 3DGS first-6-floats) +becomes provable, it'll be **tempting** to fold them. Don't.
The +abstraction boundary is load-bearing for the cognitive +architecture, even if the bit patterns rhyme. + +### D-STACK-8. No backend dispatch in the codec. (P2) + +`pack_leaf` is one implementation. EWA, BLAS, MKL all have backend +dispatch (`native` / `intel-mkl` / `openblas` features). The codec +will need: scalar / SIMD / AMX backends for the SIMD-batched CTU +sweep (D-CODEC-8). Plan when D-CODEC-8 lands. + +### D-STACK-9. The 4096-basin codebook size assumes "per-frame, reset between frames". (P3) + +For attention's KV cache, the "frame" is the (context-window, +batch-element) tuple. For 3DGS, the "frame" is the entire trained +scene (codebook is static after training). For SGD, the "frame" +is one mini-batch. **Three different lifetimes, one type.** Either +generalise lifetime (preferred) or document the discipline (likely). + +### D-STACK-10. The current PR-arc cadence is one PR per worker per day. (P2, organisational) + +The synergies in §§ 5-7 will require multi-worker coordinated +sprints (e.g., Plan D = 2 workers × 2 weeks). The autoattended +multi-agent protocol scales worker count, but the coordinator's +state machine doesn't currently model multi-week dependencies. +**Update the coordinator agent prompt before kicking off Plan D.** + +### D-STACK-11. AVX-512 is mandatory in `.cargo/config.toml`. (P1 for portability) + +CLAUDE.md: `target-cpu=x86-64-v4`. Plan F (federated SGD) implies +multi-architecture (NEON workers, AVX2 workers). Either drop the +mandatory AVX-512 or scope federated SGD to AVX-512 nodes only. + +### D-STACK-12. The cognitive `Base17` / `NarsTruth` / `TripleModel` types live in `lance-graph`. (P1 for HG3) + +HG3 (attention codec) wants to consume cognitive truth values +(NarsTruth) to gate the Escape-or-Skip decision. Same dependency +inversion as D-STACK-6. + +### D-STACK-13. No multi-domain benchmark harness. (P0 if we want to claim HG1) + +We have splat3d bench, codec tests, SpoDistanceMatrices bench +separately. 
A combined "single-codec-four-loads" benchmark — one +build, one binary, four scenarios — does not exist. Without it, +HG1 is a claim, not a demonstration. **Build the harness before +the marketing.** + +## 10. Sequencing summary + +If we commit to all of this, the order matters: + +``` + Plan A (rANS A7) + │ + ▼ + ┌─────────────────┬────────────────────┬─────────────────┐ + ▼ ▼ ▼ ▼ + Plan B (A3-inter) Plan C (EWA SYRK) Plan X8 (AMX BF16) D-STACK-2 (SIMD lookup) + │ │ + ▼ ▼ + Plan E (3DGS codec) ◄──────────────────────────────── D-STACK-1 (block size) + │ + ▼ + Plan D (attention codec) ◄──────────────────────────── D-STACK-6/12 (third crate) + │ + ▼ + Plan F (gradient codec) + │ + ▼ + HG1-HG6 unlocked +``` + +Critical path: **A7 rANS** → everything else. Without it, no +benchmark is convincing. Plan A is one worker for one week. Ship +that next; the rest of this doc is just inventory until A7 lands. + +## 11. Compaction-preservation note + +Per CLAUDE.md § Compaction Preservation, this doc must survive +summarisation. The blackboard entry should reference this file by +path; do not inline the matrix. Key facts to retain across +compaction: + +1. PR-X12 A2 + A3-intra shipped in PR-195 (master commits b39a5769, + b44fe59f). All review comments resolved or outdated. +2. The four-load isomorphism (§ 2) is the architectural claim; + everything else is sequencing. +3. The critical path is A7 rANS — without it, the codec is academic. +4. The Lance column substrate identity (HG5) is the convergence + highway; both ndarray and lance-graph land there. +5. The sacred file is `lance-graph-contract/src/splat.rs`. Never + touch even if the bit patterns rhyme (D-STACK-7). 
From 242329853e61d08d0f2cc8bd3fb0259c2a9f2ac9 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 07:35:56 +0000 Subject: [PATCH 5/5] =?UTF-8?q?fix(codec):=20address=20PR-195=20follow-up?= =?UTF-8?q?=20review=20=E2=80=94=20bijective=20pack=5Fleaf=20+=20BASIN=5FN?= =?UTF-8?q?ONE=20distinctness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real bugs flagged by CodeRabbit's outside-diff review on b44fe59f. P1 — `pack_leaf` was non-bijective via unwrap_or fallbacks (mode.rs:194): Hand-constructing a `LeafCu { mode: Merge, merge_dir: None, .. }` serialized as `LeafCu::merge(.., MergeDir::North)` — silently rewriting malformed input into a different valid leaf and hiding upstream invariant breaks. Now uses `?` to short-circuit with None when required fields are missing for the mode (Merge needs merge_dir; Delta needs delta; Escape needs escape_idx). The `LeafCu::merge/delta/escape` constructors still enforce the invariants — only struct-literal bypass callers can hit the rejection. + 3 regression tests: leaf_pack_rejects_malformed_merge_without_dir, leaf_pack_rejects_malformed_delta_without_value, leaf_pack_rejects_malformed_escape_without_idx + pack_leaf docstring documents the bijective contract. P1 — `BASIN_NONE` collided with `MAX_BASIN_IDX` at 4095 (mode.rs:69): Both equaled `(1 << 12) - 1`, so basin 4095 was ambiguous — a real basin and the "no basin" sentinel had the same encoded value. Now: MAX_BASIN_IDX = 4094 (highest real basin), BASIN_NONE = 4095 (sentinel one slot above). Introduced private BASIN_FIELD_MASK = 0x0FFF for header packing/unpacking — independent of MAX_BASIN_IDX so BASIN_NONE still round-trips through the 12-bit field as a sentinel marker. pack_header/unpack_header switched from `& MAX_BASIN_IDX` (would wrongly clear bit 0 with 0xFFE) to `& BASIN_FIELD_MASK`. 
+ 2 regression tests: basin_none_distinct_from_max_basin_idx, header_round_trips_max_basin_idx_and_basin_none_distinctly + MAX_BASIN_IDX + BASIN_NONE doctests updated to assert the new relationship. Gates: cargo test --features codec --lib hpc::codec → 55 passed (+5) cargo test --features codec --doc hpc::codec → 15 passed cargo fmt --all -- --check → clean cargo clippy --features codec --lib -- -D warnings → clean --- src/hpc/codec/mode.rs | 132 +++++++++++++++++++++++++++++++++++------- 1 file changed, 112 insertions(+), 20 deletions(-) diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs index f748b7bd..4d756812 100644 --- a/src/hpc/codec/mode.rs +++ b/src/hpc/codec/mode.rs @@ -59,23 +59,41 @@ use super::ctu::{CellMode, LeafCu, MergeDir}; // Header pack / unpack (16-bit) // ════════════════════════════════════════════════════════════════════ -/// Maximum encodable `basin_idx`. Stored in the lower 12 bits of the -/// header; values >= this constant overflow the header field. +/// Maximum encodable real `basin_idx`. Equal to `(1 << 12) - 2 = 4094` +/// so that the all-ones 12-bit pattern (`0xFFF = 4095`) is reserved as +/// the [`BASIN_NONE`] sentinel — without that reservation, basin 4095 +/// would round-trip ambiguously with "no basin assigned". +/// +/// The on-wire 12-bit field still holds any value `0..=0xFFF`; only the +/// encoder's *valid-basin* range is restricted to `0..=MAX_BASIN_IDX`. +/// [`BASIN_NONE`] is encodable in the header field too (when an encoder +/// emits a "no basin" record), but it must never appear as a real basin +/// codebook index. 
/// /// ``` -/// use ndarray::hpc::codec::MAX_BASIN_IDX; -/// assert_eq!(MAX_BASIN_IDX, (1 << 12) - 1); +/// use ndarray::hpc::codec::{BASIN_NONE, MAX_BASIN_IDX}; +/// assert_eq!(MAX_BASIN_IDX, (1 << 12) - 2); +/// assert_eq!(MAX_BASIN_IDX, 4094); +/// assert!(MAX_BASIN_IDX < BASIN_NONE); /// ``` -pub const MAX_BASIN_IDX: u16 = (1 << 12) - 1; // 4095 +pub const MAX_BASIN_IDX: u16 = (1 << 12) - 2; // 4094 /// Tag inside the per-frame basin codebook for "no basin assigned" -/// (encoder-side sentinel during mode decision). +/// (encoder-side sentinel during mode decision). Equal to `0xFFF` +/// (the all-ones 12-bit pattern) so it sits one slot above the highest +/// real basin index ([`MAX_BASIN_IDX`]). /// /// ``` /// use ndarray::hpc::codec::{BASIN_NONE, MAX_BASIN_IDX}; -/// assert_eq!(BASIN_NONE, MAX_BASIN_IDX); +/// assert_eq!(BASIN_NONE, 4095); +/// assert_eq!(BASIN_NONE, MAX_BASIN_IDX + 1); /// ``` -pub const BASIN_NONE: u16 = MAX_BASIN_IDX; +pub const BASIN_NONE: u16 = (1 << 12) - 1; + +/// Private: 12-bit mask for the basin field of the packed header. +/// Independent of [`MAX_BASIN_IDX`] so that [`BASIN_NONE`] (which sits +/// in the 12-bit field but is not a real basin) still round-trips. +const BASIN_FIELD_MASK: u16 = 0x0FFF; /// Pack `(mode, basin_idx)` into a 16-bit header. 
/// @@ -91,7 +109,7 @@ pub const BASIN_NONE: u16 = MAX_BASIN_IDX; #[inline] pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 { let mode_bits = (mode as u16) & 0b11; - let basin_bits = basin_idx & MAX_BASIN_IDX; + let basin_bits = basin_idx & BASIN_FIELD_MASK; (mode_bits << 12) | basin_bits } @@ -108,7 +126,7 @@ pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 { #[inline] pub fn unpack_header(packed: u16) -> (CellMode, u16) { let mode_bits = ((packed >> 12) & 0b11) as u8; - let basin_idx = packed & MAX_BASIN_IDX; + let basin_idx = packed & BASIN_FIELD_MASK; let mode = match mode_bits { 0b00 => CellMode::Skip, 0b01 => CellMode::Merge, @@ -165,9 +183,15 @@ pub fn unpack_merge_dir(byte: u8) -> MergeDir { /// worst case) — callers iterating CTUs typically pre-allocate /// `6 * cell_count` and trim afterwards. /// -/// Returns `None` if `out.len() < packed_byte_len(leaf.mode)` (insufficient -/// capacity for the *mode's* width — Skip needs 2, Merge/Delta need 3, -/// Escape needs 6). +/// Returns `None` in two cases: +/// - `out.len() < packed_byte_len(leaf.mode)` (insufficient capacity for +/// the *mode's* width — Skip needs 2, Merge/Delta need 3, Escape needs 6). +/// - `leaf` is structurally malformed for its mode: `Merge` without a +/// `merge_dir`, `Delta` without a `delta`, or `Escape` without an +/// `escape_idx`. The `LeafCu::merge` / `delta` / `escape` constructors +/// enforce these invariants; only struct-literal callers bypassing the +/// constructors can hit this case. Pack is therefore bijective on the +/// well-formed `LeafCu` subset. /// /// Format: /// - Bytes 0-1: header (`pack_header(mode, basin_idx)`, LE) @@ -191,22 +215,24 @@ pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option { } let header = pack_header(leaf.mode, leaf.basin_idx); out[..2].copy_from_slice(&header.to_le_bytes()); + // Per-mode tail. `?` rejects malformed `LeafCu`s (e.g. a hand-built + // `LeafCu { mode: Merge, merge_dir: None, .. 
}`) with `None` rather + // than silently rewriting them into a different valid leaf. The + // `LeafCu::merge/delta/escape` constructors enforce the invariants; + // only struct-literal callers bypassing those constructors hit + // these short-circuits. let tail_len = match leaf.mode { CellMode::Skip => 0, CellMode::Merge => { - // Caller guarantees `merge_dir.is_some()` for `Merge` mode - // (LeafCu::merge constructor enforces this). Fall back to - // North if the invariant is violated, to keep encoder - // robustness — the decoder will still produce a valid leaf. - out[2] = pack_merge_dir(leaf.merge_dir.unwrap_or(MergeDir::North)); + out[2] = pack_merge_dir(leaf.merge_dir?); 1 } CellMode::Delta => { - out[2] = leaf.delta.unwrap_or(0); + out[2] = leaf.delta?; 1 } CellMode::Escape => { - let idx = leaf.escape_idx.unwrap_or(0); + let idx = leaf.escape_idx?; out[2..6].copy_from_slice(&idx.to_le_bytes()); 4 } @@ -372,6 +398,72 @@ mod tests { assert!(pack_leaf(&leaf, &mut buf).is_none()); } + #[test] + fn leaf_pack_rejects_malformed_merge_without_dir() { + // Bypass `LeafCu::merge` constructor and hand-build a leaf with + // mode = Merge but merge_dir = None. The previous unwrap_or(North) + // behavior would silently coerce this into a valid leaf — now we + // reject with None instead. 
+ let malformed = LeafCu { + mode: CellMode::Merge, + basin_idx: 10, + delta: None, + merge_dir: None, + escape_idx: None, + }; + let mut buf = [0u8; 6]; + assert!(pack_leaf(&malformed, &mut buf).is_none()); + } + + #[test] + fn leaf_pack_rejects_malformed_delta_without_value() { + let malformed = LeafCu { + mode: CellMode::Delta, + basin_idx: 10, + delta: None, + merge_dir: None, + escape_idx: None, + }; + let mut buf = [0u8; 6]; + assert!(pack_leaf(&malformed, &mut buf).is_none()); + } + + #[test] + fn leaf_pack_rejects_malformed_escape_without_idx() { + let malformed = LeafCu { + mode: CellMode::Escape, + basin_idx: 10, + delta: None, + merge_dir: None, + escape_idx: None, + }; + let mut buf = [0u8; 6]; + assert!(pack_leaf(&malformed, &mut buf).is_none()); + } + + #[test] + fn basin_none_distinct_from_max_basin_idx() { + // Regression for the BASIN_NONE/MAX_BASIN_IDX collision: the + // sentinel must sit one slot above the highest real basin so + // basin 4094 is unambiguously "a real basin" and 4095 is + // unambiguously "no basin assigned". + assert_eq!(MAX_BASIN_IDX, 4094); + assert_eq!(BASIN_NONE, 4095); + assert!(MAX_BASIN_IDX < BASIN_NONE); + } + + #[test] + fn header_round_trips_max_basin_idx_and_basin_none_distinctly() { + // Both values fit in the 12-bit field; the encoder treats them + // as different. (Decoders that route on BASIN_NONE need to + // compare against the sentinel explicitly.) + let real = pack_header(CellMode::Skip, MAX_BASIN_IDX); + let none = pack_header(CellMode::Skip, BASIN_NONE); + assert_ne!(real, none); + assert_eq!(unpack_header(real), (CellMode::Skip, MAX_BASIN_IDX)); + assert_eq!(unpack_header(none), (CellMode::Skip, BASIN_NONE)); + } + #[test] fn leaf_unpack_rejects_short_buffer() { // Header says Escape but only 2 bytes follow → not enough.