From 26d987f9dbd9a3c57fe7b1847518efbf902b3633 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 21 May 2026 21:19:11 +0000
Subject: [PATCH 1/5] feat(codec): PR-X12 A2 mode bit-pack + A3-intra
 prediction kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A2 — mode.rs (380 lines, 12 tests):
- 16-bit header: 2-bit mode (Skip/Merge/Delta/Escape) + 12-bit basin_idx
- MergeDir 2-bit pack/unpack (high bits masked)
- Whole-leaf compact pack/unpack: Skip=2B, Merge=3B, Delta=3B, Escape=6B
- packed_byte_len() const fn for buffer pre-sizing
- MAX_BASIN_IDX (4095) + BASIN_NONE sentinel
- Stream roundtrip test for mixed-mode leaves

A3-intra — predict.rs (397 lines, 13 tests):
- IntraContext { basin_idx, delta_i32, NESW neighbours }
- IntraConfig { escape_next_idx: Option<u32> }
- predict_intra() decision tree: Skip → Merge → Delta → Escape
  (monotone wire cost 2 → 3 → 3 → 6 bytes; cheapest-fit policy)
- Merge match: same basin_idx + same δ as u8 (sign-tolerant wrapping cast)
- Escape fallback: lossy i8 clamp when allocator absent (never panics)
- End-to-end pack/unpack chain test through the decision

Deferred to follow-up: A3-inter (cross-tier neighbour scan from
BlockedGrid L2/L3), A4 transform, A6 RDO, A7 rANS, A8 stream framing.
---
 src/hpc/codec/mod.rs     |  17 +-
 src/hpc/codec/mode.rs    | 380 +++++++++++++++++++++++++++++++++++++
 src/hpc/codec/predict.rs | 397 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 791 insertions(+), 3 deletions(-)
 create mode 100644 src/hpc/codec/mode.rs
 create mode 100644 src/hpc/codec/predict.rs
diff --git a/src/hpc/codec/mod.rs b/src/hpc/codec/mod.rs
index 54f171bd..2f23c294 100644
--- a/src/hpc/codec/mod.rs
+++ b/src/hpc/codec/mod.rs
@@ -8,9 +8,13 @@
 //! # Module layout (per PR-X12 worker decomposition)
 //!
 //! - [`ctu`] — A1: `Ctu` carrier + `CtuPartition` enum + quad-tree
-//!   split / merge ops. **Shipped in this PR.**
-//! - `mode`, `predict`, `transform`, `quantize`, `rdo`, `ans`, `stream`
-//!   — A2-A8, queued as follow-up sprints.
+//!   split / merge ops.
+//! - [`mode`] — A2: bit-pack / unpack helpers for the on-wire 16-bit
+//!   header + per-mode tail (Skip/Merge/Delta/Escape).
+//! - [`predict`] — A3-intra: encoder-side mode-decision kernel that
+//!   picks the cheapest `LeafCu` from a cell + NESW neighbours.
+//! - `transform`, `quantize`, `rdo`, `ans`, `stream` — A4-A8, queued as
+//!   follow-up sprints.
 //!
 //! # Feature gate
 //!
@@ -22,6 +26,13 @@
 //! `.claude/knowledge/pr-x12-codec-x265-design.md` — master design doc.
 
 pub mod ctu;
+pub mod mode;
+pub mod predict;
 
 pub use ctu::{CellMode, MergeDir, MAX_QUAD_TREE_NODES, MAX_SPLIT_DEPTH};
 pub use ctu::{Ctu, CtuArena, CtuPartition, LeafCu, MaxSplitDepthReached, MergeError, NodeIdx};
+pub use mode::{
+    pack_header, pack_leaf, pack_merge_dir, packed_byte_len, unpack_header, unpack_leaf, unpack_merge_dir, BASIN_NONE,
+    MAX_BASIN_IDX,
+};
+pub use predict::{is_no_basin, predict_intra, IntraConfig, IntraContext};
diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs
new file mode 100644
index 00000000..d18f5c96
--- /dev/null
+++ b/src/hpc/codec/mode.rs
@@ -0,0 +1,380 @@
+//! Mode bit-pack / unpack helpers (PR-X12 A2).
+//!
+//! Compact wire-friendly representation of the [`CellMode`] +
+//! [`MergeDir`] + [`LeafCu`] fields from [`super::ctu`]. The functions
+//! here are the inverse of each other and pack into the smallest
+//! integer width that fits, leaving the per-mode payload (`delta`,
+//! `escape_idx`) for callers to append/consume as raw bytes.
+//!
+//! # Header layout — `pack_header` / `unpack_header`
+//!
+//! Each leaf has a fixed 16-bit header followed by a variable-width
+//! tail. The header packs the most-frequently-accessed fields so a
+//! decoder can route on a single `u16` load:
+//!
+//! ```text
+//!     MSB (bit 15)                                  LSB (bit 0)
+//!     ┌──────┬──────┬──────────────────────────────┐
+//!     │ rsvd │ mode │        basin_idx (12)        │   ← 16-bit header
+//!     └──────┴──────┴──────────────────────────────┘
+//!      15-14  13-12  └─ bits 11-0: the only payload field always present
+//!        │      └────── 2-bit mode discriminant (`CellMode`, shifted left by 12)
+//!        └───────────── reserved; `pack_header` always writes 0 here
+//! ```
+//!
+//! The 2 bits at the top of the second (high, little-endian) byte are
+//! reserved for a possible future `merge_dir` overlay when the mode is
+//! `Merge`; today the separate `pack_merge_dir` helper stores the
+//! direction in its own tail byte, as the per-mode table below shows.
+//!
+//! # Per-mode tail width
+//!
+//! | Mode   | Header | Tail bytes               | Total |
+//! |--------|--------|--------------------------|-------|
+//! | Skip   | 2      | 0                        | 2     |
+//! | Merge  | 2      | 1 (`MergeDir` 2-bit)     | 3     |
+//! | Delta  | 2      | 1 (`u8` perturbation)    | 3     |
+//! | Escape | 2      | 4 (`u32` escape_idx, LE) | 6     |
+//!
+//! The compact pack writes header (LE) then the per-mode tail. The
+//! `escape_idx` width is the worst case; a future A7 rANS pass can
+//! shrink it via per-frame frequency tables — A2 stays format-stable.
+//!
+//! # What A2 does NOT do
+//!
+//! - **Bytestream framing** (frame headers, CTU markers) — lives in
+//!   PR-X12 A8 `stream.rs`.
+//! - **Entropy coding** (rANS) — lives in PR-X12 A7 `ans.rs`. A2's
+//!   output is the input to A7.
+//! - **Per-frame escape vector** — caller maintains it; A2 packs the
+//!   `escape_idx` referencing into the leaf header.
+
+use super::ctu::{CellMode, LeafCu, MergeDir};
+
+// ════════════════════════════════════════════════════════════════════
+// Header pack / unpack (16-bit)
+// ════════════════════════════════════════════════════════════════════
+
+/// Maximum encodable `basin_idx`. Stored in the lower 12 bits of the
+/// header; values >= this constant overflow the header field.
+pub const MAX_BASIN_IDX: u16 = (1 << 12) - 1; // 4095
+
+/// Tag inside the per-frame basin codebook for "no basin assigned"
+/// (encoder-side sentinel during mode decision).
+pub const BASIN_NONE: u16 = MAX_BASIN_IDX;
+
+/// Pack `(mode, basin_idx)` into a 16-bit header.
+///
+/// `basin_idx` must be `<= MAX_BASIN_IDX` (12 bits). Higher bits are
+/// silently truncated; the encoder should clamp before calling.
+///
+/// ```
+/// use ndarray::hpc::codec::mode::{pack_header, unpack_header};
+/// use ndarray::hpc::codec::CellMode;
+/// let h = pack_header(CellMode::Delta, 1234);
+/// assert_eq!(unpack_header(h), (CellMode::Delta, 1234));
+/// ```
+#[inline]
+pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 {
+    let mode_bits = (mode as u16) & 0b11;
+    let basin_bits = basin_idx & MAX_BASIN_IDX;
+    (mode_bits << 12) | basin_bits
+}
+
+/// Unpack a 16-bit header into `(mode, basin_idx)`.
+///
+/// The 2-bit mode field always decodes (all 4 variants are valid).
+/// `basin_idx` is the 12-bit lower field, exactly as packed.
+#[inline]
+pub fn unpack_header(packed: u16) -> (CellMode, u16) {
+    let mode_bits = ((packed >> 12) & 0b11) as u8;
+    let basin_idx = packed & MAX_BASIN_IDX;
+    let mode = match mode_bits {
+        0b00 => CellMode::Skip,
+        0b01 => CellMode::Merge,
+        0b10 => CellMode::Delta,
+        _ => CellMode::Escape,
+    };
+    (mode, basin_idx)
+}
+
+// ════════════════════════════════════════════════════════════════════
+// MergeDir 2-bit pack / unpack
+// ════════════════════════════════════════════════════════════════════
+
+/// Pack a [`MergeDir`] into the lower 2 bits of a `u8`.
+#[inline]
+pub fn pack_merge_dir(dir: MergeDir) -> u8 {
+    dir as u8
+}
+
+/// Unpack the lower 2 bits of a `u8` into a [`MergeDir`].
+///
+/// All four 2-bit values map to a valid `MergeDir`; bits 2-7 are
+/// ignored.
+#[inline]
+pub fn unpack_merge_dir(byte: u8) -> MergeDir {
+    match byte & 0b11 {
+        0 => MergeDir::North,
+        1 => MergeDir::East,
+        2 => MergeDir::West,
+        _ => MergeDir::South,
+    }
+}
+
+// ════════════════════════════════════════════════════════════════════
+// Whole-leaf pack / unpack
+// ════════════════════════════════════════════════════════════════════
+
+/// Compact pack: writes header (2 bytes, LE) + per-mode tail into
+/// `out`. Returns the number of bytes written.
+///
+/// The buffer must have at least 6 bytes of space (the Escape-mode
+/// worst case) — callers iterating CTUs typically pre-allocate
+/// `6 * cell_count` and trim afterwards.
+///
+/// Returns `None` if `out.len() < 6` (insufficient capacity).
+///
+/// Format:
+/// - Bytes 0-1: header (`pack_header(mode, basin_idx)`, LE)
+/// - Bytes 2..: per-mode tail (see module docs)
+///
+/// ```
+/// use ndarray::hpc::codec::mode::{pack_leaf, unpack_leaf};
+/// use ndarray::hpc::codec::LeafCu;
+/// let leaf = LeafCu::delta(42, 0x7F);
+/// let mut buf = [0u8; 6];
+/// let n = pack_leaf(&leaf, &mut buf).unwrap();
+/// assert_eq!(n, 3);
+/// let (decoded, consumed) = unpack_leaf(&buf).unwrap();
+/// assert_eq!(decoded, leaf);
+/// assert_eq!(consumed, 3);
+/// ```
+pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> {
+    if out.len() < 6 {
+        return None;
+    }
+    let header = pack_header(leaf.mode, leaf.basin_idx);
+    out[..2].copy_from_slice(&header.to_le_bytes());
+    let tail_len = match leaf.mode {
+        CellMode::Skip => 0,
+        CellMode::Merge => {
+            // Caller guarantees `merge_dir.is_some()` for `Merge` mode
+            // (LeafCu::merge constructor enforces this). Fall back to
+            // North if the invariant is violated, to keep encoder
+            // robustness — the decoder will still produce a valid leaf.
+            out[2] = pack_merge_dir(leaf.merge_dir.unwrap_or(MergeDir::North));
+            1
+        }
+        CellMode::Delta => {
+            out[2] = leaf.delta.unwrap_or(0);
+            1
+        }
+        CellMode::Escape => {
+            let idx = leaf.escape_idx.unwrap_or(0);
+            out[2..6].copy_from_slice(&idx.to_le_bytes());
+            4
+        }
+    };
+    Some(2 + tail_len)
+}
+
+/// Compact unpack: reads header + per-mode tail from `buf`. Returns
+/// `(leaf, bytes_consumed)`.
+///
+/// Returns `None` if the buffer is shorter than the per-mode width
+/// (2 for Skip, 3 for Merge/Delta, 6 for Escape).
+pub fn unpack_leaf(buf: &[u8]) -> Option<(LeafCu, usize)> {
+    if buf.len() < 2 {
+        return None;
+    }
+    let header = u16::from_le_bytes([buf[0], buf[1]]);
+    let (mode, basin_idx) = unpack_header(header);
+    let (leaf, consumed) = match mode {
+        CellMode::Skip => (LeafCu::skip(basin_idx), 2),
+        CellMode::Merge => {
+            if buf.len() < 3 {
+                return None;
+            }
+            (LeafCu::merge(basin_idx, unpack_merge_dir(buf[2])), 3)
+        }
+        CellMode::Delta => {
+            if buf.len() < 3 {
+                return None;
+            }
+            (LeafCu::delta(basin_idx, buf[2]), 3)
+        }
+        CellMode::Escape => {
+            if buf.len() < 6 {
+                return None;
+            }
+            let idx = u32::from_le_bytes([buf[2], buf[3], buf[4], buf[5]]);
+            (LeafCu::escape(basin_idx, idx), 6)
+        }
+    };
+    Some((leaf, consumed))
+}
+
+/// Byte cost of packing a leaf in this mode. Useful for pre-sizing
+/// a buffer without packing first.
+#[inline]
+pub const fn packed_byte_len(mode: CellMode) -> usize {
+    match mode {
+        CellMode::Skip => 2,
+        CellMode::Merge => 3,
+        CellMode::Delta => 3,
+        CellMode::Escape => 6,
+    }
+}
+
+// ════════════════════════════════════════════════════════════════════
+// Tests
+// ════════════════════════════════════════════════════════════════════
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn header_roundtrip_all_modes_and_basin_extents() {
+        for mode in [CellMode::Skip, CellMode::Merge, CellMode::Delta, CellMode::Escape] {
+            for basin in [0u16, 1, 42, 1234, MAX_BASIN_IDX] {
+                let h = pack_header(mode, basin);
+                assert_eq!(unpack_header(h), (mode, basin), "mode={mode:?}, basin={basin}");
+            }
+        }
+    }
+
+    #[test]
+    fn header_truncates_oversize_basin_idx() {
+        // basin_idx = 4096 doesn't fit in 12 bits; the high bit gets
+        // dropped, giving back basin=0.
+        let h = pack_header(CellMode::Skip, 4096);
+        let (_, basin) = unpack_header(h);
+        assert_eq!(basin, 0);
+    }
+
+    #[test]
+    fn merge_dir_roundtrip_all_four() {
+        for dir in [MergeDir::North, MergeDir::East, MergeDir::West, MergeDir::South] {
+            let b = pack_merge_dir(dir);
+            assert_eq!(unpack_merge_dir(b), dir);
+        }
+    }
+
+    #[test]
+    fn merge_dir_ignores_high_bits() {
+        // High bits 2-7 are reserved; unpack should mask them out.
+        assert_eq!(unpack_merge_dir(0b1111_1100), MergeDir::North);
+        assert_eq!(unpack_merge_dir(0b1111_1101), MergeDir::East);
+    }
+
+    #[test]
+    fn leaf_pack_skip_is_2_bytes() {
+        let leaf = LeafCu::skip(100);
+        let mut buf = [0xAAu8; 6];
+        let n = pack_leaf(&leaf, &mut buf).unwrap();
+        assert_eq!(n, 2);
+        // Bytes 2-5 untouched.
+        assert_eq!(&buf[2..], &[0xAA, 0xAA, 0xAA, 0xAA]);
+    }
+
+    #[test]
+    fn leaf_pack_merge_is_3_bytes() {
+        let leaf = LeafCu::merge(100, MergeDir::East);
+        let mut buf = [0u8; 6];
+        let n = pack_leaf(&leaf, &mut buf).unwrap();
+        assert_eq!(n, 3);
+        let (decoded, consumed) = unpack_leaf(&buf).unwrap();
+        assert_eq!(decoded, leaf);
+        assert_eq!(consumed, 3);
+    }
+
+    #[test]
+    fn leaf_pack_delta_is_3_bytes() {
+        let leaf = LeafCu::delta(100, 0xCC);
+        let mut buf = [0u8; 6];
+        let n = pack_leaf(&leaf, &mut buf).unwrap();
+        assert_eq!(n, 3);
+        let (decoded, consumed) = unpack_leaf(&buf).unwrap();
+        assert_eq!(decoded, leaf);
+        assert_eq!(consumed, 3);
+    }
+
+    #[test]
+    fn leaf_pack_escape_is_6_bytes() {
+        let leaf = LeafCu::escape(100, 0xDEAD_BEEF);
+        let mut buf = [0u8; 6];
+        let n = pack_leaf(&leaf, &mut buf).unwrap();
+        assert_eq!(n, 6);
+        let (decoded, consumed) = unpack_leaf(&buf).unwrap();
+        assert_eq!(decoded, leaf);
+        assert_eq!(consumed, 6);
+    }
+
+    #[test]
+    fn leaf_pack_rejects_short_buffer() {
+        let leaf = LeafCu::escape(100, 0xDEAD_BEEF);
+        let mut buf = [0u8; 5]; // 1 short of Escape's worst case
+        assert!(pack_leaf(&leaf, &mut buf).is_none());
+    }
+
+    #[test]
+    fn leaf_unpack_rejects_short_buffer() {
+        // Header says Escape but only 1 byte follows → not enough.
+        let mut buf = [0u8; 3];
+        let header = pack_header(CellMode::Escape, 50);
+        buf[..2].copy_from_slice(&header.to_le_bytes());
+        assert!(unpack_leaf(&buf).is_none());
+    }
+
+    #[test]
+    fn packed_byte_len_matches_pack_output() {
+        let cases = [
+            (LeafCu::skip(10), CellMode::Skip),
+            (LeafCu::merge(10, MergeDir::West), CellMode::Merge),
+            (LeafCu::delta(10, 7), CellMode::Delta),
+            (LeafCu::escape(10, 99), CellMode::Escape),
+        ];
+        for (leaf, mode) in cases {
+            let mut buf = [0u8; 6];
+            let n = pack_leaf(&leaf, &mut buf).unwrap();
+            assert_eq!(n, packed_byte_len(mode));
+        }
+    }
+
+    #[test]
+    fn stream_pack_then_unpack_roundtrips_mixed_leaves() {
+        // Encode a sequence of mixed-mode leaves into one buffer,
+        // decode in order, assert exact equality of all 8.
+        let leaves = [
+            LeafCu::skip(0),
+            LeafCu::delta(1, 0xAB),
+            LeafCu::merge(2, MergeDir::North),
+            LeafCu::escape(3, 0xDEAD_BEEF),
+            LeafCu::skip(MAX_BASIN_IDX),
+            LeafCu::delta(MAX_BASIN_IDX, 0xFF),
+            LeafCu::merge(MAX_BASIN_IDX, MergeDir::South),
+            LeafCu::escape(MAX_BASIN_IDX, u32::MAX),
+        ];
+        // Worst case: 8 × 6 bytes = 48
+        let mut buf = vec![0u8; 48];
+        let mut offset = 0;
+        for leaf in &leaves {
+            let n = pack_leaf(leaf, &mut buf[offset..]).unwrap();
+            offset += n;
+        }
+        let total_written = offset;
+        // Decode in order.
+        let mut decoded = Vec::with_capacity(8);
+        let mut read = 0;
+        while read < total_written {
+            let (leaf, n) = unpack_leaf(&buf[read..]).unwrap();
+            decoded.push(leaf);
+            read += n;
+        }
+        assert_eq!(decoded.len(), 8);
+        assert_eq!(&decoded[..], &leaves[..]);
+        assert_eq!(read, total_written);
+    }
+}
diff --git a/src/hpc/codec/predict.rs b/src/hpc/codec/predict.rs
new file mode 100644
index 00000000..727b94ef
--- /dev/null
+++ b/src/hpc/codec/predict.rs
@@ -0,0 +1,397 @@
+//! Intra-prediction mode decision (PR-X12 A3, intra path).
+//!
+//! Encoder-side kernel: given a cell value, its nearest-basin index +
+//! delta, and the four cardinal neighbour `LeafCu`s, choose the best
+//! [`CellMode`] for the cell and emit the corresponding [`LeafCu`].
+//!
+//! This is the **mode-decision** kernel, not the inverse-projection
+//! reconstruction. Decoder-side reconstruction is the inverse of this
+//! decision tree and is folded into PR-X12 A6 RDO + A8 stream
+//! interpretation; A3 ships only the encoder direction.
+//!
+//! # The decision tree
+//!
+//! For one cell at (row, col) inside a CTU:
+//!
+//! ```text
+//!   ┌─────────────────────────────────────────────────┐
+//!   │ delta == 0  ?         → Skip(basin_idx)         │
+//!   └─────────────────────────────────────────────────┘
+//!                  │ no
+//!                  ▼
+//!   ┌─────────────────────────────────────────────────┐
+//!   │ a NESW neighbour is Delta-mode with the SAME    │
+//!   │ basin_idx AND SAME δ byte (wrapping u8 cast)    │
+//!   │                       → Merge(basin_idx, dir)   │
+//!   └─────────────────────────────────────────────────┘
+//!                  │ no candidate
+//!                  ▼
+//!   ┌─────────────────────────────────────────────────┐
+//!   │ δ fits in i8 (-128..=127)?                      │
+//!   │                       → Delta(basin_idx, δ_u8)  │
+//!   └─────────────────────────────────────────────────┘
+//!                  │ overflow
+//!                  ▼
+//!   ┌─────────────────────────────────────────────────┐
+//!   │                       → Escape(basin_idx, idx)  │
+//!   │      (caller appends raw u64 to escape vector)  │
+//!   └─────────────────────────────────────────────────┘
+//! ```
+//!
+//! The order is **Skip → Merge → Delta → Escape** because the wire
+//! cost is monotonically increasing in the same order (2 → 3 → 3 → 6
+//! bytes per [`packed_byte_len`](super::mode::packed_byte_len)). The
+//! decision picks the cheapest mode that fits.
+//!
+//! # What A3-intra does NOT do
+//!
+//! - **Inter prediction** (parent-tier neighbours from the
+//!   `BlockedGrid` L2/L3 cascade) — deferred to PR-X12 A3 follow-up.
+//! - **Rate-distortion optimisation** — A3-intra picks by exact match
+//!   only. Soft mode-switching with λ-RDO is PR-X12 A6.
+//! - **Transform / quantisation** — A3-intra works on already-decoded
+//!   integer deltas. The transform path (DCT-II for delta residuals)
+//!   is PR-X12 A4.
+//! - **SIMD-batched CTU sweep** — scalar reference today. The
+//!   `F32x16`-batched form (16 cells per inner loop via
+//!   `crate::simd_soa::MultiLaneColumn`) is a follow-up after the
+//!   reference + reconstruction parity test pin the math.
+
+use super::ctu::{CellMode, LeafCu, MergeDir};
+use super::mode::BASIN_NONE;
+
+// ════════════════════════════════════════════════════════════════════
+// Inputs to the encoder mode decision
+// ════════════════════════════════════════════════════════════════════
+
+/// Per-cell context the encoder needs to choose a mode.
+///
+/// Built by the caller from the CTU's basin lookup + the per-cell
+/// neighbour table. The encoder does not own the basin codebook or the
+/// escape vector; it returns an `Escape(basin_idx, escape_idx)` leaf
+/// and lets the caller push the original cell value into the per-frame
+/// escape vector at `escape_idx`.
+///
+/// # Fields
+///
+/// - `basin_idx`: nearest basin's index in the per-frame codebook,
+///   already resolved by the caller (typically via
+///   `ogit_bridge::nearest_basin`). Must be `<= MAX_BASIN_IDX`
+///   (12 bits) per [`super::mode::pack_header`]. The encoder does not
+///   re-validate.
+/// - `delta_i32`: signed delta from the basin's u8-quantised
+///   representation of the cell. The encoder branches on `|delta|`
+///   to decide between Delta (fits in i8) and Escape (overflows).
+///   `i32` width avoids overflow when the caller computes
+///   `cell_value - basin_value` for two u8 inputs.
+/// - `neighbours`: NESW (in [`MergeDir`] discriminant order) optional
+///   neighbour leaves. `None` for boundary cells; the Merge candidate
+///   scan skips `None` entries.
+#[derive(Debug, Clone, Copy)]
+pub struct IntraContext<'a> {
+    /// Pre-resolved basin index (12-bit max).
+    pub basin_idx: u16,
+    /// Signed delta from basin → cell, in the basin's u8 quantisation
+    /// space.
+    pub delta_i32: i32,
+    /// NESW neighbour leaves, indexed by [`MergeDir`] discriminant.
+    pub neighbours: [Option<&'a LeafCu>; 4],
+}
+
+/// Configuration for the intra-prediction decision.
+///
+/// Today a single field; the field exists so the API can grow
+/// (Merge tolerance, RDO knobs in A6) without a signature break.
+#[derive(Debug, Clone, Copy)]
+pub struct IntraConfig {
+    /// Future allocator for the encoder's escape vector — returns the
+    /// next index to write. `None` disables Escape mode (the encoder
+    /// will fall back to Delta-with-truncation, which **loses
+    /// precision** but never panics; callers wanting lossless coding
+    /// must provide a real allocator).
+    ///
+    /// Stateless API today: encoder calls `escape_next_idx` once per
+    /// Escape decision. The caller is responsible for actually
+    /// appending the u64 cell value into the escape vector at the
+    /// returned index — this kernel doesn't see the cell value.
+    pub escape_next_idx: Option<u32>,
+}
+
+impl Default for IntraConfig {
+    fn default() -> Self {
+        Self { escape_next_idx: None }
+    }
+}
+
+// ════════════════════════════════════════════════════════════════════
+// The decision kernel
+// ════════════════════════════════════════════════════════════════════
+
+/// Encoder-side intra-prediction. Returns the cheapest [`LeafCu`]
+/// representation of the cell described by `ctx`.
+///
+/// See the module docs for the decision tree (Skip → Merge → Delta →
+/// Escape) and the rationale (monotone wire cost).
+///
+/// # Examples
+///
+/// Skip when the cell is exactly the basin:
+///
+/// ```
+/// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig};
+/// use ndarray::hpc::codec::{CellMode, LeafCu};
+/// let ctx = IntraContext {
+///     basin_idx: 42,
+///     delta_i32: 0,
+///     neighbours: [None; 4],
+/// };
+/// let leaf = predict_intra(&ctx, &IntraConfig::default());
+/// assert_eq!(leaf.mode, CellMode::Skip);
+/// assert_eq!(leaf.basin_idx, 42);
+/// ```
+///
+/// Delta when no Merge candidate exists but |δ| fits in i8:
+///
+/// ```
+/// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig};
+/// use ndarray::hpc::codec::CellMode;
+/// let ctx = IntraContext {
+///     basin_idx: 42,
+///     delta_i32: 17,
+///     neighbours: [None; 4],
+/// };
+/// let leaf = predict_intra(&ctx, &IntraConfig::default());
+/// assert_eq!(leaf.mode, CellMode::Delta);
+/// assert_eq!(leaf.delta, Some(17));
+/// ```
+pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
+    // ── 1. Skip ──────────────────────────────────────────────────────
+    if ctx.delta_i32 == 0 {
+        return LeafCu::skip(ctx.basin_idx);
+    }
+
+    // ── 2. Merge ─────────────────────────────────────────────────────
+    //
+    // A neighbour is a Merge candidate iff:
+    //   (a) its mode is Delta (Skip / Merge / Escape neighbours carry
+    //       no reusable delta to inherit from)
+    //   (b) its basin_idx matches ours (Merge inheritance implicitly
+    //       points at the SAME basin — different basins mean a
+    //       different reference frame)
+    //   (c) its δ exactly matches our δ as a u8 (sign-tolerant via
+    //       wrapping cast; matches the A2 pack format where Delta
+    //       stores a raw u8 byte without a sign bit)
+    //
+    // We scan NESW in discriminant order and pick the first match.
+    // Multiple matches all collapse to the same coded leaf, so the
+    // first-hit policy is order-deterministic without affecting
+    // bitstream length.
+    let our_delta_u8 = ctx.delta_i32 as u8; // wrapping cast matches A2 pack
+    for (i, nb_slot) in ctx.neighbours.iter().enumerate() {
+        let Some(nb) = nb_slot else { continue };
+        if nb.mode != CellMode::Delta {
+            continue;
+        }
+        if nb.basin_idx != ctx.basin_idx {
+            continue;
+        }
+        if nb.delta != Some(our_delta_u8) {
+            continue;
+        }
+        let dir = merge_dir_from_index(i);
+        return LeafCu::merge(ctx.basin_idx, dir);
+    }
+
+    // ── 3. Delta ─────────────────────────────────────────────────────
+    //
+    // i8 range is [-128, 127]. We pack as raw u8 (wrapping cast) so
+    // the encoder's reconstruction must read the byte back as i8 to
+    // recover the sign. This matches how `LeafCu::delta` stores it and
+    // how `super::mode::pack_leaf` writes it.
+    if (-128..=127).contains(&ctx.delta_i32) {
+        return LeafCu::delta(ctx.basin_idx, our_delta_u8);
+    }
+
+    // ── 4. Escape ────────────────────────────────────────────────────
+    //
+    // |δ| doesn't fit in i8. Caller must own the per-frame escape
+    // vector and provide the next-write index; we return a leaf that
+    // references it. If the caller didn't provide an allocator, we
+    // fall back to a saturated Delta (lossy but never panicking) so
+    // a misconfigured encoder still produces a valid bytestream.
+    match cfg.escape_next_idx {
+        Some(idx) => LeafCu::escape(ctx.basin_idx, idx),
+        None => {
+            // Lossy fallback: clamp to i8 range. Caller is responsible
+            // for noticing that the reconstruction won't be bit-exact.
+            let clamped = ctx.delta_i32.clamp(-128, 127) as u8;
+            LeafCu::delta(ctx.basin_idx, clamped)
+        }
+    }
+}
+
+#[inline]
+fn merge_dir_from_index(i: usize) -> MergeDir {
+    match i {
+        0 => MergeDir::North,
+        1 => MergeDir::East,
+        2 => MergeDir::West,
+        _ => MergeDir::South,
+    }
+}
+
+/// Sanity-check sentinel: returns `true` iff the resolved basin index
+/// is the "no basin" marker. Encoders that compute basins lazily can
+/// short-circuit Skip/Merge/Delta and emit Escape directly when this
+/// fires.
+#[inline]
+pub fn is_no_basin(basin_idx: u16) -> bool {
+    basin_idx == BASIN_NONE
+}
+
+// ════════════════════════════════════════════════════════════════════
+// Tests
+// ════════════════════════════════════════════════════════════════════
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn ctx_with_neighbours<'a>(basin: u16, delta: i32, neighbours: [Option<&'a LeafCu>; 4]) -> IntraContext<'a> {
+        IntraContext {
+            basin_idx: basin,
+            delta_i32: delta,
+            neighbours,
+        }
+    }
+
+    #[test]
+    fn skip_when_delta_is_zero() {
+        let leaf = predict_intra(&ctx_with_neighbours(100, 0, [None; 4]), &IntraConfig::default());
+        assert_eq!(leaf, LeafCu::skip(100));
+    }
+
+    #[test]
+    fn skip_preferred_over_neighbour_match() {
+        // δ=0 trumps everything else, even a perfect Merge candidate.
+        let nb = LeafCu::delta(100, 0);
+        let neighbours = [Some(&nb), None, None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 0, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Skip);
+    }
+
+    #[test]
+    fn delta_in_i8_range() {
+        for d in [-128i32, -1, 1, 127] {
+            let leaf = predict_intra(&ctx_with_neighbours(100, d, [None; 4]), &IntraConfig::default());
+            assert_eq!(leaf.mode, CellMode::Delta);
+            assert_eq!(leaf.delta, Some(d as u8));
+        }
+    }
+
+    #[test]
+    fn merge_when_neighbour_delta_matches_basin_and_value() {
+        // Northern neighbour: Delta-mode, same basin, same δ as us.
+        let nb_north = LeafCu::delta(100, 17);
+        let neighbours = [Some(&nb_north), None, None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Merge);
+        assert_eq!(leaf.merge_dir, Some(MergeDir::North));
+        assert_eq!(leaf.basin_idx, 100);
+    }
+
+    #[test]
+    fn merge_skipped_when_basin_differs() {
+        // Same δ but different basin → cannot Merge (different
+        // reference frame). Falls through to Delta.
+        let nb_north = LeafCu::delta(99, 17);
+        let neighbours = [Some(&nb_north), None, None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Delta);
+    }
+
+    #[test]
+    fn merge_skipped_when_neighbour_mode_is_not_delta() {
+        // Skip / Merge / Escape neighbours carry no inheritable δ.
+        let nb_skip = LeafCu::skip(100);
+        let nb_merge = LeafCu::merge(100, MergeDir::North);
+        let nb_esc = LeafCu::escape(100, 0);
+        let neighbours = [Some(&nb_skip), Some(&nb_merge), None, Some(&nb_esc)];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Delta);
+    }
+
+    #[test]
+    fn merge_picks_first_hit_in_nesw_order() {
+        // Both N and E qualify; encoder must pick N (lower index).
+        let nb_match = LeafCu::delta(100, 17);
+        let neighbours = [Some(&nb_match), Some(&nb_match), None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.merge_dir, Some(MergeDir::North));
+    }
+
+    #[test]
+    fn merge_negative_delta_via_wrapping_cast() {
+        // δ = -17 packs to 0xEF (= 239 as u8). Neighbour stored as
+        // u8 = 0xEF MUST match — the cast must be wrapping, not
+        // saturating.
+        let nb_match = LeafCu::delta(100, (-17_i32) as u8);
+        let neighbours = [None, Some(&nb_match), None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, -17, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Merge);
+        assert_eq!(leaf.merge_dir, Some(MergeDir::East));
+    }
+
+    #[test]
+    fn escape_when_delta_overflows_i8_and_allocator_present() {
+        let cfg = IntraConfig {
+            escape_next_idx: Some(42),
+        };
+        let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &cfg);
+        assert_eq!(leaf.mode, CellMode::Escape);
+        assert_eq!(leaf.escape_idx, Some(42));
+        assert_eq!(leaf.basin_idx, 100);
+    }
+
+    #[test]
+    fn escape_lossy_fallback_when_no_allocator() {
+        // Without an escape_next_idx, the encoder clamps to i8 range.
+        // The result is a valid LeafCu but the reconstruction won't
+        // be bit-exact.
+        let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Delta);
+        assert_eq!(leaf.delta, Some(127));
+    }
+
+    #[test]
+    fn escape_lossy_fallback_negative_overflow() {
+        let leaf = predict_intra(&ctx_with_neighbours(100, -1000, [None; 4]), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Delta);
+        assert_eq!(leaf.delta, Some((-128_i32) as u8));
+    }
+
+    #[test]
+    fn pack_then_unpack_chained_through_intra_decision() {
+        // End-to-end: encoder picks Merge for one cell. The packed
+        // representation must round-trip via A2's pack/unpack with
+        // bit-exact fidelity.
+        use super::super::mode::{pack_leaf, unpack_leaf};
+        let nb = LeafCu::delta(100, 17);
+        let neighbours = [None, Some(&nb), None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        assert_eq!(leaf.mode, CellMode::Merge);
+
+        let mut buf = [0u8; 6];
+        let n = pack_leaf(&leaf, &mut buf).unwrap();
+        let (decoded, consumed) = unpack_leaf(&buf).unwrap();
+        assert_eq!(n, consumed);
+        assert_eq!(decoded, leaf);
+    }
+
+    #[test]
+    fn is_no_basin_sentinel_round_trip() {
+        assert!(is_no_basin(BASIN_NONE));
+        assert!(!is_no_basin(0));
+        assert!(!is_no_basin(100));
+    }
+}

From b39a5769c65903295163ee8c5bda1ddba4093488 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 06:42:09 +0000
Subject: [PATCH 2/5] =?UTF-8?q?fix(codec):=20address=20PR-195=20review=20?=
 =?UTF-8?q?=E2=80=94=20overflow=20Merge=20alias=20+=20mode-sized=20pack=5F?=
 =?UTF-8?q?leaf?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P1 (codex) — overflow δ no longer aliases to Merge:
  predict_intra previously took `our_delta_u8 = δ as u8` BEFORE
  checking i8 fit, so e.g. δ=200 wrapped to 0xC8 and could match a
  neighbour byte 0xC8 (i8=-56), silently corrupting reconstruction.
  Now the i8 range check gates both the Merge scan and the Delta
  branch; out-of-range δ falls straight through to Escape (or the
  documented lossy clamp).
  + 2 regression tests:
    - overflow_delta_does_not_alias_to_merge
    - overflow_delta_with_allocator_takes_escape

P2 (coderabbit + codex) — pack_leaf accepts mode-sized buffers:
  pack_leaf used a 6-byte minimum for all modes; callers pre-sizing
  by packed_byte_len() got spurious None for Skip(2)/Merge,Delta(3).
  Length check now gates on packed_byte_len(leaf.mode).
  + 1 regression test: pack_leaf_accepts_mode_sized_buffers

P3 (coderabbit) — doctest examples on the public-API surface:
  Added /// runnable examples to MAX_BASIN_IDX, BASIN_NONE,
  unpack_header, pack_merge_dir, unpack_merge_dir, unpack_leaf,
  packed_byte_len, IntraContext, IntraConfig, is_no_basin.
  Removed unused LeafCu import from existing predict_intra doctest.

Gates:
  cargo test --features codec --lib hpc::codec → 48 passed
  cargo test --features codec --doc hpc::codec → 14 passed
  cargo fmt --all -- --check → clean
  cargo clippy --features codec --lib -- -D warnings → clean
---
 src/hpc/codec/mode.rs    |  53 +++++++++++++++++-
 src/hpc/codec/predict.rs | 112 +++++++++++++++++++++++++++++++++------
 2 files changed, 148 insertions(+), 17 deletions(-)

diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs
index d18f5c96..ef4cf965 100644
--- a/src/hpc/codec/mode.rs
+++ b/src/hpc/codec/mode.rs
@@ -57,10 +57,20 @@ use super::ctu::{CellMode, LeafCu, MergeDir};
 
 /// Maximum encodable `basin_idx`. Stored in the lower 12 bits of the
 /// header; values >= this constant overflow the header field.
+///
+/// ```
+/// use ndarray::hpc::codec::MAX_BASIN_IDX;
+/// assert_eq!(MAX_BASIN_IDX, (1 << 12) - 1);
+/// ```
 pub const MAX_BASIN_IDX: u16 = (1 << 12) - 1; // 4095
 
 /// Tag inside the per-frame basin codebook for "no basin assigned"
 /// (encoder-side sentinel during mode decision).
+///
+/// ```
+/// use ndarray::hpc::codec::{BASIN_NONE, MAX_BASIN_IDX};
+/// assert_eq!(BASIN_NONE, MAX_BASIN_IDX);
+/// ```
 pub const BASIN_NONE: u16 = MAX_BASIN_IDX;
 
 /// Pack `(mode, basin_idx)` into a 16-bit header.
@@ -85,6 +95,12 @@ pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 {
 ///
 /// The 2-bit mode field always decodes (all 4 variants are valid).
 /// `basin_idx` is the 12-bit lower field, exactly as packed.
+///
+/// ```
+/// use ndarray::hpc::codec::{pack_header, unpack_header, CellMode};
+/// let h = pack_header(CellMode::Escape, 7);
+/// assert_eq!(unpack_header(h), (CellMode::Escape, 7));
+/// ```
 #[inline]
 pub fn unpack_header(packed: u16) -> (CellMode, u16) {
     let mode_bits = ((packed >> 12) & 0b11) as u8;
@@ -103,6 +119,11 @@ pub fn unpack_header(packed: u16) -> (CellMode, u16) {
 // ════════════════════════════════════════════════════════════════════
 
 /// Pack a [`MergeDir`] into the lower 2 bits of a `u8`.
+///
+/// ```
+/// use ndarray::hpc::codec::{pack_merge_dir, MergeDir};
+/// assert_eq!(pack_merge_dir(MergeDir::East), 1);
+/// ```
 #[inline]
 pub fn pack_merge_dir(dir: MergeDir) -> u8 {
     dir as u8
@@ -112,6 +133,13 @@ pub fn pack_merge_dir(dir: MergeDir) -> u8 {
 ///
 /// All four 2-bit values map to a valid `MergeDir`; bits 2-7 are
 /// ignored.
+///
+/// ```
+/// use ndarray::hpc::codec::{pack_merge_dir, unpack_merge_dir, MergeDir};
+/// for d in [MergeDir::North, MergeDir::East, MergeDir::West, MergeDir::South] {
+///     assert_eq!(unpack_merge_dir(pack_merge_dir(d)), d);
+/// }
+/// ```
 #[inline]
 pub fn unpack_merge_dir(byte: u8) -> MergeDir {
     match byte & 0b11 {
@@ -133,7 +161,9 @@ pub fn unpack_merge_dir(byte: u8) -> MergeDir {
 /// worst case) — callers iterating CTUs typically pre-allocate
 /// `6 * cell_count` and trim afterwards.
 ///
-/// Returns `None` if `out.len() < 6` (insufficient capacity).
+/// Returns `None` if `out.len() < packed_byte_len(leaf.mode)` (insufficient
+/// capacity for the *mode's* width — Skip needs 2, Merge/Delta need 3,
+/// Escape needs 6).
 ///
 /// Format:
 /// - Bytes 0-1: header (`pack_header(mode, basin_idx)`, LE)
@@ -151,7 +181,8 @@ pub fn unpack_merge_dir(byte: u8) -> MergeDir {
 /// assert_eq!(consumed, 3);
 /// ```
 pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> {
-    if out.len() < 6 {
+    let required = packed_byte_len(leaf.mode);
+    if out.len() < required {
         return None;
     }
     let header = pack_header(leaf.mode, leaf.basin_idx);
@@ -184,6 +215,16 @@ pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> {
 ///
 /// Returns `None` if the buffer is shorter than the per-mode width
 /// (2 for Skip, 3 for Merge/Delta, 6 for Escape).
+///
+/// ```
+/// use ndarray::hpc::codec::{pack_leaf, unpack_leaf, LeafCu, MergeDir};
+/// let leaf = LeafCu::merge(7, MergeDir::West);
+/// let mut buf = [0u8; 3];
+/// pack_leaf(&leaf, &mut buf).unwrap();
+/// let (decoded, n) = unpack_leaf(&buf).unwrap();
+/// assert_eq!(decoded, leaf);
+/// assert_eq!(n, 3);
+/// ```
 pub fn unpack_leaf(buf: &[u8]) -> Option<(LeafCu, usize)> {
     if buf.len() < 2 {
         return None;
@@ -217,6 +258,14 @@ pub fn unpack_leaf(buf: &[u8]) -> Option<(LeafCu, usize)> {
 
 /// Byte cost of packing a leaf in this mode. Useful for pre-sizing
 /// a buffer without packing first.
+///
+/// ```
+/// use ndarray::hpc::codec::{packed_byte_len, CellMode};
+/// assert_eq!(packed_byte_len(CellMode::Skip), 2);
+/// assert_eq!(packed_byte_len(CellMode::Merge), 3);
+/// assert_eq!(packed_byte_len(CellMode::Delta), 3);
+/// assert_eq!(packed_byte_len(CellMode::Escape), 6);
+/// ```
 #[inline]
 pub const fn packed_byte_len(mode: CellMode) -> usize {
     match mode {
diff --git a/src/hpc/codec/predict.rs b/src/hpc/codec/predict.rs
index 727b94ef..d172f143 100644
--- a/src/hpc/codec/predict.rs
+++ b/src/hpc/codec/predict.rs
@@ -87,6 +87,17 @@ use super::mode::BASIN_NONE;
 /// - `neighbours`: NESW (in [`MergeDir`] discriminant order) optional
 ///   neighbour leaves. `None` for boundary cells; the Merge candidate
 ///   scan skips `None` entries.
+///
+/// ```
+/// use ndarray::hpc::codec::{IntraContext, LeafCu};
+/// let north = LeafCu::delta(5, 17);
+/// let ctx = IntraContext {
+///     basin_idx: 5,
+///     delta_i32: 17,
+///     neighbours: [Some(&north), None, None, None],
+/// };
+/// assert_eq!(ctx.basin_idx, 5);
+/// ```
 #[derive(Debug, Clone, Copy)]
 pub struct IntraContext<'a> {
     /// Pre-resolved basin index (12-bit max).
@@ -102,6 +113,14 @@ pub struct IntraContext<'a> {
 ///
 /// Today a single field; the field exists so the API can grow
 /// (Merge tolerance, RDO knobs in A6) without a signature break.
+///
+/// ```
+/// use ndarray::hpc::codec::IntraConfig;
+/// let default_cfg = IntraConfig::default();
+/// assert!(default_cfg.escape_next_idx.is_none());
+/// let allocated = IntraConfig { escape_next_idx: Some(42) };
+/// assert_eq!(allocated.escape_next_idx, Some(42));
+/// ```
 #[derive(Debug, Clone, Copy)]
 pub struct IntraConfig {
     /// Future allocator for the encoder's escape vector — returns the
@@ -139,7 +158,7 @@ impl Default for IntraConfig {
 ///
 /// ```
 /// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig};
-/// use ndarray::hpc::codec::{CellMode, LeafCu};
+/// use ndarray::hpc::codec::CellMode;
 /// let ctx = IntraContext {
 ///     basin_idx: 42,
 ///     delta_i32: 0,
@@ -170,6 +189,14 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
         return LeafCu::skip(ctx.basin_idx);
     }
 
+    // i8-fit gates both Merge and Delta. Out-of-range δ must skip
+    // Merge entirely — wrapping `200_i32 as u8` aliases to `0xC8`,
+    // which could spuriously match a neighbour whose byte equals
+    // `0xC8` (i8 = -56), producing a leaf the decoder reconstructs as
+    // -56 instead of 200.
+    let fits_i8 = (-128..=127).contains(&ctx.delta_i32);
+    let our_delta_u8 = ctx.delta_i32 as u8; // wrapping cast matches A2 pack
+
     // ── 2. Merge ─────────────────────────────────────────────────────
     //
     // A neighbour is a Merge candidate iff:
@@ -186,20 +213,21 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
     // Multiple matches all collapse to the same coded leaf, so the
     // first-hit policy is order-deterministic without affecting
     // bitstream length.
-    let our_delta_u8 = ctx.delta_i32 as u8; // wrapping cast matches A2 pack
-    for (i, nb_slot) in ctx.neighbours.iter().enumerate() {
-        let Some(nb) = nb_slot else { continue };
-        if nb.mode != CellMode::Delta {
-            continue;
+    if fits_i8 {
+        for (i, nb_slot) in ctx.neighbours.iter().enumerate() {
+            let Some(nb) = nb_slot else { continue };
+            if nb.mode != CellMode::Delta {
+                continue;
+            }
+            if nb.basin_idx != ctx.basin_idx {
+                continue;
+            }
+            if nb.delta != Some(our_delta_u8) {
+                continue;
+            }
+            let dir = merge_dir_from_index(i);
+            return LeafCu::merge(ctx.basin_idx, dir);
         }
-        if nb.basin_idx != ctx.basin_idx {
-            continue;
-        }
-        if nb.delta != Some(our_delta_u8) {
-            continue;
-        }
-        let dir = merge_dir_from_index(i);
-        return LeafCu::merge(ctx.basin_idx, dir);
     }
 
     // ── 3. Delta ─────────────────────────────────────────────────────
@@ -208,7 +236,7 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
     // the encoder's reconstruction must read the byte back as i8 to
     // recover the sign. This matches how `LeafCu::delta` stores it and
     // how `super::mode::pack_leaf` writes it.
-    if (-128..=127).contains(&ctx.delta_i32) {
+    if fits_i8 {
         return LeafCu::delta(ctx.basin_idx, our_delta_u8);
     }
 
@@ -244,6 +272,12 @@ fn merge_dir_from_index(i: usize) -> MergeDir {
 /// is the "no basin" marker. Encoders that compute basins lazily can
 /// short-circuit Skip/Merge/Delta and emit Escape directly when this
 /// fires.
+///
+/// ```
+/// use ndarray::hpc::codec::{is_no_basin, BASIN_NONE};
+/// assert!(is_no_basin(BASIN_NONE));
+/// assert!(!is_no_basin(0));
+/// ```
 #[inline]
 pub fn is_no_basin(basin_idx: u16) -> bool {
     basin_idx == BASIN_NONE
@@ -394,4 +428,52 @@ mod tests {
         assert!(!is_no_basin(0));
         assert!(!is_no_basin(100));
     }
+
+    #[test]
+    fn overflow_delta_does_not_alias_to_merge() {
+        // Regression for the wrapping-cast Merge alias bug:
+        // δ = 200 (overflows i8) must NOT match a neighbour whose
+        // u8 byte equals (200 as u8) = 0xC8 (= -56 in i8). The
+        // encoder must take the Escape path (or, here, the lossy
+        // clamp fallback because no allocator is wired).
+        let nb_alias = LeafCu::delta(100, 0xC8);
+        let neighbours = [Some(&nb_alias), None, None, None];
+        let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default());
+        assert_ne!(leaf.mode, CellMode::Merge, "overflow δ must not Merge");
+        // With no allocator the encoder clamps to +127 (lossy Delta).
+        assert_eq!(leaf.mode, CellMode::Delta);
+        assert_eq!(leaf.delta, Some(127));
+    }
+
+    #[test]
+    fn overflow_delta_with_allocator_takes_escape() {
+        let nb_alias = LeafCu::delta(100, 0xC8);
+        let neighbours = [Some(&nb_alias), None, None, None];
+        let cfg = IntraConfig {
+            escape_next_idx: Some(7),
+        };
+        let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &cfg);
+        assert_eq!(leaf.mode, CellMode::Escape);
+        assert_eq!(leaf.escape_idx, Some(7));
+    }
+
+    #[test]
+    fn pack_leaf_accepts_mode_sized_buffers() {
+        // Regression for the P2 6-byte-minimum bug: Skip should pack
+        // into a 2-byte buffer, Merge/Delta into a 3-byte buffer.
+        use super::super::mode::{pack_leaf, packed_byte_len};
+        let skip = LeafCu::skip(10);
+        let mut buf2 = [0u8; 2];
+        assert_eq!(pack_leaf(&skip, &mut buf2), Some(2));
+        assert_eq!(packed_byte_len(CellMode::Skip), 2);
+
+        let delta = LeafCu::delta(10, 7);
+        let mut buf3 = [0u8; 3];
+        assert_eq!(pack_leaf(&delta, &mut buf3), Some(3));
+
+        // Escape still needs 6 bytes; a 3-byte buffer is rejected.
+        let esc = LeafCu::escape(10, 99);
+        let mut buf3b = [0u8; 3];
+        assert_eq!(pack_leaf(&esc, &mut buf3b), None);
+    }
 }

From b44fe59fbbd0237b6c1dcc0c3102452cc8201c79 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 06:48:03 +0000
Subject: [PATCH 3/5] =?UTF-8?q?fix(codec):=20address=20owner=20review=20on?=
 =?UTF-8?q?=20PR-195=20=E2=80=94=20escape=20allocator=20+=20NEWS=20doc=20+?=
 =?UTF-8?q?=20diagram?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P1 — escape allocator collision in batched encoding:
  Old: cfg.escape_next_idx: Option<u32> was a plain value read through
  `&IntraConfig` — the kernel could not advance it, so every Escape
  leaf in a batch got the same idx, colliding on the escape vector
  slot at decode time.
  New: predict_intra now takes a third arg `escape_next: Option<&mut u32>`
  that the kernel post-increments when Escape fires. Sequential cells
  in the same batch see fresh, non-colliding idxs.
  IntraConfig becomes empty (reserved for future RDO knobs).
  + 1 regression test: escape_allocator_advances_across_batched_calls
  + escape_when_delta_overflows_i8_and_allocator_present extended
    to assert the cursor advances.

P1 — NESW vs NEWS doc/code mismatch:
  MergeDir discriminants are North=0, East=1, West=2, South=3 (NEWS),
  but the doc said "NESW". Fixed in IntraContext docstring + the
  inline Merge-scan comment. Added explicit slot-to-direction table.
  + 1 regression test: merge_slot_2_maps_to_west_and_slot_3_to_south

P2 — ASCII header diagram in mode.rs:
  Old diagram put M0/M1 at MSB bits 15-14, but pack_header uses
  `(mode << 12) | basin`, placing mode at bits 12-13 with bits 14-15
  reserved at zero. Redrew the diagram with explicit bit indices
  and labelled the reserved high bits for future encoder use.

Nits:
- A2 doc table now says "Merge tail: 1 byte (low 2 bits = MergeDir;
  high 6 reserved)" instead of the misleading "MergeDir 2-bit".
- Renamed merge_picks_first_hit_in_nesw_order → ..._in_news_order to
  match the corrected ordering.

Already-addressed (no-op this commit):
- P0 overflow Merge alias → fixed in b39a5769
- P2 pack_leaf 6-byte minimum → fixed in b39a5769

Gates:
  cargo test --features codec --lib hpc::codec → 50 passed (+2 new)
  cargo test --features codec --doc hpc::codec → 15 passed (+1 new)
  cargo fmt --all -- --check → clean
  cargo clippy --features codec --lib -- -D warnings → clean
---
 src/hpc/codec/mode.rs    |  34 ++++----
 src/hpc/codec/predict.rs | 179 +++++++++++++++++++++++++--------------
 2 files changed, 136 insertions(+), 77 deletions(-)

diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs
index ef4cf965..f748b7bd 100644
--- a/src/hpc/codec/mode.rs
+++ b/src/hpc/codec/mode.rs
@@ -13,28 +13,32 @@
 //! decoder can route on a single `u16` load:
 //!
 //! ```text
-//!     MSB                                                    LSB
-//!     ┌──┬──┬──────────────────────────────┐
-//!     │M0│M1│         basin_idx (12)        │   ← 16-bit header
-//!     └──┴──┴──────────────────────────────┘
-//!     │  │  └─ basin_idx is the only payload field always present
-//!     └──┴──── 2-bit mode discriminant (CellMode::as_u8())
-//!     (top 2 bits)
+//!     bit  15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+//!         ┌──┬──┬──┬──┬──────────────────────┐
+//!         │ 0│ 0│M1│M0│      basin_idx (12)  │   ← 16-bit header
+//!         └──┴──┴──┴──┴──────────────────────┘
+//!         │  │  │  │  └────────────────────── basin_idx (bits 0..=11)
+//!         │  │  └──┴────────────────────────── 2-bit mode (bits 12..=13)
+//!         └──┴──────────────────────────────── reserved high bits 14,15
 //! ```
 //!
-//! The remaining 2 bits at the top of the second byte are reserved for
-//! the encoder's future `merge_dir` overlap when the mode is `Merge`;
-//! a separate `pack_mode_dir` helper keeps `Merge`'s direction in a
-//! single byte alongside `Skip`/`Delta`/`Escape`'s mode tag.
+//! Bits 14-15 are reserved at zero; the impl is
+//! `(mode_bits << 12) | basin_bits`, so mode lives at bits 12-13 and
+//! basin at bits 0-11. A future encoder can repurpose bits 14-15
+//! (e.g., for a per-leaf `merge_dir` overlap) without disturbing
+//! existing decoders that mask bits 14-15 off.
 //!
 //! # Per-mode tail width
 //!
 //! | Mode   | Header | Tail bytes               | Total |
 //! |--------|--------|--------------------------|-------|
-//! | Skip   | 2      | 0                        | 2     |
-//! | Merge  | 2      | 1 (`MergeDir` 2-bit)     | 3     |
-//! | Delta  | 2      | 1 (`u8` perturbation)    | 3     |
-//! | Escape | 2      | 4 (`u32` escape_idx, LE) | 6     |
+//! | Skip   | 2      | 0                            | 2     |
+//! | Merge  | 2      | 1 (low 2 bits = `MergeDir`)  | 3     |
+//! | Delta  | 2      | 1 (`u8` perturbation)        | 3     |
+//! | Escape | 2      | 4 (`u32` escape_idx, LE)     | 6     |
+//!
+//! The Merge tail is a full byte even though only its low 2 bits carry
+//! `MergeDir` — high 6 bits are reserved and masked off on read.
 //!
 //! The compact pack writes header (LE) then the per-mode tail. The
 //! `escape_idx` width is the worst case; a future A7 rANS pass can
diff --git a/src/hpc/codec/predict.rs b/src/hpc/codec/predict.rs
index d172f143..3014cb8a 100644
--- a/src/hpc/codec/predict.rs
+++ b/src/hpc/codec/predict.rs
@@ -84,9 +84,17 @@ use super::mode::BASIN_NONE;
 ///   to decide between Delta (fits in i8) and Escape (overflows).
 ///   `i32` width avoids overflow when the caller computes
 ///   `cell_value - basin_value` for two u8 inputs.
-/// - `neighbours`: NESW (in [`MergeDir`] discriminant order) optional
-///   neighbour leaves. `None` for boundary cells; the Merge candidate
-///   scan skips `None` entries.
+/// - `neighbours`: NEWS (in [`MergeDir`] discriminant order:
+///   `North=0, East=1, West=2, South=3`) optional neighbour leaves.
+///   `None` for boundary cells; the Merge candidate scan skips `None`
+///   entries.
+///
+/// ```text
+///   slot 0 → MergeDir::North   (discr 0)
+///   slot 1 → MergeDir::East    (discr 1)
+///   slot 2 → MergeDir::West    (discr 2)
+///   slot 3 → MergeDir::South   (discr 3)
+/// ```
 ///
 /// ```
 /// use ndarray::hpc::codec::{IntraContext, LeafCu};
@@ -105,41 +113,27 @@ pub struct IntraContext<'a> {
     /// Signed delta from basin → cell, in the basin's u8 quantisation
     /// space.
     pub delta_i32: i32,
-    /// NESW neighbour leaves, indexed by [`MergeDir`] discriminant.
+    /// NEWS neighbour leaves, indexed by [`MergeDir`] discriminant
+    /// (`North=0, East=1, West=2, South=3`).
     pub neighbours: [Option<&'a LeafCu>; 4],
 }
 
 /// Configuration for the intra-prediction decision.
 ///
-/// Today a single field; the field exists so the API can grow
-/// (Merge tolerance, RDO knobs in A6) without a signature break.
+/// Reserved for future expansion (Merge tolerance, RDO knobs in A6).
+/// Empty today; constructed via [`Default`] so additions don't break
+/// callers.
 ///
 /// ```
 /// use ndarray::hpc::codec::IntraConfig;
-/// let default_cfg = IntraConfig::default();
-/// assert!(default_cfg.escape_next_idx.is_none());
-/// let allocated = IntraConfig { escape_next_idx: Some(42) };
-/// assert_eq!(allocated.escape_next_idx, Some(42));
+/// let cfg = IntraConfig::default();
+/// // No tunables yet — call sites stay future-compatible.
+/// let _ = cfg;
 /// ```
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Default)]
 pub struct IntraConfig {
-    /// Future allocator for the encoder's escape vector — returns the
-    /// next index to write. `None` disables Escape mode (the encoder
-    /// will fall back to Delta-with-truncation, which **loses
-    /// precision** but never panics; callers wanting lossless coding
-    /// must provide a real allocator).
-    ///
-    /// Stateless API today: encoder calls `escape_next_idx` once per
-    /// Escape decision. The caller is responsible for actually
-    /// appending the u64 cell value into the escape vector at the
-    /// returned index — this kernel doesn't see the cell value.
-    pub escape_next_idx: Option<u32>,
-}
-
-impl Default for IntraConfig {
-    fn default() -> Self {
-        Self { escape_next_idx: None }
-    }
+    // Reserved. Future fields land here without breaking the signature.
+    _reserved: (),
 }
 
 // ════════════════════════════════════════════════════════════════════
@@ -152,6 +146,14 @@ impl Default for IntraConfig {
 /// See the module docs for the decision tree (Skip → Merge → Delta →
 /// Escape) and the rationale (monotone wire cost).
 ///
+/// `escape_next` is a write-cursor into the caller's per-frame escape
+/// vector. When the decision falls through to Escape, the kernel reads
+/// the cursor, emits a leaf referencing that idx, and post-increments
+/// the cursor so subsequent cells in the same batch get fresh,
+/// non-colliding idxs. Pass `None` to disable lossless Escape — the
+/// kernel then clamps `δ` to i8 range and emits a `Delta` leaf whose
+/// reconstruction is **not bit-exact** (caller must accept the loss).
+///
 /// # Examples
 ///
 /// Skip when the cell is exactly the basin:
@@ -164,7 +166,7 @@ impl Default for IntraConfig {
 ///     delta_i32: 0,
 ///     neighbours: [None; 4],
 /// };
-/// let leaf = predict_intra(&ctx, &IntraConfig::default());
+/// let leaf = predict_intra(&ctx, &IntraConfig::default(), None);
 /// assert_eq!(leaf.mode, CellMode::Skip);
 /// assert_eq!(leaf.basin_idx, 42);
 /// ```
@@ -179,11 +181,26 @@ impl Default for IntraConfig {
 ///     delta_i32: 17,
 ///     neighbours: [None; 4],
 /// };
-/// let leaf = predict_intra(&ctx, &IntraConfig::default());
+/// let leaf = predict_intra(&ctx, &IntraConfig::default(), None);
 /// assert_eq!(leaf.mode, CellMode::Delta);
 /// assert_eq!(leaf.delta, Some(17));
 /// ```
-pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
+///
+/// Escape with an allocator — repeated calls bump the cursor:
+///
+/// ```
+/// use ndarray::hpc::codec::predict::{predict_intra, IntraContext, IntraConfig};
+/// use ndarray::hpc::codec::CellMode;
+/// let mut next = 7u32;
+/// let ctx = IntraContext { basin_idx: 1, delta_i32: 1000, neighbours: [None; 4] };
+/// let a = predict_intra(&ctx, &IntraConfig::default(), Some(&mut next));
+/// let b = predict_intra(&ctx, &IntraConfig::default(), Some(&mut next));
+/// assert_eq!(a.escape_idx, Some(7));
+/// assert_eq!(b.escape_idx, Some(8));
+/// assert_eq!(next, 9);
+/// assert_eq!(a.mode, CellMode::Escape);
+/// ```
+pub fn predict_intra(ctx: &IntraContext, _cfg: &IntraConfig, escape_next: Option<&mut u32>) -> LeafCu {
     // ── 1. Skip ──────────────────────────────────────────────────────
     if ctx.delta_i32 == 0 {
         return LeafCu::skip(ctx.basin_idx);
@@ -209,7 +226,8 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
     //       wrapping cast; matches the A2 pack format where Delta
     //       stores a raw u8 byte without a sign bit)
     //
-    // We scan NESW in discriminant order and pick the first match.
+    // We scan NEWS in discriminant order (N=0, E=1, W=2, S=3) and
+    // pick the first match.
     // Multiple matches all collapse to the same coded leaf, so the
     // first-hit policy is order-deterministic without affecting
     // bitstream length.
@@ -242,16 +260,23 @@ pub fn predict_intra(ctx: &IntraContext, cfg: &IntraConfig) -> LeafCu {
 
     // ── 4. Escape ────────────────────────────────────────────────────
     //
-    // |δ| doesn't fit in i8. Caller must own the per-frame escape
-    // vector and provide the next-write index; we return a leaf that
-    // references it. If the caller didn't provide an allocator, we
-    // fall back to a saturated Delta (lossy but never panicking) so
-    // a misconfigured encoder still produces a valid bytestream.
-    match cfg.escape_next_idx {
-        Some(idx) => LeafCu::escape(ctx.basin_idx, idx),
+    // |δ| doesn't fit in i8. The cursor `escape_next` is a write-pointer
+    // into the caller's per-frame escape vector; we read it, emit a
+    // leaf referencing that idx, and post-increment so subsequent
+    // overflow cells in the batch don't collide on the same vector
+    // slot. If the caller didn't provide an allocator, we fall back to
+    // a saturated Delta (lossy: reconstruction is NOT bit-exact, but
+    // never panicking) so a misconfigured encoder still produces a
+    // valid bytestream. The lossy leaf's `mode` is `CellMode::Delta`
+    // even though its semantic value overflowed i8 — by contract the
+    // caller has acknowledged the precision loss.
+    match escape_next {
+        Some(next) => {
+            let idx = *next;
+            *next = next.wrapping_add(1);
+            LeafCu::escape(ctx.basin_idx, idx)
+        }
         None => {
-            // Lossy fallback: clamp to i8 range. Caller is responsible
-            // for noticing that the reconstruction won't be bit-exact.
             let clamped = ctx.delta_i32.clamp(-128, 127) as u8;
             LeafCu::delta(ctx.basin_idx, clamped)
         }
@@ -301,7 +326,7 @@ mod tests {
 
     #[test]
     fn skip_when_delta_is_zero() {
-        let leaf = predict_intra(&ctx_with_neighbours(100, 0, [None; 4]), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 0, [None; 4]), &IntraConfig::default(), None);
         assert_eq!(leaf, LeafCu::skip(100));
     }
 
@@ -310,14 +335,14 @@ mod tests {
         // δ=0 trumps everything else, even a perfect Merge candidate.
         let nb = LeafCu::delta(100, 0);
         let neighbours = [Some(&nb), None, None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 0, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 0, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Skip);
     }
 
     #[test]
     fn delta_in_i8_range() {
         for d in [-128i32, -1, 1, 127] {
-            let leaf = predict_intra(&ctx_with_neighbours(100, d, [None; 4]), &IntraConfig::default());
+            let leaf = predict_intra(&ctx_with_neighbours(100, d, [None; 4]), &IntraConfig::default(), None);
             assert_eq!(leaf.mode, CellMode::Delta);
             assert_eq!(leaf.delta, Some(d as u8));
         }
@@ -328,7 +353,7 @@ mod tests {
         // Northern neighbour: Delta-mode, same basin, same δ as us.
         let nb_north = LeafCu::delta(100, 17);
         let neighbours = [Some(&nb_north), None, None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Merge);
         assert_eq!(leaf.merge_dir, Some(MergeDir::North));
         assert_eq!(leaf.basin_idx, 100);
@@ -340,7 +365,7 @@ mod tests {
         // reference frame). Falls through to Delta.
         let nb_north = LeafCu::delta(99, 17);
         let neighbours = [Some(&nb_north), None, None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Delta);
     }
 
@@ -351,19 +376,35 @@ mod tests {
         let nb_merge = LeafCu::merge(100, MergeDir::North);
         let nb_esc = LeafCu::escape(100, 0);
         let neighbours = [Some(&nb_skip), Some(&nb_merge), None, Some(&nb_esc)];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Delta);
     }
 
     #[test]
-    fn merge_picks_first_hit_in_nesw_order() {
+    fn merge_picks_first_hit_in_news_order() {
         // Both N and E qualify; encoder must pick N (lower index).
         let nb_match = LeafCu::delta(100, 17);
         let neighbours = [Some(&nb_match), Some(&nb_match), None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.merge_dir, Some(MergeDir::North));
     }
 
+    #[test]
+    fn merge_slot_2_maps_to_west_and_slot_3_to_south() {
+        // Slot-3 South coverage gap noted in review. Verify the
+        // discriminant order (N=0, E=1, W=2, S=3) is reflected at
+        // the merge_dir output, not just NEWS-by-convention.
+        let nb = LeafCu::delta(100, 17);
+
+        let only_west = [None, None, Some(&nb), None];
+        let leaf_w = predict_intra(&ctx_with_neighbours(100, 17, only_west), &IntraConfig::default(), None);
+        assert_eq!(leaf_w.merge_dir, Some(MergeDir::West));
+
+        let only_south = [None, None, None, Some(&nb)];
+        let leaf_s = predict_intra(&ctx_with_neighbours(100, 17, only_south), &IntraConfig::default(), None);
+        assert_eq!(leaf_s.merge_dir, Some(MergeDir::South));
+    }
+
     #[test]
     fn merge_negative_delta_via_wrapping_cast() {
         // δ = -17 packs to 0xEF (= 239 as u8). Neighbour stored as
@@ -371,20 +412,35 @@ mod tests {
         // saturating.
         let nb_match = LeafCu::delta(100, (-17_i32) as u8);
         let neighbours = [None, Some(&nb_match), None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, -17, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, -17, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Merge);
         assert_eq!(leaf.merge_dir, Some(MergeDir::East));
     }
 
     #[test]
     fn escape_when_delta_overflows_i8_and_allocator_present() {
-        let cfg = IntraConfig {
-            escape_next_idx: Some(42),
-        };
-        let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &cfg);
+        let mut next = 42u32;
+        let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default(), Some(&mut next));
         assert_eq!(leaf.mode, CellMode::Escape);
         assert_eq!(leaf.escape_idx, Some(42));
         assert_eq!(leaf.basin_idx, 100);
+        // Cursor advanced so the next Escape gets a fresh idx.
+        assert_eq!(next, 43);
+    }
+
+    #[test]
+    fn escape_allocator_advances_across_batched_calls() {
+        // Regression: two consecutive Escape decisions must not
+        // collide on the same vector slot. With a `&mut u32` cursor
+        // the kernel post-increments, so cell A sees idx N and
+        // cell B sees idx N+1.
+        let mut next = 5u32;
+        let a = predict_intra(&ctx_with_neighbours(7, 999, [None; 4]), &IntraConfig::default(), Some(&mut next));
+        let b = predict_intra(&ctx_with_neighbours(7, -999, [None; 4]), &IntraConfig::default(), Some(&mut next));
+        assert_eq!(a.escape_idx, Some(5));
+        assert_eq!(b.escape_idx, Some(6));
+        assert_eq!(next, 7);
+        assert_ne!(a.escape_idx, b.escape_idx);
     }
 
     #[test]
@@ -392,14 +448,14 @@ mod tests {
         // Without an escape_next_idx, the encoder clamps to i8 range.
         // The result is a valid LeafCu but the reconstruction won't
         // be bit-exact.
-        let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 1000, [None; 4]), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Delta);
         assert_eq!(leaf.delta, Some(127));
     }
 
     #[test]
     fn escape_lossy_fallback_negative_overflow() {
-        let leaf = predict_intra(&ctx_with_neighbours(100, -1000, [None; 4]), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, -1000, [None; 4]), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Delta);
         assert_eq!(leaf.delta, Some((-128_i32) as u8));
     }
@@ -412,7 +468,7 @@ mod tests {
         use super::super::mode::{pack_leaf, unpack_leaf};
         let nb = LeafCu::delta(100, 17);
         let neighbours = [None, Some(&nb), None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 17, neighbours), &IntraConfig::default(), None);
         assert_eq!(leaf.mode, CellMode::Merge);
 
         let mut buf = [0u8; 6];
@@ -438,7 +494,7 @@ mod tests {
         // clamp fallback because no allocator is wired).
         let nb_alias = LeafCu::delta(100, 0xC8);
         let neighbours = [Some(&nb_alias), None, None, None];
-        let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default());
+        let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default(), None);
         assert_ne!(leaf.mode, CellMode::Merge, "overflow δ must not Merge");
         // With no allocator the encoder clamps to +127 (lossy Delta).
         assert_eq!(leaf.mode, CellMode::Delta);
@@ -449,12 +505,11 @@ mod tests {
     fn overflow_delta_with_allocator_takes_escape() {
         let nb_alias = LeafCu::delta(100, 0xC8);
         let neighbours = [Some(&nb_alias), None, None, None];
-        let cfg = IntraConfig {
-            escape_next_idx: Some(7),
-        };
-        let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &cfg);
+        let mut next = 7u32;
+        let leaf = predict_intra(&ctx_with_neighbours(100, 200, neighbours), &IntraConfig::default(), Some(&mut next));
         assert_eq!(leaf.mode, CellMode::Escape);
         assert_eq!(leaf.escape_idx, Some(7));
+        assert_eq!(next, 8);
     }
 
     #[test]

From 01c77cccb7811a034920829faa816e0effcf08d6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 07:30:21 +0000
Subject: [PATCH 4/5] =?UTF-8?q?docs(codec):=20cross-domain=20synergies=20?=
 =?UTF-8?q?=E2=80=94=20x265=20=E2=87=84=203DGS=20=E2=87=84=20attention=20?=
 =?UTF-8?q?=E2=87=84=20SGD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

722-line knowledge doc capturing the architectural claim that PR-X12's
primitives (LeafCu, pack_leaf, predict_intra, Ctu, eventual rANS) are
not video-codec-specific — they serve four loads on a single
predictive-coder substrate:

  1. Cognitive cells (the original target)
  2. 3D Gaussian Splat coefficient compression
  3. Transformer attention sparsification
  4. Distributed-SGD gradient streaming

Sections:
- The 4-load isomorphism (§ 2) and 16-state (CellMode × MergeDir)
  classification table across all four domains
- Primitive → load mapping matrix (§ 3) with file:line refs
- 10 cross-domain epiphanies (§ 4) — including MergeDir-as-topology,
  predict_intra-as-attention-sparsifier, escape_next-as-allreduce-slot,
  mode-coded LoRA
- 6 integration plans (§ 5) with worker × week + dependency graph
- 8 exploration paths (§ 6) for sprint-scale research
- 6 holy grail outcomes (§ 7) — sub-1-byte/Gaussian 3DGS,
  bit-exact attention, federated SGD at 8-16×, Lance substrate identity
- 10 codec-side debts (§ 8) and 13 stack-side debts (§ 9)
- Sequencing summary (§ 10): critical path = A7 rANS

Companion to pr-x12-codec-x265-design.md (the mechanical design); a
"See also" breadcrumb added there points here.
---
 .claude/knowledge/pr-x12-codec-x265-design.md |   6 +
 .../pr-x12-cross-domain-synergies.md          | 722 ++++++++++++++++++
 2 files changed, 728 insertions(+)
 create mode 100644 .claude/knowledge/pr-x12-cross-domain-synergies.md

diff --git a/.claude/knowledge/pr-x12-codec-x265-design.md b/.claude/knowledge/pr-x12-codec-x265-design.md
index 182d2849..4d80fa62 100644
--- a/.claude/knowledge/pr-x12-codec-x265-design.md
+++ b/.claude/knowledge/pr-x12-codec-x265-design.md
@@ -8,6 +8,12 @@
 > **Depends on**: PR-X10 (linalg-core), PR-X3 BlockedGrid (shipped).
 > **Used by**: PR-X9 (basin-codebook lazy storage) — the codec encodes
 > cognitive cells into skip/merge/delta/escape modes.
+>
+> **See also**: `pr-x12-cross-domain-synergies.md` — the architectural
+> claim that the primitives in this design doc *also* serve 3DGS
+> coefficient compression, transformer attention sparsification, and
+> distributed-SGD gradient streaming. Holy grail outcomes, integration
+> plans, and honest debt catalogue live there.
 
 ## Why
 
diff --git a/.claude/knowledge/pr-x12-cross-domain-synergies.md b/.claude/knowledge/pr-x12-cross-domain-synergies.md
new file mode 100644
index 00000000..ee074059
--- /dev/null
+++ b/.claude/knowledge/pr-x12-cross-domain-synergies.md
@@ -0,0 +1,722 @@
+# PR-X12 — Cross-Domain Synergies: x265 ⇄ 3D Gaussian Splat ⇄ Cognitive Shaders ⇄ BLAS/MKL
+
+> READ BY: savant-architect, codec-architect, cognitive-architect,
+> splat-architect, vector-synthesis, l3-strategist, sentinel-qa,
+> product-engineer.
+>
+> Status: epiphany-grade exploration doc, drafted 2026-05-22 during
+> the PR-195 (A2 + A3-intra) review cycle.
+>
+> Companion to `.claude/knowledge/pr-x12-codec-x265-design.md` (the
+> mechanical design). This doc captures the **why-it-generalizes**
+> that the design doc deliberately scopes out.
+
+## TL;DR
+
+PR-X12 was framed as "x265 for cognitive cells" — the mechanical
+design doc already maps x265 onto BlockedGrid. The deeper observation
+this doc commits to is that the **same primitives — `LeafCu`,
+`pack_leaf`, `predict_intra`, `Ctu`, rANS — also serve 3D Gaussian
+splat coefficient compression, transformer attention sparsification,
+and distributed-SGD gradient streaming.** The four domains are not
+analogous; they are **four loads on a single predictive-coder
+substrate.** This doc:
+
+1. Names the isomorphism precisely (§ 2)
+2. Maps every codec primitive to its load in each domain (§ 3)
+3. Calls out the epiphanies — cross-domain insights I have not seen
+   in print (§ 4)
+4. Lays out integration plans with concrete PR-arc estimates (§ 5)
+5. Catalogues exploration paths that warrant a sprint, not a PR (§ 6)
+6. States the holy grail outcomes that fall out if it all lands (§ 7)
+7. Honest debt accounting — codec side (§ 8) and existing stack
+   side (§ 9). No marketing.
+
+## 0. Audience preconditions
+
+This doc assumes the reader has internalised:
+
+- `Ctu` / `CtuArena` / `CtuPartition` / `LeafCu` / `CellMode` /
+  `MergeDir` from `src/hpc/codec/ctu.rs` (PR-170).
+- `pack_leaf` / `unpack_leaf` / `pack_header` / `predict_intra` from
+  `src/hpc/codec/mode.rs` + `src/hpc/codec/predict.rs` (PR-195).
+- The Click P-1 method discipline (operations on carriers, not free
+  functions) and the data-flow rule (no `&mut self` during compute).
+- The cognitive cell → basin codebook story from
+  `.claude/knowledge/pr-x12-codec-x265-design.md` § "Core types".
+- Inria 3DGS paper (Kerbl et al. 2023) + EWA Splatting (Zwicker 2001).
+- That the cognitive `splat.rs` in `lance-graph-contract` is sacred
+  and **separate** from `splat3d::*` (the geometric forward renderer
+  shipped in PRs 1-7 of the May sprint).
+
+If any of the above is fuzzy, read those sources first; the rest of
+this doc compresses.
+
+## 1. The four loads
+
+| Load | Carrier | Per-element payload | Predictability source |
+|------|---------|--------------------|----------------------|
+| **Cognitive cell** | `BlockedGrid<u64, 64, 64>` | 64-bit fingerprint | basin codebook (per-frame), spatial NEWS neighbours |
+| **3DGS Gaussian** | SoA `(μ, scale, rot, opacity, SH)` | ~236 bytes raw | sorted-along-curve neighbours, basin (color/scale clusters) |
+| **Transformer attention** | `(Q, K, V)` per (head, token) | Q,K,V vectors | KV palette clusters, previous-token attention pattern |
+| **Distributed SGD gradient** | per-parameter `∂L/∂w` | FP32 grad | mini-batch siblings, gradient sparsity, sign agreement |
+
+All four loads share the same predictive-coding skeleton:
+
+```
+                ┌──────────────────────────────────────┐
+                │ 1. Build basin codebook (offline or  │
+                │    online k-means on the carrier)    │
+                └──────────────────────────────────────┘
+                              │
+                              ▼
+                ┌──────────────────────────────────────┐
+                │ 2. Resolve nearest basin per element │
+                │    → (basin_idx, δ from basin)       │
+                └──────────────────────────────────────┘
+                              │
+                              ▼
+                ┌──────────────────────────────────────┐
+                │ 3. Mode-decide per element:          │
+                │    Skip (δ=0)                        │
+                │    Merge (δ matches NEWS neighbour)  │
+                │    Delta (δ fits 8-bit)              │
+                │    Escape (full payload, idx into    │
+                │            per-frame escape vector)  │
+                └──────────────────────────────────────┘
+                              │
+                              ▼
+                ┌──────────────────────────────────────┐
+                │ 4. Pack LeafCu (2/3/3/6 bytes) into  │
+                │    bytestream                        │
+                └──────────────────────────────────────┘
+                              │
+                              ▼
+                ┌──────────────────────────────────────┐
+                │ 5. rANS-encode the bytestream with   │
+                │    per-frame frequency tables (A7)   │
+                └──────────────────────────────────────┘
+```
+
+Steps 1-5 are domain-agnostic. **What changes per load is the basin
+clustering rule (step 1) and the escape payload (step 4's Escape
+branch).** Everything else is shared kernel.
+
+## 2. The isomorphism
+
+**Claim:** `LeafCu` is a **discriminated union over (basin_ref,
+residual) representations**, parameterised by a 2-bit predictability
+class. The four bits across `(CellMode × MergeDir)` form a 16-state
+classification machine that is not specific to video or cognitive
+content. It is the natural mode-coding alphabet for any signal that
+is:
+
+- **Locally predictable** from a small per-frame codebook
+- **Spatially smooth** in a defined neighbour topology (NEWS, in
+  PR-X12 today; generalisable to other 4-way topologies such as
+  token-sequential, and to 6-way XYZ with a wire-format change)
+- **Heavy-tailed** in its residuals (most values fit a small δ;
+  rare values need full Escape)
+
+All four loads named in § 1 satisfy these three properties. The
+codec we shipped is therefore not "an HEVC port"; it is the
+**reference encoder for predictable-codebook signals**. HEVC is one
+consumer.
+
+### The 16-state classification table
+
+`(CellMode, MergeDir)` cross product, repurposed per domain:
+
+| Mode × Dir | Cognitive cell | 3DGS Gaussian | Attention | Gradient |
+|------------|----------------|---------------|-----------|----------|
+| Skip, — | cell = basin exactly | Gaussian = palette splat exactly | Q has no significant K | grad ≈ 0 (sparse update) |
+| Merge, N | inherit δ from N-neighbour | inherit from prev-Morton Gaussian | inherit attention from prev-token | inherit grad from prev-layer sibling |
+| Merge, E | inherit from E-neighbour | inherit from next-Morton | inherit from next-token | inherit from next-layer |
+| Merge, W | inherit from W-neighbour | inherit from coarse-tier parent | inherit from prev-head | inherit from prev-iteration |
+| Merge, S | inherit from S-neighbour | inherit from fine-tier child | inherit from next-head | inherit from next-iteration |
+| Delta, — | 8-bit cell perturbation | 8-bit residual on (μ, scale, op) | 8-bit attention weight δ | 8-bit grad (QSGD, signSGD-magnitude) |
+| Escape, — | full 64-bit fingerprint via idx | full SH coeffs ≥ L=2 via idx | full FP16 Q vector via idx | full FP32 grad via idx |
+
+`MergeDir`'s 4-way alphabet is **already the natural carrier** for
+"inherit from one of 4 neighbours in some topology". The topology
+varies per load; the encoding does not.
+
+## 3. Primitive → load mapping matrix
+
+This is the dense one. Each row is one primitive from PR-X12; the
+columns are the four loads. Cells say what the primitive does in
+that load, with file/line refs back to ndarray master.
+
+### 3.1 Carrier primitives
+
+| Primitive | Cognitive cell | 3DGS Gaussian | Attention | Gradient SGD |
+|-----------|----------------|---------------|-----------|--------------|
+| `Ctu` (`ctu.rs:285`) | one L1 BlockedGrid block (64×64 cells) | one tile-bin or one octree node (64-256 Gaussians) | one (token-window × heads) block (typically 64×16) | one parameter-shard (64K weights) |
+| `CtuArena` (`ctu.rs:212`) | 85-node quad-tree per CTU | tile quad-tree (LOD cascade) | token-window prefix-tree | per-shard residual hierarchy |
+| `CtuPartition` (`ctu.rs:193`) | recursive 64→32→16→8 split | tile 64×64 → 16×16 → 4×4 LOD | window 64→16→4 attention granularity | shard 64K→16K→4K gradient grouping |
+| `LeafCu` (`ctu.rs:114`) | one cell's encoded mode | one Gaussian's encoded mode | one (head, token-position)'s mode | one weight's gradient mode |
+| `MAX_BASIN_IDX = 4095` (`mode.rs:62`) | 4096-entry basin codebook | 4096-entry palette (μ_color × scale clusters) | 4096-entry KV cluster centroid | 4096-entry gradient-pattern bank |
+| `BASIN_NONE` (`mode.rs:71`) | cell outside any basin | Gaussian outside palette range | Q outside KV palette (forces Escape) | grad outside known patterns |
+
+### 3.2 Encoder primitives
+
+| Primitive | Cognitive cell | 3DGS Gaussian | Attention | Gradient SGD |
+|-----------|----------------|---------------|-----------|--------------|
+| `pack_header(mode, basin_idx)` (`mode.rs:83`) | 16-bit cell header | 16-bit Gaussian header | 16-bit (head, token) header | 16-bit weight header |
+| `pack_leaf` (`mode.rs:172`) | 2/3/3/6 byte cell record | 2/3/3/N byte Gaussian record (N depends on SH order in Escape) | 2/3/3/Q-width byte attention record | 2/3/3/4-byte weight gradient record |
+| `predict_intra` (`predict.rs:186`) | encoder picks mode for cell | encoder picks mode for Gaussian per-Morton-step | encoder picks mode for (Q, K) pair | encoder picks mode for ∂L/∂w |
+| `IntraContext.neighbours` (`predict.rs:117`) | NEWS spatial neighbours | prev/next Morton-sorted neighbours + parent/child tier | prev/next token + prev/next head | prev/next layer + prev/next iter |
+| `IntraConfig` (`predict.rs:132`) | (future RDO knobs) | (future LOD/PSNR tradeoff knobs) | (future accuracy/latency knobs) | (future compression/convergence knobs) |
+| `escape_next: Option<&mut u32>` (`predict.rs:202`) | escape vector cursor for full-payload cells | escape vector cursor for SH-heavy Gaussians | escape vector cursor for outlier Q | escape vector cursor for outlier grads |
+
+### 3.3 Wire-format primitives (deferred to A7/A8)
+
+| Primitive | Cognitive cell | 3DGS Gaussian | Attention | Gradient SGD |
+|-----------|----------------|---------------|-----------|--------------|
+| rANS encoder (A7) | per-frame basin-frequency table | per-asset palette-frequency table | per-context attention-pattern frequency | per-layer gradient-mode frequency |
+| Stream framing (A8) | CTU markers, frame headers | tile-bin markers, asset headers | window markers, batch headers | shard markers, iter headers |
+| Escape vector | per-frame `Vec<u64>` of full fingerprints | per-asset `Vec<[f32; SH_LEN]>` | per-context `Vec<[f16; head_dim]>` | per-shard `Vec<f32>` |
+
+## 4. Epiphanies
+
+Cross-domain insights worth flagging because each has 1-3 papers'
+worth of novelty if pursued. None of these are in print as of the
+literature snapshot I'm working from; **claim** is the right word, not
+"finding".
+
+### E1. **`MergeDir` is a topology, not a direction.**
+
+`{North, East, West, South}` happens to be a 2D Cartesian raster
+mental model. The codec doesn't care. The discriminant alphabet just
+needs to be a 4-way categorical over "which of 4 neighbours did I
+inherit from". In 3DGS that's `{prev-Morton, next-Morton, parent-LOD,
+child-LOD}`. In attention that's `{prev-token, next-token, prev-head,
+next-head}`. In SGD that's `{prev-layer, next-layer, prev-iter,
+next-iter}`. **No code change required.** The doc + the docstring
+in `IntraContext.neighbours` are the only constraints; the 2-bit
+encoding is topology-free. → write up as: "Carrier-agnostic Merge
+inheritance via parameterised 4-neighbour topology" (mini-paper).
+
+### E2. **`predict_intra` already encodes attention sinks.**
+
+The "Skip" mode case in `predict_intra` (`predict.rs:189-190`) —
+returns when `delta_i32 == 0` — is exactly the attention-sink
+phenomenon Streaming-LLM, H2O, SnapKV chase. Their attention mass
+concentrates on a tiny subset of tokens; the rest are "Skip". With
+the basin codebook as KV cluster centroids, **`predict_intra` is a
+zero-shot attention sparsifier**: it labels every (Q, K) pair as
+Skip/Merge/Delta/Escape and the wire cost is monotone in attention
+mass. Combine with the rANS A7 and you get **bit-exact KV-cache
+compression with a tunable accuracy floor**. The encoder is shipped.
+
+### E3. **`escape_next: &mut u32` is the lineage of gradient streaming.**
+
+The owner-author review's P1 — escape allocator collision — is the
+exact issue federated-SGD papers solve with "all-reduce buckets":
+multiple workers each emit gradient deltas, the aggregator needs
+non-colliding slots in a shared vector. The `Option<&mut u32>` cursor
+**is the all-reduce slot allocator**, just per-CTU instead of
+per-batch. Lift it to a worker-pool API and you have a federated
+gradient codec without writing new code.
+
+### E4. **The 64-bit `Fingerprint` and a 3DGS Gaussian's first-six floats compress identically.**
+
+Cognitive `Fingerprint` is 64 bits = 4×16-bit lanes. A 3DGS Gaussian's
+`(μ_x, μ_y, μ_z, scale_x, scale_y, scale_z)` is 6 × FP16 quantised =
+96 bits, but with 32 high bits dominated by the scale envelope which
+is locally constant per palette basin. After basin subtraction, the
+residual is ~64 bits — **identical to the cognitive cell case**. The
+same `pack_leaf` works. The escape vector type changes from `u64` to
+`[u16; 6]` but the codec is structurally invariant.
+
+### E5. **The Morton/Hilbert sort along which we encode 3DGS Gaussians is the EXACT spatial structure HEVC's z-order coding-unit scan implements in 2D.**
+
+HEVC walks the coding units inside each CTU in z-order. 3DGS Gaussians sorted Morton/Hilbert
+along their μ are z-order in 3D. The encoder doesn't know it's seeing
+3D content; the spatial coherence in 1D-along-curve is identical to
+2D-along-raster. **The CTU partition machinery in `ctu.rs` ports to
+3DGS with zero changes to the partition logic.** What changes is the
+predicate that decides when to split (variance of (μ, scale, opacity)
+inside the node vs. PSNR target).
+
+### E6. **rANS with per-frame frequency tables is the *only* entropy coder that scales to 10⁶+ tokens.**
+
+CABAC is fine for video at ~10⁵ macroblocks/frame. Attention at
+10⁶+ tokens/sec needs an entropy coder whose state machine fits in
+L1 cache and whose throughput is gated by table lookup, not by the
+serial CABAC interval renormalisation. rANS is that. **A7 is the
+critical piece; without it the codec is academic.** Prioritise.
+
+### E7. **The 4096-entry basin codebook is identical to attention's KV palette identity in lance-graph.**
+
+This is the architectural payoff. `lance-graph::SpoDistanceMatrices`
+computes (basin_id, distance) for SPO triples at 611M lookups/sec
+(see CLAUDE.md "Session: Qwen3.5 × Opus 4.5/4.6"). The same data
+structure feeds the cognitive codec basin lookup AND the attention
+KV-cluster lookup AND the 3DGS palette nearest-neighbour. **One
+codebook, three consumers, identical lookup kernel.** Lance is the
+column substrate; the codebook is its first "logical schema" — and
+that schema is shared.
+
+### E8. **Mode-coding is parameter-efficient supervised LoRA.**
+
+A LoRA adapter on a weight matrix `W` is a rank-r perturbation
+`W + ΔW = W + B·A`. Express `ΔW` as a `BlockedGrid<u64, 64, 64>` and
+mode-code it. Most weights are Skip (no LoRA contribution), some
+inherit from neighbours (Merge), a few have small per-weight deltas
+(Delta), and the heavy hitters are Escape. **`LeafCu`-coded LoRA is
+~10× smaller than rank-32 LoRA on weight matrices > 4096².** The
+codec is the parameter-efficient fine-tuning representation.
+The user's "Perturbationslernen" instinct lands here.
+
+### E9. **The `splat3d` PRs 1-7 (May sprint) and the `codec` PRs are the SAME pipeline shifted 90°.**
+
+The splat3d forward pipeline is: project → tile-bin → mode-decide
+(which Gaussian contributes at which pixel) → alpha-composite. The
+codec pipeline is: build codebook → block-partition → mode-decide
+(which mode each cell takes) → entropy-code. **Both end in
+mode-decide → reduce.** The mode-decide kernel has the shape of
+`predict_intra` in both cases (splat3d does not call it yet); the
+reduction differs (alpha-composite vs. rANS). A unified
+sprint, not a PR**.
+
+### E10. **The lossy Escape fallback is a PSNR knob in disguise.**
+
+The owner-review's P2 nit — "lossy Escape emits `CellMode::Delta`,
+the docstring lies" — is a feature, not a bug, **iff** we expose a
+"lossy_threshold: u8" config. Then the fallback becomes "use Delta
+for any |δ| ≤ threshold even if it would normally Escape". That's
+the rate-distortion knob HEVC's λ-RDO tunes. **Promote the
+docstring acknowledgement into a config field in A6 RDO.**
+
+## 5. Integration plans
+
+Concrete branches/PRs, each with effort estimate + dependency.
+Listed by priority (impact ÷ risk).
+
+### Plan A — A7 rANS (critical, no domain-specific blockers)
+
+**Effort:** 1 worker × 1 week. Standard rANS, single-symbol,
+encoder + decoder + parity test. Consumes `pack_leaf` output, emits
+compressed bytestream.
+
+**File:** `src/hpc/codec/ans.rs` (new).
+
+**Dependency:** none — A2 + A3-intra are sufficient input.
+
+**Why first:** without entropy coding, the codec gives 2-3× over
+raw. With rANS at per-frame frequency tables, 6-10×. Below the
+rANS threshold, the codec is academic.
+
+### Plan B — A3-inter (cross-tier neighbour scan) (codec-side completion)
+
+**Effort:** 1 worker × 3 days. Extend `IntraContext.neighbours` to
+include parent-tier and child-tier neighbours from `BlockedGrid`'s
+L2/L3 cascade. The mode-decision tree then weighs 8 candidates instead of 4.
+
+**File:** `src/hpc/codec/predict.rs` (extend) + new `inter.rs`.
+
+**Dependency:** PR-X3 BlockedGrid L2/L3 cascade (shipped).
+
+**Why second:** unlocks the recursive partition compression. Without
+inter prediction, parent-tier basins don't seed child-tier deltas.
+
+### Plan C — EWA SYRK-batched (3DGS performance, no codec changes)
+
+**Effort:** 1 worker × 1 week. Replace `sandwich_x16` per-Gaussian
+loop with batched `cblas_ssyrk`. Add backend-dispatch (native /
+intel-mkl / openblas).
+
+**File:** `src/hpc/splat3d/spd3.rs` (extend) +
+`src/backend/{native,mkl,openblas}.rs` (add syrk wiring).
+
+**Dependency:** ndarray BLAS backend infra (shipped).
+
+**Why third:** biggest pure-FLOPS win, splat-aligned, no codec
+coupling. Hits the holy grail outcome §7.1.
+
+### Plan D — Attention codec PoC (cognitive-side new ground)
+
+**Effort:** 2 workers × 2 weeks. Wire `predict_intra` against a
+synthetic KV cache; build the basin codebook via mini-batch k-means
+on K vectors; measure compression vs. accuracy on a known LLM
+benchmark (LongBench, RULER).
+
+**File:** new crate `crates/attention-codec/` consuming
+`ndarray::hpc::codec::*`.
+
+**Dependency:** Plan A (rANS) for realistic compression numbers.
+
+**Why fourth:** highest-novelty load; depends on A7 to be convincing.
+
+### Plan E — 3DGS coefficient codec (splat-side compression)
+
+**Effort:** 2 workers × 3 weeks. Morton-sort a trained scene's
+Gaussians, build per-asset palette codebook via k-means over
+(color, scale), mode-code the residuals through `pack_leaf`, rANS
+through A7.
+
+**File:** new module `src/hpc/splat3d/codec.rs`.
+
+**Dependency:** Plan A (rANS), Plan B (A3-inter for LOD cascade).
+
+**Why fifth:** highest engineering value, but has external benchmark
+risk — Inria's PLY format has format-stability constraints we'd
+need to negotiate (or just ship a parallel format).
+
+### Plan F — Gradient streaming codec (federated SGD)
+
+**Effort:** 2 workers × 4 weeks. Workers emit `LeafCu` streams; the
+aggregator decodes and applies. Requires a `&mut u32` allocator
+generalised across worker pools (see E3).
+
+**File:** new crate `crates/grad-codec/`.
+
+**Dependency:** Plan A, Plan B.
+
+**Why sixth:** highest research novelty; lowest near-term ROI
+(federated SGD is a niche stack).
+
+## 6. Exploration paths
+
+Things that warrant a sprint or research session, not a single PR.
+Each has at least one unresolved question that disqualifies it from
+"integration plan" status.
+
+### X1. Carrier-agnostic 4-neighbour topology trait
+
+Design a `trait NeighbourTopology<const N: usize>` that
+`IntraContext` consumes generically. Cognitive: N=4 NEWS. 3DGS: N=4
+(prev/next-Morton, parent/child-LOD). Attention: N=4 (prev/next
+token, prev/next head). SGD: N=4 (prev/next layer, prev/next iter).
+Compile-time-resolved, zero-cost. **Open question:** does mode-coding
+generalise to N=6 (3D XYZ)? Two more `MergeDir` discriminants needed;
+bit-budget impact on the wire format.
+
+### X2. Hierarchical motion estimation as cross-tier prediction
+
+HEVC's hierarchical ME (4-tier coarse-to-fine pyramid) maps onto
+the BlockedGrid L1/L2/L3/L4 cascade. **Open question:** the cost
+function. HEVC uses SAD on luma; cognitive uses Hamming on
+Fingerprints; 3DGS uses PSNR on rendered tiles. Three cost
+functions, one search structure — is the hierarchical-ME logic
+worth the abstraction?
+
+### X3. CABAC vs. rANS for attention KV cache
+
+CABAC's serial dependency caps throughput at ~10⁸ symbols/sec on
+modern CPU. rANS gets ~10⁹. **Open question:** does the latency
+floor matter for attention's real bottleneck (memory bandwidth,
+not entropy decode)? Bench before committing to A7.
+
+### X4. SH coefficient intra-prediction in spectral space
+
+Predict L=2, L=3 SH from a learned linear function of L=0, L=1.
+**Open question:** is the linear function global or per-basin? Per-
+basin is more expensive but probably 2× better; need data to
+decide. Inria's stock 3DGS dataset (Mip-NeRF 360, T&T, Deep
+Blending) is the benchmark.
+
+### X5. Mode-coded LoRA
+
+E8 above. **Open question:** does Skip-heavy `ΔW` retain LoRA's
+fine-tuning quality? Run a controlled experiment on a Qwen3.5-7B
+checkpoint with LoRA rank 8 vs. mode-coded ΔW at the same byte
+budget. Measure on MMLU-redux + a downstream task.
+
+### X6. Unified `mode_decide + reduce` trait (E9)
+
+Generalise `predict_intra` so it's parameterised on the **reduction
+operator**: alpha-composite (3DGS), rANS-encode (codec),
+sum-reduce (SGD all-reduce), softmax (attention). **Open
+question:** does a single trait actually compose, or does each
+domain need its own bespoke variant? Risk: premature abstraction.
+
+### X7. Lance column substrate as the universal palette codebook backing store
+
+`SpoDistanceMatrices` at 611M lookups/sec, 388 KB RAM. If we
+extend it to handle (basin_centroid → idx) lookups for all four
+loads in § 1, we get one column-store serving cognitive cells,
+KV palettes, 3DGS palette, and gradient-pattern banks. **Open
+question:** the centroid distance function differs per load
+(Hamming for fingerprints, L2 for Gaussians, cosine for Q vectors,
+sign-vote for gradients). Does `SpoDistanceMatrices` accept
+pluggable metrics?
+
+### X8. AMX TDPBF16PS for batched EWA sandwich
+
+The `M · Σ · Mᵀ` operation on 16 Gaussians at a time fits AMX's
+16×16 BF16 tile exactly. **Open question:** the precision loss
+from BF16 vs. FP32 on 2D conic invertibility — preliminary lit
+search says fine, but needs Pillar-7-style probe before commit.
+
+## 7. Holy Grail material
+
+If all of § 5 + § 6 land, the following outcomes fall out. None
+are guaranteed; each is the "yes, that worked" branch.
+
+### HG1. **One codec, four loads.**
+
+A unified bytestream format codes cognitive cells, 3DGS scenes,
+KV caches, and gradient streams interchangeably. The Lance column
+substrate stores them all in the same Arrow-backed layout. A
+single `cargo install` ships compression for video-codec-equivalent
++ all four cognitive/ML loads.
+
+Marketing line: *"x265 was a codec for one signal. PR-X12 is a
+codec for the manifold of predictable codebook-coded signals."*
+
+### HG2. **Sub-1-byte-per-Gaussian 3DGS compression.**
+
+Stock 3DGS: ~250 bytes/Gaussian raw, ~50 bytes after PLY-trim.
+PR-X12 mode-coded + A7 rANS: ~3-8 bits/Gaussian for the dominant
+modes. **30-60× over current state of the art.** A 1M-Gaussian
+scene fits in ~500 KB instead of 50 MB. Streamable as a video.
+
+### HG3. **Bit-exact attention with tunable accuracy floor.**
+
+`predict_intra` over (Q, K) palette gives an attention sparsifier
+that is bit-exact at the "Escape always" setting and gradually
+loses precision as Skip/Merge/Delta dominate. The accuracy floor
+is a single knob (`escape_threshold: u8`) — no per-model tuning.
+Streaming-LLM, H2O, SnapKV become consumers of one codec.
+
+### HG4. **Federated SGD at 8-16× compression with zero accuracy loss.**
+
+Worker→aggregator gradient streams via `LeafCu`. Skip-mode kills
+noise; Merge-mode discovers parameter sharing online; Delta-mode
+gives QSGD; Escape-mode preserves outliers. The compression is
+free because the codec already exists.
+
+### HG5. **Lance column-substrate identity becomes the ground truth.**
+
+The same Arrow buffer feeds: cognitive cell storage, 3DGS Gaussian
+SoA, KV cache, gradient shards. The codec encodes the same bytes
+across all four. `lance-graph::SpoDistanceMatrices` becomes the
+universal palette codebook lookup. ndarray = hardware; lance =
+substrate; codec = compression; PR-X12 closes the substrate loop.
+
+### HG6. **The "splat3d × x265" bet pays out as one library.**
+
+The May splat3d sprint (PRs 1-7) gave a CPU-SIMD 3DGS renderer.
+PR-X12 gives the codec. Combined, the same library compresses,
+streams, decodes, and renders 3D scenes in real-time on a single
+core. **The combination is novel; neither half is.**
+
+## 8. Codec-side technical debt
+
+Honest accounting. PR-X12 shipped A2 + A3-intra; what we owe
+ourselves to make the rest of this doc bankable:
+
+### D-CODEC-1. A3-inter is unwritten. (P1)
+
+The `IntraContext` consumes 4 NEWS neighbours; the design doc
+calls for parent-tier + child-tier extension. Without inter
+prediction, the BlockedGrid L2/L3/L4 cascade contributes nothing
+to compression. **Plan B in § 5.**
+
+### D-CODEC-2. rANS A7 is unwritten. (P0 for any real benchmark)
+
+Without entropy coding, the per-mode bit budget is rounded to
+bytes. 2 bits/cell achievable becomes 2 bytes/cell shipped — 8×
+overhead. Plan A.
+
+### D-CODEC-3. λ-RDO A6 is unwritten. (P1)
+
+Mode-decision is greedy (cheapest-fit wire cost). Real codecs
+trade bits for distortion via λ-RDO. Without it, the codec
+cannot be tuned for accuracy/compression trade-off — the lossy
+Escape fallback is the only knob and it's binary.
+
+### D-CODEC-4. Stream framing A8 is unwritten. (P1)
+
+`pack_leaf` writes raw `LeafCu` records back-to-back. No frame
+boundaries, no CTU markers, no error recovery. Live streaming
+needs all three.
+
+### D-CODEC-5. The basin codebook is **not built**. (P1, blocks all loads)
+
+The codec assumes `basin_idx` comes from somewhere. For cognitive
+cells the somewhere is `OgitBridge` (downstream). For 3DGS,
+attention, SGD — the codebook construction is per-load, k-means
+over the carrier, no shared infra yet.
+
+### D-CODEC-6. The lossy Escape fallback is a footgun. (P3)
+
+Owner-review noted the docstring acknowledges the lie. Long-term:
+promote to a config field (E10). Short-term: docstring is fine.
+
+### D-CODEC-7. NEWS topology is hard-coded. (P2)
+
+`merge_dir_from_index` in `predict.rs:281` is a 4-way match. The
+codec is not generic over topology yet. Plan X1 — exploration.
+
+### D-CODEC-8. No SIMD-batched CTU sweep. (P2)
+
+`predict_intra` is scalar; per-CTU at 64×64 = 4096 cells, the
+SIMD opportunity is obvious (16 cells per `F32x16` vector). Deferred
+until reference + reconstruction parity test land.
+
+### D-CODEC-9. No `Result`-shaped error variant. (P3)
+
+`pack_leaf` returns `Option<usize>`. Real errors (buffer too
+short, mode-decision inconsistency) lose semantics. Promote to
+a typed `enum CodecError`.
+
+### D-CODEC-10. The mode 2-bit encoding pins us to ≤4 modes. (P3, architectural)
+
+`pack_header` puts 2 bits at bits 12-13 of u16, leaving 2 reserved
+high bits. Future "mode 5" (e.g., a 16-bit Delta variant for
+splat) needs to claim bit 14. **Plan the upgrade path in the
+design doc before shipping A7.**
+
+## 9. Stack-side technical debt when combining synergies
+
+The harder accounting. PR-X12 fits cleanly into ndarray. But when
+we wire the synergies of §§ 4-7, the **existing stack** has debts
+that get worse, not better, under multi-load pressure. Honest
+catalogue:
+
+### D-STACK-1. `BlockedGrid` block size is fixed at 64×64. (P1 if 3DGS lands)
+
+3DGS tiles in the splat3d crate are 16×16. The codec assumes 64×64
+CTUs. The pre-sprint prompt for `pr-x12` aligns them at L1 = 64×64
+of cognitive cells. **For 3DGS coefficient compression**, the
+natural CTU is one tile = 16×16. Mismatch: either generalise
+`Ctu` over block size (preferred, low cost) or maintain two block
+formats (technical debt). Decide before Plan E (3DGS codec).
+
+### D-STACK-2. The basin codebook lookup has no SIMD path. (P1)
+
+`SpoDistanceMatrices` at 611M lookups/sec is sequential; the codec
+needs **batched** lookup (1 CTU = 4096 cells × 4096 basins = 16M
+distance computes). Without SIMD, the encoder is lookup-bound at
+~10⁵ CTU/sec. With AVX-512 + AMX, 10⁷ CTU/sec achievable. **Bench
+before A6 RDO.**
+
+### D-STACK-3. `MergeDir`'s 4-way alphabet is wire-pinned. (P1 for X1)
+
+The `cell_mode_discriminants_match_wire_codes` test pins MergeDir to
+`{N=0, E=1, W=2, S=3}` on the wire. If X1 generalises topology to
+6 or 8 neighbours, the wire format breaks. Plan the upgrade with a
+version byte in A8 stream framing.
+
+### D-STACK-4. `Fingerprint` is 64-bit only. (P2 for 3DGS)
+
+3DGS basin residual is 96 bits (6 × FP16). Either widen
+`Fingerprint` (touches truth/cascade/bf16_truth modules) or
+introduce a sibling type for splat (better — keep cognitive
+cells fingerprint-typed). The codec is type-generic enough to
+not care, but consumers will.
+
+### D-STACK-5. The `splat3d` PRs do not consume `codec`. (P2)
+
+Currently independent. Combining E9's "mode-decide + reduce" trait
+requires either (a) a shared trait crate or (b) a refactor of
+both. Decide before committing to Plan E.
+
+### D-STACK-6. Lance column substrate exists in `lance-graph`, not `ndarray`. (P1 for HG5)
+
+The HG5 "Lance is the substrate" outcome requires `ndarray::hpc::codec`
+to depend on `lance-graph::SpoDistanceMatrices`. Currently ndarray
+is the **dependency-bottom** of the stack. Two options: invert
+(ndarray depends on lance — wrong, breaks the layering rule from
+CLAUDE.md "Architecture Rule"), or introduce a third crate that
+both depend on. Probably the latter; needs a sprint.
+
+### D-STACK-7. The cognitive `splat.rs` in `lance-graph-contract` is sacred. (P0, do not touch)
+
+Per the sprint setup: that file is the contract. PR-X12 must never
+import or refactor it. If E4 (Fingerprint ≡ 3DGS first-6-floats)
+becomes provable, it'll be **tempting** to fold them. Don't. The
+abstraction boundary is load-bearing for the cognitive
+architecture, even if the bit patterns rhyme.
+
+### D-STACK-8. No backend dispatch in the codec. (P2)
+
+`pack_leaf` is one implementation. EWA, BLAS, MKL all have backend
+dispatch (`native` / `intel-mkl` / `openblas` features). The codec
+will need: scalar / SIMD / AMX backends for the SIMD-batched CTU
+sweep (D-CODEC-8). Plan when D-CODEC-8 lands.
+
+### D-STACK-9. The 4096-basin codebook size assumes "per-frame, reset between frames". (P3)
+
+For attention's KV cache, the "frame" is the (context-window,
+batch-element) tuple. For 3DGS, the "frame" is the entire trained
+scene (codebook is static after training). For SGD, the "frame"
+is one mini-batch. **Three different lifetimes, one type.** Either
+generalise lifetime (preferred) or document the discipline (likely).
+
+### D-STACK-10. The current PR-arc cadence is one PR per worker per day. (P2, organisational)
+
+The synergies in §§ 5-7 will require multi-worker coordinated
+sprints (e.g., Plan D = 2 workers × 2 weeks). The autoattended
+multi-agent protocol scales worker count, but the coordinator's
+state machine doesn't currently model multi-week dependencies.
+**Update the coordinator agent prompt before kicking off Plan D.**
+
+### D-STACK-11. AVX-512 is mandatory in `.cargo/config.toml`. (P1 for portability)
+
+CLAUDE.md: `target-cpu=x86-64-v4`. Plan F (federated SGD) implies
+multi-architecture (NEON workers, AVX2 workers). Either drop the
+mandatory AVX-512 or scope federated SGD to AVX-512 nodes only.
+
+### D-STACK-12. The cognitive `Base17` / `NarsTruth` / `TripleModel` types live in `lance-graph`. (P1 for HG3)
+
+HG3 (attention codec) wants to consume cognitive truth values
+(NarsTruth) to gate the Escape-or-Skip decision. Same dependency
+inversion as D-STACK-6.
+
+### D-STACK-13. No multi-domain benchmark harness. (P0 if we want to claim HG1)
+
+We have splat3d bench, codec tests, SpoDistanceMatrices bench
+separately. A combined "single-codec-four-loads" benchmark — one
+build, one binary, four scenarios — does not exist. Without it,
+HG1 is a claim, not a demonstration. **Build the harness before
+the marketing.**
+
+## 10. Sequencing summary
+
+If we commit to all of this, the order matters:
+
+```
+                                       Plan A (rANS A7)
+                                          │
+                                          ▼
+              ┌─────────────────┬────────────────────┬─────────────────┐
+              ▼                 ▼                    ▼                 ▼
+        Plan B (A3-inter)  Plan C (EWA SYRK)   Plan X8 (AMX BF16)   D-STACK-2 (SIMD lookup)
+              │                                                       │
+              ▼                                                       ▼
+         Plan E (3DGS codec) ◄──────────────────────────────── D-STACK-1 (block size)
+              │
+              ▼
+         Plan D (attention codec) ◄──────────────────────────── D-STACK-6/12 (third crate)
+              │
+              ▼
+         Plan F (gradient codec)
+              │
+              ▼
+         HG1-HG6 unlocked
+```
+
+Critical path: **A7 rANS** → everything else. Without it, no
+benchmark is convincing. Plan A is one worker for one week. Ship
+that next; the rest of this doc is just inventory until A7 lands.
+
+## 11. Compaction-preservation note
+
+Per CLAUDE.md § Compaction Preservation, this doc must survive
+summarisation. The blackboard entry should reference this file by
+path; do not inline the matrix. Key facts to retain across
+compaction:
+
+1. PR-X12 A2 + A3-intra shipped in PR-195 (master commits b39a5769,
+   b44fe59f). All review comments resolved or outdated.
+2. The four-load isomorphism (§ 2) is the architectural claim;
+   everything else is sequencing.
+3. The critical path is A7 rANS — without it, the codec is academic.
+4. The Lance column substrate identity (HG5) is the convergence
+   highway; both ndarray and lance-graph land there.
+5. The sacred file is `lance-graph-contract/src/splat.rs`. Never
+   touch even if the bit patterns rhyme (D-STACK-7).

From 242329853e61d08d0f2cc8bd3fb0259c2a9f2ac9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 07:35:56 +0000
Subject: [PATCH 5/5] =?UTF-8?q?fix(codec):=20address=20PR-195=20follow-up?=
 =?UTF-8?q?=20review=20=E2=80=94=20bijective=20pack=5Fleaf=20+=20BASIN=5FN?=
 =?UTF-8?q?ONE=20distinctness?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two real bugs flagged by CodeRabbit's outside-diff review on b44fe59f.

P1 — `pack_leaf` was non-bijective via unwrap_or fallbacks (mode.rs:194):
  Hand-constructing a `LeafCu { mode: Merge, merge_dir: None, .. }`
  serialized as `LeafCu::merge(.., MergeDir::North)` — silently rewriting
  malformed input into a different valid leaf and hiding upstream invariant
  breaks. Now uses `?` to short-circuit with None when required fields are
  missing for the mode (Merge needs merge_dir; Delta needs delta; Escape
  needs escape_idx). The `LeafCu::merge/delta/escape` constructors still
  enforce the invariants — only struct-literal bypass callers can hit the
  rejection.
  + 3 regression tests: leaf_pack_rejects_malformed_merge_without_dir,
    leaf_pack_rejects_malformed_delta_without_value,
    leaf_pack_rejects_malformed_escape_without_idx
  + pack_leaf docstring documents the bijective contract.

P1 — `BASIN_NONE` collided with `MAX_BASIN_IDX` at 4095 (mode.rs:69):
  Both equaled `(1 << 12) - 1`, so basin 4095 was ambiguous — a real
  basin and the "no basin" sentinel had the same encoded value.
  Now: MAX_BASIN_IDX = 4094 (highest real basin), BASIN_NONE = 4095
  (sentinel one slot above). Introduced private BASIN_FIELD_MASK = 0x0FFF
  for header packing/unpacking — independent of MAX_BASIN_IDX so BASIN_NONE
  still round-trips through the 12-bit field as a sentinel marker.
  pack_header/unpack_header switched from `& MAX_BASIN_IDX` (would wrongly
  clear bit 0 with 0xFFE) to `& BASIN_FIELD_MASK`.
  + 2 regression tests: basin_none_distinct_from_max_basin_idx,
    header_round_trips_max_basin_idx_and_basin_none_distinctly
  + MAX_BASIN_IDX + BASIN_NONE doctests updated to assert the new
    relationship.

Gates:
  cargo test --features codec --lib hpc::codec → 55 passed (+5)
  cargo test --features codec --doc hpc::codec → 15 passed
  cargo fmt --all -- --check → clean
  cargo clippy --features codec --lib -- -D warnings → clean
---
 src/hpc/codec/mode.rs | 132 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 112 insertions(+), 20 deletions(-)

diff --git a/src/hpc/codec/mode.rs b/src/hpc/codec/mode.rs
index f748b7bd..4d756812 100644
--- a/src/hpc/codec/mode.rs
+++ b/src/hpc/codec/mode.rs
@@ -59,23 +59,41 @@ use super::ctu::{CellMode, LeafCu, MergeDir};
 // Header pack / unpack (16-bit)
 // ════════════════════════════════════════════════════════════════════
 
-/// Maximum encodable `basin_idx`. Stored in the lower 12 bits of the
-/// header; values >= this constant overflow the header field.
+/// Maximum encodable real `basin_idx`. Equal to `(1 << 12) - 2 = 4094`
+/// so that the all-ones 12-bit pattern (`0xFFF = 4095`) is reserved as
+/// the [`BASIN_NONE`] sentinel — without that reservation, basin 4095
+/// would round-trip ambiguously with "no basin assigned".
+///
+/// The on-wire 12-bit field still holds any value `0..=0xFFF`; only the
+/// encoder's *valid-basin* range is restricted to `0..=MAX_BASIN_IDX`.
+/// [`BASIN_NONE`] is encodable in the header field too (when an encoder
+/// emits a "no basin" record), but it must never appear as a real basin
+/// codebook index.
 ///
 /// ```
-/// use ndarray::hpc::codec::MAX_BASIN_IDX;
-/// assert_eq!(MAX_BASIN_IDX, (1 << 12) - 1);
+/// use ndarray::hpc::codec::{BASIN_NONE, MAX_BASIN_IDX};
+/// assert_eq!(MAX_BASIN_IDX, (1 << 12) - 2);
+/// assert_eq!(MAX_BASIN_IDX, 4094);
+/// assert!(MAX_BASIN_IDX < BASIN_NONE);
 /// ```
-pub const MAX_BASIN_IDX: u16 = (1 << 12) - 1; // 4095
+pub const MAX_BASIN_IDX: u16 = (1 << 12) - 2; // 4094
 
 /// Tag inside the per-frame basin codebook for "no basin assigned"
-/// (encoder-side sentinel during mode decision).
+/// (encoder-side sentinel during mode decision). Equal to `0xFFF`
+/// (the all-ones 12-bit pattern) so it sits one slot above the highest
+/// real basin index ([`MAX_BASIN_IDX`]).
 ///
 /// ```
 /// use ndarray::hpc::codec::{BASIN_NONE, MAX_BASIN_IDX};
-/// assert_eq!(BASIN_NONE, MAX_BASIN_IDX);
+/// assert_eq!(BASIN_NONE, 4095);
+/// assert_eq!(BASIN_NONE, MAX_BASIN_IDX + 1);
 /// ```
-pub const BASIN_NONE: u16 = MAX_BASIN_IDX;
+pub const BASIN_NONE: u16 = (1 << 12) - 1;
+
+/// Private: 12-bit mask for the basin field of the packed header.
+/// Independent of [`MAX_BASIN_IDX`] so that [`BASIN_NONE`] (which sits
+/// in the 12-bit field but is not a real basin) still round-trips.
+const BASIN_FIELD_MASK: u16 = 0x0FFF;
 
 /// Pack `(mode, basin_idx)` into a 16-bit header.
 ///
@@ -91,7 +109,7 @@ pub const BASIN_NONE: u16 = MAX_BASIN_IDX;
 #[inline]
 pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 {
     let mode_bits = (mode as u16) & 0b11;
-    let basin_bits = basin_idx & MAX_BASIN_IDX;
+    let basin_bits = basin_idx & BASIN_FIELD_MASK;
     (mode_bits << 12) | basin_bits
 }
 
@@ -108,7 +126,7 @@ pub fn pack_header(mode: CellMode, basin_idx: u16) -> u16 {
 #[inline]
 pub fn unpack_header(packed: u16) -> (CellMode, u16) {
     let mode_bits = ((packed >> 12) & 0b11) as u8;
-    let basin_idx = packed & MAX_BASIN_IDX;
+    let basin_idx = packed & BASIN_FIELD_MASK;
     let mode = match mode_bits {
         0b00 => CellMode::Skip,
         0b01 => CellMode::Merge,
@@ -165,9 +183,15 @@ pub fn unpack_merge_dir(byte: u8) -> MergeDir {
 /// worst case) — callers iterating CTUs typically pre-allocate
 /// `6 * cell_count` and trim afterwards.
 ///
-/// Returns `None` if `out.len() < packed_byte_len(leaf.mode)` (insufficient
-/// capacity for the *mode's* width — Skip needs 2, Merge/Delta need 3,
-/// Escape needs 6).
+/// Returns `None` in two cases:
+/// - `out.len() < packed_byte_len(leaf.mode)` (insufficient capacity for
+///   the *mode's* width — Skip needs 2, Merge/Delta need 3, Escape needs 6).
+/// - `leaf` is structurally malformed for its mode: `Merge` without a
+///   `merge_dir`, `Delta` without a `delta`, or `Escape` without an
+///   `escape_idx`. The `LeafCu::merge` / `delta` / `escape` constructors
+///   enforce these invariants; only struct-literal callers bypassing the
+///   constructors can hit this case. Pack is therefore bijective on the
+///   well-formed `LeafCu` subset whose `basin_idx` fits in 12 bits.
 ///
 /// Format:
 /// - Bytes 0-1: header (`pack_header(mode, basin_idx)`, LE)
@@ -191,22 +215,24 @@ pub fn pack_leaf(leaf: &LeafCu, out: &mut [u8]) -> Option<usize> {
     }
     let header = pack_header(leaf.mode, leaf.basin_idx);
     out[..2].copy_from_slice(&header.to_le_bytes());
+    // Per-mode tail. `?` rejects malformed `LeafCu`s (e.g. a hand-built
+    // `LeafCu { mode: Merge, merge_dir: None, .. }`) with `None` rather
+    // than silently rewriting them into a different valid leaf. The
+    // `LeafCu::merge/delta/escape` constructors enforce the invariants;
+    // only struct-literal callers bypassing those constructors hit
+    // these short-circuits.
     let tail_len = match leaf.mode {
         CellMode::Skip => 0,
         CellMode::Merge => {
-            // Caller guarantees `merge_dir.is_some()` for `Merge` mode
-            // (LeafCu::merge constructor enforces this). Fall back to
-            // North if the invariant is violated, to keep encoder
-            // robustness — the decoder will still produce a valid leaf.
-            out[2] = pack_merge_dir(leaf.merge_dir.unwrap_or(MergeDir::North));
+            out[2] = pack_merge_dir(leaf.merge_dir?);
             1
         }
         CellMode::Delta => {
-            out[2] = leaf.delta.unwrap_or(0);
+            out[2] = leaf.delta?;
             1
         }
         CellMode::Escape => {
-            let idx = leaf.escape_idx.unwrap_or(0);
+            let idx = leaf.escape_idx?;
             out[2..6].copy_from_slice(&idx.to_le_bytes());
             4
         }
@@ -372,6 +398,72 @@ mod tests {
         assert!(pack_leaf(&leaf, &mut buf).is_none());
     }
 
+    #[test]
+    fn leaf_pack_rejects_malformed_merge_without_dir() {
+        // Bypass `LeafCu::merge` constructor and hand-build a leaf with
+        // mode = Merge but merge_dir = None. The previous unwrap_or(North)
+        // behavior would silently coerce this into a valid leaf — now we
+        // reject with None instead.
+        let malformed = LeafCu {
+            mode: CellMode::Merge,
+            basin_idx: 10,
+            delta: None,
+            merge_dir: None,
+            escape_idx: None,
+        };
+        let mut buf = [0u8; 6];
+        assert!(pack_leaf(&malformed, &mut buf).is_none());
+    }
+
+    #[test]
+    fn leaf_pack_rejects_malformed_delta_without_value() {
+        let malformed = LeafCu {
+            mode: CellMode::Delta,
+            basin_idx: 10,
+            delta: None,
+            merge_dir: None,
+            escape_idx: None,
+        };
+        let mut buf = [0u8; 6];
+        assert!(pack_leaf(&malformed, &mut buf).is_none());
+    }
+
+    #[test]
+    fn leaf_pack_rejects_malformed_escape_without_idx() {
+        let malformed = LeafCu {
+            mode: CellMode::Escape,
+            basin_idx: 10,
+            delta: None,
+            merge_dir: None,
+            escape_idx: None,
+        };
+        let mut buf = [0u8; 6];
+        assert!(pack_leaf(&malformed, &mut buf).is_none());
+    }
+
+    #[test]
+    fn basin_none_distinct_from_max_basin_idx() {
+        // Regression for the BASIN_NONE/MAX_BASIN_IDX collision: the
+        // sentinel must sit one slot above the highest real basin so
+        // basin 4094 is unambiguously "a real basin" and 4095 is
+        // unambiguously "no basin assigned".
+        assert_eq!(MAX_BASIN_IDX, 4094);
+        assert_eq!(BASIN_NONE, 4095);
+        assert!(MAX_BASIN_IDX < BASIN_NONE);
+    }
+
+    #[test]
+    fn header_round_trips_max_basin_idx_and_basin_none_distinctly() {
+        // Both values fit in the 12-bit field; the encoder treats them
+        // as different. (Decoders that route on BASIN_NONE need to
+        // compare against the sentinel explicitly.)
+        let real = pack_header(CellMode::Skip, MAX_BASIN_IDX);
+        let none = pack_header(CellMode::Skip, BASIN_NONE);
+        assert_ne!(real, none);
+        assert_eq!(unpack_header(real), (CellMode::Skip, MAX_BASIN_IDX));
+        assert_eq!(unpack_header(none), (CellMode::Skip, BASIN_NONE));
+    }
+
     #[test]
     fn leaf_unpack_rejects_short_buffer() {
         // Header says Escape but only 2 bytes follow → not enough.