From 5b4b120fb882eb56e829e3be8520291b254fd1dd Mon Sep 17 00:00:00 2001 From: Dmitrii Vasilev Date: Thu, 14 May 2026 14:38:57 +0000 Subject: [PATCH] feat(jepa_t_ingest): add Wave-14a L-S50 ternary ingest crate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #807 Add `crates/jepa_t_ingest/` — a Rust-only (R1 CROWN) crate that streams plaintext corpora into ternary-quantized triplet sequences for JEPA-T training on Trinity silicon. ## Quantizer — Wave-9b RTL byte-for-byte match pub fn quantize_phi_prior(fp_q15: i16) -> i8 Threshold: φ⁻² in Q1.15 = 12533 (0x30F4) if fp_q15 >= +12533 → +1 if fp_q15 <= -12533 → -1 else → 0 ## Deliverables - Cargo.toml (edition 2021, Apache-2.0) - src/lib.rs (quantize_phi_prior, IngestConfig, Triplet, ingest_text) - src/bin/jepa_t_ingest.rs (CLI: --input corpus.txt --output triplets.bin) - tests/quantize.rs (boundary: ±12532→0, ±12533→±1, 0, ±0x7FFF; exhaustive 65536-input scan) - tests/ingest.rs (golden corpus byte-compare, 3 triplets, 192 bytes each) - README.md ## Test results cargo test -p jepa_t_ingest: 32 tests, 0 failures cargo build --release --bin jepa_t_ingest: success Signed-off-by: Dmitrii Vasilev --- Cargo.lock | 7 + Cargo.toml | 2 + crates/jepa_t_ingest/Cargo.toml | 20 ++ crates/jepa_t_ingest/README.md | 138 +++++++++ crates/jepa_t_ingest/src/bin/jepa_t_ingest.rs | 133 +++++++++ crates/jepa_t_ingest/src/lib.rs | 278 ++++++++++++++++++ crates/jepa_t_ingest/tests/ingest.rs | 164 +++++++++++ crates/jepa_t_ingest/tests/quantize.rs | 123 ++++++++ 8 files changed, 865 insertions(+) create mode 100644 crates/jepa_t_ingest/Cargo.toml create mode 100644 crates/jepa_t_ingest/README.md create mode 100644 crates/jepa_t_ingest/src/bin/jepa_t_ingest.rs create mode 100644 crates/jepa_t_ingest/src/lib.rs create mode 100644 crates/jepa_t_ingest/tests/ingest.rs create mode 100644 crates/jepa_t_ingest/tests/quantize.rs diff --git a/Cargo.lock b/Cargo.lock index 5fd3c87f35..41f809ebc3 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -3138,6 +3138,13 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jepa_t_ingest" +version = "0.1.0" +dependencies = [ + "clap", +] + [[package]] name = "jobserver" version = "0.1.34" diff --git a/Cargo.toml b/Cargo.toml index 8e5a8a9473..2a3b1a0ee9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,6 +71,8 @@ members = [ "vendor/tri-mcp/rings/SR-02", # CPU N-gram training (IGLA RACE Gate-2) "crates/trios-train-cpu", + # JEPA-T ternary ingest pipeline (Wave-14a L-S50) + "crates/jepa_t_ingest", # Trinity dePIN Mesh (Ch.35 PhD — L-DPC2/L-DPC3) "crates/trios-mesh", "crates/trios-mesh-node", diff --git a/crates/jepa_t_ingest/Cargo.toml b/crates/jepa_t_ingest/Cargo.toml new file mode 100644 index 0000000000..4f6c85f963 --- /dev/null +++ b/crates/jepa_t_ingest/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "jepa_t_ingest" +version = "0.1.0" +edition = "2021" +authors = ["Dmitrii Vasilev "] +license = "Apache-2.0" +description = "Plaintext → ternary triplet streaming pipeline for JEPA-T training on Trinity silicon (Wave-14a L-S50)" +repository = "https://github.com/gHashTag/trios" +readme = "README.md" +keywords = ["ternary", "jepa", "trinity", "quantization", "nlp"] +categories = ["science", "encoding"] + +[[bin]] +name = "jepa_t_ingest" +path = "src/bin/jepa_t_ingest.rs" + +[dependencies] +clap = { version = "4", features = ["derive"] } + +[dev-dependencies] diff --git a/crates/jepa_t_ingest/README.md b/crates/jepa_t_ingest/README.md new file mode 100644 index 0000000000..e85e93ce38 --- /dev/null +++ b/crates/jepa_t_ingest/README.md @@ -0,0 +1,138 @@ +# jepa_t_ingest + +**Wave-14a L-S50** — Plaintext → ternary-quantized triplet streaming pipeline for JEPA-T training on Trinity silicon. 
+ +[![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](LICENSE) +[![Rust](https://img.shields.io/badge/rust-2021--edition-orange.svg)](https://www.rust-lang.org/) + +## Overview + +`jepa_t_ingest` converts raw UTF-8 text corpora into binary streams of ternary triplets +(`anchor`, `positive`, `negative`) suitable for Joint Embedding Predictive Architecture +(JEPA-T) contrastive pretraining on Trinity ternary silicon. + +### Ternary Anchor + +- **Alphabet**: {−1, 0, +1} +- **Threshold**: φ⁻² in Q1.15 fixed-point = **12533** (0x30F4) +- **Identity**: φ² + φ⁻² = 3 +- **DOI**: [10.5281/zenodo.19227877](https://doi.org/10.5281/zenodo.19227877) + +### Quantizer — Wave-9b RTL Byte-for-Byte Match + +The core `quantize_phi_prior` function matches `phi_prior_quantizer.v` from Wave-9b exactly: + +``` +if fp_q15 >= +12533 → +1 +if fp_q15 <= −12533 → −1 +else → 0 +``` + +## API + +### `quantize_phi_prior(fp_q15: i16) -> i8` + +Ternary quantizer with Wave-9b RTL parity. + +```rust +use jepa_t_ingest::quantize_phi_prior; + +assert_eq!(quantize_phi_prior(12533), 1); // at positive threshold +assert_eq!(quantize_phi_prior(-12533), -1); // at negative threshold +assert_eq!(quantize_phi_prior(12532), 0); // below threshold +assert_eq!(quantize_phi_prior(-12532), 0); // above -threshold +assert_eq!(quantize_phi_prior(0), 0); // zero +``` + +### `ingest_text(input: &str, cfg: &IngestConfig) -> Vec` + +Streams a plaintext string into a sequence of ternary triplets. + +```rust +use jepa_t_ingest::{ingest_text, IngestConfig}; + +let cfg = IngestConfig { window_size: 64, stride: 32 }; +let triplets = ingest_text("your corpus text here ...", &cfg); +println!("{} triplets produced", triplets.len()); +``` + +### `Triplet` + +```rust +pub struct Triplet { + pub anchor: [i8; 64], // anchor context window + pub positive: [i8; 64], // adjacent / overlapping window + pub negative: [i8; 64], // non-overlapping window (hard negative) +} +``` + +Each element is in {−1, 0, +1}. 
Serialise to binary with `triplet.to_bytes()` (192 bytes). + +### `IngestConfig` + +```rust +pub struct IngestConfig { + pub window_size: usize, // tokens per window (max 64) + pub stride: usize, // step between anchor windows +} +``` + +## CLI Binary + +``` +jepa_t_ingest --input corpus.txt --output triplets.bin [--window-size 64] [--stride 32] +``` + +### Output Format + +Raw binary stream of packed 192-byte triplet records: + +| Bytes | Content | +|-------|---------| +| 0–63 | anchor (64 × i8) | +| 64–127 | positive (64 × i8) | +| 128–191 | negative (64 × i8) | + +## Tests + +```bash +# Run all tests (quantizer boundary + ingest golden integration) +cargo test -p jepa_t_ingest + +# Build release binary +cargo build --release --bin jepa_t_ingest +``` + +### Quantizer Boundary Tests (`tests/quantize.rs`) + +| Input | Expected | Notes | +|-------|----------|-------| +| +12532 | 0 | one below threshold | +| +12533 | +1 | at threshold (φ⁻²) | +| −12532 | 0 | one above −threshold | +| −12533 | −1 | at −threshold | +| 0 | 0 | zero | +| +0x7FFF | +1 | i16::MAX | +| −0x8000 | −1 | i16::MIN | + +The exhaustive test `output_always_ternary_for_all_i16` checks all 65536 possible i16 inputs. + +### Integration Test (`tests/ingest.rs`) + +Uses a fixed 13-token golden corpus: + +``` +"the quick brown fox jumps over the lazy dog and a ternary world" +``` + +With `window_size=4, stride=2` this produces **3 triplets** (5 windows). +Token hashes and ternary values are byte-compared against pre-computed golden values. + +## R1 CROWN Compliance + +This crate is **Rust ONLY** — no Python, no shell scripts, no foreign-language source files. +The quantizer is a single `#[inline]` function with no dependencies beyond `core`. 
+ +## License + +Apache-2.0 — Copyright 2024 Dmitrii Vasilev <admin@t27.ai> diff --git a/crates/jepa_t_ingest/src/bin/jepa_t_ingest.rs b/crates/jepa_t_ingest/src/bin/jepa_t_ingest.rs new file mode 100644 index 0000000000..f6b655edc3 --- /dev/null +++ b/crates/jepa_t_ingest/src/bin/jepa_t_ingest.rs @@ -0,0 +1,133 @@ +//! # jepa_t_ingest — CLI binary +//! +//! Streams a plaintext corpus file into a binary file of ternary triplets +//! for JEPA-T training on Trinity silicon. +//! +//! ## Usage +//! +//! ```text +//! jepa_t_ingest --input corpus.txt --output triplets.bin [--window-size 64] [--stride 32] +//! ``` +//! +//! ## Output format +//! +//! Sequence of packed triplets, each 192 bytes: +//! - bytes 0..63 : anchor (i8 values in {-1, 0, +1}) +//! - bytes 64..127 : positive +//! - bytes 128..191 : negative +//! +//! ## License +//! +//! Apache-2.0 — Author: Dmitrii Vasilev + +use std::{ + fs, + io::{self, Write}, + path::PathBuf, + process, +}; + +use clap::Parser; +use jepa_t_ingest::{ingest_text, IngestConfig}; + +/// JEPA-T Ternary Ingest Pipeline (Wave-14a L-S50) +/// +/// Converts a plaintext corpus into binary ternary triplets for JEPA-T training. +/// Output is a raw binary stream of packed 192-byte triplet records. 
+#[derive(Parser, Debug)] +#[command( + name = "jepa_t_ingest", + version = env!("CARGO_PKG_VERSION"), + author = "Dmitrii Vasilev ", + about = "Plaintext → ternary triplet pipeline for JEPA-T training on Trinity silicon" +)] +struct Args { + /// Input plaintext corpus file (UTF-8) + #[arg(short, long, value_name = "FILE")] + input: PathBuf, + + /// Output binary file for ternary triplets (192 bytes each) + #[arg(short, long, value_name = "FILE")] + output: PathBuf, + + /// Context window size in tokens (max 64) + #[arg(long, default_value_t = 64, value_name = "N")] + window_size: usize, + + /// Stride between successive windows in tokens + #[arg(long, default_value_t = 32, value_name = "N")] + stride: usize, +} + +fn main() { + let args = Args::parse(); + + let cfg = IngestConfig { + window_size: args.window_size.min(64).max(1), + stride: args.stride.max(1), + }; + + // Read corpus. + let corpus = match fs::read_to_string(&args.input) { + Ok(s) => s, + Err(e) => { + eprintln!( + "jepa_t_ingest: cannot read '{}': {}", + args.input.display(), + e + ); + process::exit(1); + } + }; + + eprintln!( + "jepa_t_ingest: read {} bytes from '{}'", + corpus.len(), + args.input.display() + ); + + // Ingest into triplets. + let triplets = ingest_text(&corpus, &cfg); + + eprintln!("jepa_t_ingest: produced {} triplets", triplets.len()); + + if triplets.is_empty() { + eprintln!("jepa_t_ingest: warning — zero triplets produced (corpus too short?)"); + } + + // Write binary output. 
+ let out_file = match fs::File::create(&args.output) { + Ok(f) => f, + Err(e) => { + eprintln!( + "jepa_t_ingest: cannot create '{}': {}", + args.output.display(), + e + ); + process::exit(1); + } + }; + let mut writer = io::BufWriter::new(out_file); + + let mut bytes_written = 0usize; + for triplet in &triplets { + let bytes = triplet.to_bytes(); + match writer.write_all(&bytes) { + Ok(()) => bytes_written += bytes.len(), + Err(e) => { + eprintln!("jepa_t_ingest: write error: {}", e); + process::exit(1); + } + } + } + + eprintln!( + "jepa_t_ingest: wrote {} bytes to '{}'", + bytes_written, + args.output.display() + ); + eprintln!( + "jepa_t_ingest: done (window_size={}, stride={})", + cfg.window_size, cfg.stride + ); +} diff --git a/crates/jepa_t_ingest/src/lib.rs b/crates/jepa_t_ingest/src/lib.rs new file mode 100644 index 0000000000..df0f14b449 --- /dev/null +++ b/crates/jepa_t_ingest/src/lib.rs @@ -0,0 +1,278 @@ +//! # jepa_t_ingest +//! +//! Plaintext → ternary-quantized triplet pipeline for JEPA-T training on Trinity silicon. +//! +//! ## Ternary Anchor +//! +//! - Ternary alphabet: {−1, 0, +1} +//! - φ⁻² (Q1.15) = 12533 (0x30F4) — quantization threshold +//! - φ² + φ⁻² = 3 +//! - DOI: 10.5281/zenodo.19227877 +//! +//! ## Quantizer: Wave-9b RTL Byte-for-Byte Match +//! +//! Matches `phi_prior_quantizer.v` from Wave-9b exactly: +//! - if fp >= +12533 → +1 +//! - if fp <= −12533 → −1 +//! - else → 0 +//! +//! ## License +//! +//! Apache-2.0 + +/// Ternary quantizer — matches Wave-9b `phi_prior_quantizer.v` byte-for-byte. +/// +/// Threshold is φ⁻² in Q1.15 fixed-point = 12533 (0x30F4). 
/// Ternary quantizer — matches Wave-9b `phi_prior_quantizer.v` byte-for-byte.
///
/// Threshold is φ⁻² in Q1.15 fixed-point = 12533 (0x30F4).
///
/// # Arguments
///
/// * `fp_q15` — signed 16-bit Q1.15 fixed-point input
///
/// # Returns
///
/// * `+1` if `fp_q15 >= 12533`
/// * `-1` if `fp_q15 <= -12533`
/// * `0` otherwise
///
/// # Examples
///
/// ```
/// use jepa_t_ingest::quantize_phi_prior;
///
/// assert_eq!(quantize_phi_prior(12533), 1);
/// assert_eq!(quantize_phi_prior(-12533), -1);
/// assert_eq!(quantize_phi_prior(12532), 0);
/// assert_eq!(quantize_phi_prior(-12532), 0);
/// assert_eq!(quantize_phi_prior(0), 0);
/// ```
#[inline]
pub fn quantize_phi_prior(fp_q15: i16) -> i8 {
    const THRESHOLD: i16 = 12533; // φ⁻² in Q1.15 = 0x30F4
    if fp_q15 >= THRESHOLD {
        1
    } else if fp_q15 <= -THRESHOLD {
        -1
    } else {
        0
    }
}

/// Configuration for the plaintext ingest pipeline.
///
/// Controls how text is windowed into anchor/positive/negative triplets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IngestConfig {
    /// Number of tokens per context window.
    ///
    /// [`ingest_text`] clamps this to `1..=64`; values outside that range are not an error.
    pub window_size: usize,
    /// Stride (in tokens) between the starts of successive windows.
    ///
    /// [`ingest_text`] treats `0` as `1`.
    pub stride: usize,
}

impl Default for IngestConfig {
    fn default() -> Self {
        Self {
            window_size: 64,
            stride: 32,
        }
    }
}

/// A ternary triplet for JEPA-T contrastive training.
///
/// Each field is a 64-element ternary vector with values in {-1, 0, +1}.
/// When the configured window is shorter than 64 tokens the tail of each vector is
/// zero-padded, so a padding `0` is indistinguishable from a token that quantized to `0`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Triplet {
    /// Anchor context window (quantized token hashes).
    pub anchor: [i8; 64],
    /// Positive window — the window immediately following the anchor (one stride later).
    pub positive: [i8; 64],
    /// Negative window — the last window of the corpus for anchors in the first half of
    /// the window list, the first window otherwise. It is always a different window from
    /// anchor and positive, but may share tokens with them when there are few windows
    /// and `stride < window_size`.
    pub negative: [i8; 64],
}

impl Triplet {
    /// Serialize this triplet to raw bytes: anchor, positive, negative — 192 bytes total.
    ///
    /// Each `i8` is reinterpreted as its two's-complement `u8`, so `-1` is written as
    /// `0xFF`, `0` as `0x00` and `+1` as `0x01`.
    pub fn to_bytes(&self) -> Vec<u8> {
        self.anchor
            .iter()
            .chain(&self.positive)
            .chain(&self.negative)
            .map(|&v| v as u8) // intentional bit-for-bit reinterpret, not a numeric clamp
            .collect()
    }
}

// ────────────────────────────────────────────────────────────────────────────
// Internal helpers
// ────────────────────────────────────────────────────────────────────────────

/// Hash a token (byte slice) into a signed 16-bit value interpreted as Q1.15.
///
/// Uses a djb2-style accumulation (`h = h * 33 + byte`, seed 5381, wrapping in `u32`),
/// then XOR-folds the upper and lower 16 bits together. The result is deterministic for
/// a given byte sequence. Quantization is done by the caller, not here.
fn token_to_q15(token: &[u8]) -> i16 {
    let mut h: u32 = 5381;
    for &b in token {
        h = h.wrapping_mul(33).wrapping_add(b as u32);
    }
    // Fold 32 bits into 16; the final cast reinterprets the bit pattern as signed.
    let folded = ((h >> 16) ^ (h & 0xFFFF)) as u16;
    folded as i16
}

/// Convert a window of tokens into a 64-element ternary vector.
///
/// Each token is hashed with `token_to_q15` and quantized with [`quantize_phi_prior`].
/// Windows shorter than 64 tokens leave the remaining slots at `0`; tokens beyond the
/// 64th are ignored.
fn window_to_ternary(tokens: &[&[u8]]) -> [i8; 64] {
    let mut out = [0i8; 64];
    // `zip` stops at the shorter side, which gives both the 64-slot cap and zero padding.
    for (slot, tok) in out.iter_mut().zip(tokens) {
        *slot = quantize_phi_prior(token_to_q15(tok));
    }
    out
}

// ────────────────────────────────────────────────────────────────────────────
// Public API
// ────────────────────────────────────────────────────────────────────────────

/// Stream a plaintext string into a sequence of ternary [`Triplet`]s.
///
/// The algorithm:
/// 1. Tokenise by whitespace into byte slices.
/// 2. Build windows of `window_size` tokens starting at `0, stride, 2 * stride, …` for
///    as long as a full window fits.
/// 3. For every window `i` except the last two: anchor = window `i`,
///    positive = window `i + 1`.
/// 4. Negative = the last window when `i` is in the first half of the window list,
///    otherwise the first window.
/// 5. Each window is mapped to a 64-element ternary vector via [`quantize_phi_prior`].
///
/// `cfg.window_size` is clamped to `1..=64` and a `cfg.stride` of `0` is treated as `1`.
///
/// Returns an empty `Vec` when the input has fewer than `2 * window_size` tokens or
/// yields fewer than 3 windows (anchor, positive and negative must be distinct windows).
///
/// # Arguments
///
/// * `input` — UTF-8 text, tokenised on Unicode whitespace
/// * `cfg` — windowing configuration
///
/// # Examples
///
/// ```
/// use jepa_t_ingest::{ingest_text, IngestConfig};
///
/// let cfg = IngestConfig { window_size: 4, stride: 2 };
/// let triplets = ingest_text("the quick brown fox jumps over the lazy dog", &cfg);
/// assert!(!triplets.is_empty());
/// ```
pub fn ingest_text(input: &str, cfg: &IngestConfig) -> Vec<Triplet> {
    let ws = cfg.window_size.clamp(1, 64);
    let stride = cfg.stride.max(1);

    // Tokenise by whitespace — collect byte representations.
    let raw_tokens: Vec<&[u8]> = input.split_whitespace().map(str::as_bytes).collect();
    let n = raw_tokens.len();

    if n < ws * 2 {
        // Not enough tokens to form even one meaningful triplet.
        return Vec::new();
    }

    // Build all windows. `n >= 2 * ws` holds here, so `n - ws` cannot underflow, and
    // `step_by` never computes a start beyond the range, so a huge stride cannot
    // overflow the way `i * stride` / `start + ws` arithmetic would.
    let windows: Vec<[i8; 64]> = (0..=n - ws)
        .step_by(stride)
        .map(|start| window_to_ternary(&raw_tokens[start..start + ws]))
        .collect();

    let num_windows = windows.len();
    if num_windows < 3 {
        return Vec::new();
    }

    let last = num_windows - 1;
    (0..num_windows - 2)
        .map(|i| {
            // Negative: the window at the opposite end of the corpus from the anchor.
            let neg_idx = if i < num_windows / 2 { last } else { 0 };
            Triplet {
                anchor: windows[i],
                positive: windows[i + 1],
                negative: windows[neg_idx],
            }
        })
        .collect()
}

#[cfg(test)]
mod unit_tests {
    use super::*;

    #[test]
    fn quantize_zero() {
        assert_eq!(quantize_phi_prior(0), 0);
    }

    #[test]
    fn quantize_positive_boundary() {
        assert_eq!(quantize_phi_prior(12532), 0, "+12532 must be 0");
        assert_eq!(quantize_phi_prior(12533), 1, "+12533 must be +1");
    }

    #[test]
    fn quantize_negative_boundary() {
        assert_eq!(quantize_phi_prior(-12532), 0, "-12532 must be 0");
        assert_eq!(quantize_phi_prior(-12533), -1, "-12533 must be -1");
    }

    #[test]
    fn quantize_max_values() {
        assert_eq!(quantize_phi_prior(i16::MAX), 1);
        assert_eq!(quantize_phi_prior(i16::MIN), -1);
    }

    #[test]
    fn ingest_empty_returns_empty() {
        let cfg = IngestConfig::default();
        assert!(ingest_text("", &cfg).is_empty());
    }

    #[test]
    fn ingest_short_returns_empty() {
        let cfg = IngestConfig { window_size: 64, stride: 32 };
        assert!(ingest_text("hello world", &cfg).is_empty());
    }

    #[test]
    fn ingest_produces_valid_ternary() {
        let cfg = IngestConfig { window_size: 4, stride: 2 };
        let corpus = "a b c d e f g h i j k l m n o p";
        let triplets = ingest_text(corpus, &cfg);
        assert!(!triplets.is_empty());
        for t in &triplets {
            for &v in t.anchor.iter().chain(t.positive.iter()).chain(t.negative.iter()) {
                assert!(v == -1 || v == 0 || v == 1, "non-ternary value: {}", v);
            }
        }
    }

    #[test]
    fn ingest_huge_stride_does_not_overflow() {
        // Only the window at offset 0 fits, so no triplet can be formed.
        let cfg = IngestConfig { window_size: 4, stride: usize::MAX };
        assert!(ingest_text("a b c d e f g h", &cfg).is_empty());
    }

    #[test]
    fn triplet_to_bytes_length() {
        let t = Triplet {
            anchor: [1i8; 64],
            positive: [0i8; 64],
            negative: [-1i8; 64],
        };
        assert_eq!(t.to_bytes().len(), 192);
    }
}
Integration tests for `ingest_text` — golden corpus byte-compare. +//! +//! The small corpus "the quick brown fox jumps over the lazy dog and a ternary world" +//! is used as a reproducible fixture. Expected values are pre-computed from the +//! deterministic djb2-hash + φ⁻² quantizer pipeline. +//! +//! Tests verify: +//! - correct triplet count for the given window/stride config +//! - ternary values are all in {-1, 0, +1} +//! - first and last triplet anchor/positive/negative values (golden byte-compare) +//! - serialised byte length == 192 per triplet +//! +//! Apache-2.0 — Author: Dmitrii Vasilev + +use jepa_t_ingest::{ingest_text, IngestConfig, Triplet}; + +/// Small reproducible corpus used as integration fixture. +const CORPUS: &str = + "the quick brown fox jumps over the lazy dog and a ternary world"; + +/// window_size=4, stride=2 → 5 windows, 3 triplets from 13 tokens +const CFG: IngestConfig = IngestConfig { + window_size: 4, + stride: 2, +}; + +// ──────────────────────────────────────────────────────────────────────────── +// Helper +// ──────────────────────────────────────────────────────────────────────────── + +fn golden_triplets() -> Vec { + ingest_text(CORPUS, &CFG) +} + +// ──────────────────────────────────────────────────────────────────────────── +// Count +// ──────────────────────────────────────────────────────────────────────────── + +#[test] +fn golden_corpus_triplet_count() { + // 13 tokens, window=4, stride=2 → 5 windows → 3 triplets + let triplets = golden_triplets(); + assert_eq!( + triplets.len(), + 3, + "expected 3 triplets from 13-token corpus with window=4 stride=2, got {}", + triplets.len() + ); +} + +// ──────────────────────────────────────────────────────────────────────────── +// Ternary constraint +// ──────────────────────────────────────────────────────────────────────────── + +#[test] +fn all_values_are_ternary() { + for (ti, t) in golden_triplets().iter().enumerate() { + for (fi, &v) in t + .anchor + .iter() + 
.chain(t.positive.iter()) + .chain(t.negative.iter()) + .enumerate() + { + assert!( + v == -1 || v == 0 || v == 1, + "triplet[{}] field[{}] = {} — not ternary!", + ti, + fi, + v + ); + } + } +} + +// ──────────────────────────────────────────────────────────────────────────── +// Golden byte-compare: first triplet +// ──────────────────────────────────────────────────────────────────────────── + +#[test] +fn golden_first_triplet_anchor() { + // anchor = tokens["the", "quick", "brown", "fox"] → ternary {-1, 0, -1, 1, 0..} + let triplets = golden_triplets(); + let anchor = triplets[0].anchor; + assert_eq!(anchor[0], -1, "anchor[0] = token 'the'"); + assert_eq!(anchor[1], 0, "anchor[1] = token 'quick'"); + assert_eq!(anchor[2], -1, "anchor[2] = token 'brown'"); + assert_eq!(anchor[3], 1, "anchor[3] = token 'fox'"); + // remaining slots are zero-padded (window_size=4 < 64) + for i in 4..64 { + assert_eq!(anchor[i], 0, "anchor[{}] must be zero-padded", i); + } +} + +#[test] +fn golden_first_triplet_positive() { + // positive = tokens["brown", "fox", "jumps", "over"] → {-1, 1, 0, -1, 0..} + let triplets = golden_triplets(); + let pos = triplets[0].positive; + assert_eq!(pos[0], -1, "positive[0] = token 'brown'"); + assert_eq!(pos[1], 1, "positive[1] = token 'fox'"); + assert_eq!(pos[2], 0, "positive[2] = token 'jumps'"); + assert_eq!(pos[3], -1, "positive[3] = token 'over'"); + for i in 4..64 { + assert_eq!(pos[i], 0, "positive[{}] must be zero-padded", i); + } +} + +#[test] +fn golden_first_triplet_negative() { + // negative = last window = tokens["dog", "and", "a", "ternary"] → {1, 1, -1, -1, 0..} + // (negative uses window index num_windows-1 when anchor is in first half) + let triplets = golden_triplets(); + let neg = triplets[0].negative; + assert_eq!(neg[0], 1, "negative[0] = token 'dog'"); + assert_eq!(neg[1], 1, "negative[1] = token 'and'"); + assert_eq!(neg[2], -1, "negative[2] = token 'a'"); + assert_eq!(neg[3], -1, "negative[3] = token 'ternary'"); + for i in 
4..64 { + assert_eq!(neg[i], 0, "negative[{}] must be zero-padded", i); + } +} + +// ──────────────────────────────────────────────────────────────────────────── +// Serialisation +// ──────────────────────────────────────────────────────────────────────────── + +#[test] +fn triplet_serialises_to_192_bytes() { + for t in golden_triplets() { + assert_eq!(t.to_bytes().len(), 192, "each triplet must serialise to 192 bytes"); + } +} + +#[test] +fn golden_total_bytes() { + // 3 triplets × 192 bytes = 576 bytes + let total: usize = golden_triplets().iter().map(|t| t.to_bytes().len()).sum(); + assert_eq!(total, 576, "3 triplets × 192 bytes = 576 bytes"); +} + +// ──────────────────────────────────────────────────────────────────────────── +// Edge cases +// ──────────────────────────────────────────────────────────────────────────── + +#[test] +fn empty_corpus_returns_empty() { + let cfg = IngestConfig { window_size: 4, stride: 2 }; + assert!(ingest_text("", &cfg).is_empty()); +} + +#[test] +fn single_token_returns_empty() { + let cfg = IngestConfig { window_size: 4, stride: 2 }; + assert!(ingest_text("hello", &cfg).is_empty()); +} + +#[test] +fn large_stride_still_produces_triplets() { + // window=4, stride=1 with our 13-token corpus → 10 windows → 8 triplets + let cfg = IngestConfig { window_size: 4, stride: 1 }; + let triplets = ingest_text(CORPUS, &cfg); + assert_eq!(triplets.len(), 8, "expected 8 triplets with stride=1"); +} diff --git a/crates/jepa_t_ingest/tests/quantize.rs b/crates/jepa_t_ingest/tests/quantize.rs new file mode 100644 index 0000000000..0aacfde704 --- /dev/null +++ b/crates/jepa_t_ingest/tests/quantize.rs @@ -0,0 +1,123 @@ +//! Boundary tests for `quantize_phi_prior` — Wave-9b RTL byte-for-byte parity. +//! +//! Threshold: φ⁻² in Q1.15 = 12533 (0x30F4) +//! +//! These tests assert that the Rust implementation matches the Wave-9b +//! `phi_prior_quantizer.v` Verilog module for all documented boundary cases +//! 
and the full ±0x7FFF (i16::MAX / i16::MIN) range extremes. +//! +//! Apache-2.0 — Author: Dmitrii Vasilev + +use jepa_t_ingest::quantize_phi_prior; + +// ── Boundary: just below threshold ────────────────────────────────────────── + +#[test] +fn boundary_plus_12532_is_zero() { + // +12532 is strictly below threshold → ternary 0 + assert_eq!( + quantize_phi_prior(12532), + 0, + "+12532 must map to 0 (below threshold 12533)" + ); +} + +#[test] +fn boundary_minus_12532_is_zero() { + // -12532 is strictly above -threshold → ternary 0 + assert_eq!( + quantize_phi_prior(-12532), + 0, + "-12532 must map to 0 (above -threshold -12533)" + ); +} + +// ── Boundary: at threshold ─────────────────────────────────────────────────── + +#[test] +fn boundary_plus_12533_is_positive_one() { + // +12533 == threshold → ternary +1 + assert_eq!( + quantize_phi_prior(12533), + 1, + "+12533 (φ⁻² Q1.15) must map to +1" + ); +} + +#[test] +fn boundary_minus_12533_is_negative_one() { + // -12533 == -threshold → ternary -1 + assert_eq!( + quantize_phi_prior(-12533), + -1, + "-12533 (−φ⁻² Q1.15) must map to -1" + ); +} + +// ── Zero ───────────────────────────────────────────────────────────────────── + +#[test] +fn zero_is_zero() { + assert_eq!(quantize_phi_prior(0), 0, "0 must map to ternary 0"); +} + +// ── Extremes: ±0x7FFF (i16::MAX / i16::MIN) ────────────────────────────────── + +#[test] +fn max_i16_is_positive_one() { + // i16::MAX = 32767 = 0x7FFF >> threshold → +1 + assert_eq!( + quantize_phi_prior(i16::MAX), + 1, + "i16::MAX (0x7FFF = 32767) must map to +1" + ); +} + +#[test] +fn min_i16_is_negative_one() { + // i16::MIN = -32768 = -0x8000 < -threshold → -1 + assert_eq!( + quantize_phi_prior(i16::MIN), + -1, + "i16::MIN (-0x8000 = -32768) must map to -1" + ); +} + +// ── Additional parity checks near threshold ────────────────────────────────── + +#[test] +fn one_above_threshold_is_positive_one() { + assert_eq!(quantize_phi_prior(12534), 1); +} + +#[test] +fn 
one_below_negative_threshold_is_negative_one() { + assert_eq!(quantize_phi_prior(-12534), -1); +} + +#[test] +fn mid_range_positive_is_zero() { + // 6266 is well inside the dead zone + assert_eq!(quantize_phi_prior(6266), 0); +} + +#[test] +fn mid_range_negative_is_zero() { + assert_eq!(quantize_phi_prior(-6266), 0); +} + +// ── Output domain: only ternary values ─────────────────────────────────────── + +#[test] +fn output_always_ternary_for_all_i16() { + // Exhaustive check of all 65536 possible i16 inputs. + for raw in i16::MIN..=i16::MAX { + let out = quantize_phi_prior(raw); + assert!( + out == -1 || out == 0 || out == 1, + "quantize_phi_prior({}) = {} — not ternary!", + raw, + out + ); + } +}