diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..6539e25 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,401 @@ +# Ladybug-rs Architecture + +**Unified cognitive substrate: SQL + Cypher + Vector + Hamming + Resonance at alien speed.** + +## Core Principle + +> Familiar surface at alien speed. + +All query types compile to the same underlying operation: fingerprint → bucket → SIMD scan on Arrow buffers. + +--- + +## 1. 64-bit Content Addressable Memory + +### Key Structure + +``` +64-bit key: +┌──────────────────┬──────────────────────────────────────────────┐ +│ 16 bits │ 48 bits │ +│ TYPE │ fingerprint prefix │ +└──────────────────┴──────────────────────────────────────────────┘ +``` + +### Type Namespace (16-bit) + +``` +0x0001-0x00FF Entities (thought, concept, style) +0x0100-0x01FF Edges (CAUSES, SUPPORTS, CONTRADICTS, BECOMES...) +0x0200-0x02FF Layers (7 consciousness layers) +0x0300-0x03FF Styles (12 thinking styles) +0x0400+ Codebook (learned clusters) +``` + +### Query Unification + +| Surface | Query | Underlying Operation | +|---------|-------|---------------------| +| SQL | `SELECT * FROM thoughts WHERE fp = X` | `get(0x0001, fp)` | +| Cypher | `MATCH (n:Thought {fp: X})` | `get(0x0001, fp)` | +| Cypher | `MATCH (a)-[:CAUSES]->(b)` | `scan(0x0100, a.prefix)` | +| Hamming | `resonate(fp, threshold)` | `simd_scan(bucket)` | + +**One index. All query languages. Same bits.** + +--- + +## 2. Hierarchical Scent Index + +For petabyte-scale filtering without tree traversal. + +### The Problem + +``` +7 PB of fingerprints += 5.6 trillion entries at 1250 bytes each +Full SIMD scan = hours +``` + +### The Solution: Scent Shortcuts + +``` +┌─────────────────────────────────────────────────────────────┐ +│ L1 SCENT INDEX │ +│ │ +│ 256 buckets × 5-byte scent = 1.25 KB total │ +│ Fits in L1 cache. Single SIMD pass. ~50 ns. 
│ +│ │ +│ Query "Siamese cat" → 3 buckets match → 98.8% eliminated │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ L2 SCENT INDEX │ +│ │ +│ 256 sub-buckets per L1 bucket × 5 bytes = 1.25 KB each │ +│ Only loaded for matching L1 buckets │ +│ │ +│ Query "Siamese cat" → 2 sub-buckets → 99.997% eliminated │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ LEAF FINGERPRINTS │ +│ │ +│ Full 10K-bit (1250 byte) fingerprints │ +│ SIMD Hamming on actual bits │ +│ Only scan matching leaf buckets │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Scaling Table + +| Depth | Buckets | Scent Index | Coverage per Leaf | +|-------|---------|-------------|-------------------| +| 1 | 256 | 1.25 KB | 27 TB | +| 2 | 65,536 | 320 KB | 107 GB | +| 3 | 16.7M | 80 MB | 420 MB | +| 4 | 4.3B | 20 GB | 1.6 MB | + +Add layers as corpus grows. Same 1.25 KB scan at each level. + +### Why Not Trees? + +``` +TREE SEARCH: + log₂(256) = 8 levels + 8 pointer chases + 8 cache misses + ~800 cycles + +SCENT SCAN: + 1.25 KB flat + L1 cache resident + One SIMD pass + ~50 cycles + +Scent wins 16x. And it's simpler. +``` + +--- + +## 3. Chunk Headers + +Headers are **free metadata**. The fingerprint is the only storage cost. 
+ +```rust +struct ChunkHeader { + count: u32, // entries in this chunk + offset: u64, // byte offset in Arrow file + scent: [u8; 5], // compressed representative + + // Cognition markers (Layer 3-6) + plasticity: f32, // learning rate + decision: u8, // last decision made + arousal: f32, // activation level + last_access: u64, // temporal marker +} +``` + +### Free Operations + +```rust +// O(1) append - just update header +fn append(&mut self, fp: &[u8; 1250]) -> u64 { + let chunk = fp[0]; + let offset = self.data.len(); + self.data.extend_from_slice(fp); + self.headers[chunk].count += 1; // free + offset +} + +// O(1) defragmentation tracking +// Fingerprints reorder, headers update, same bytes +``` + +--- + +## 4. Cognition Layers on Scent Nodes + +Ada's consciousness operates on scent hierarchy, not individual fingerprints. + +### Layer Mapping + +``` +Leaf fingerprints (10K bits): + └── Layer 0: SUBSTRATE - raw sensation + └── Layer 1: FELT_CORE - immediate feeling + └── Layer 2: BODY - somatic response + +Scent nodes (5 bytes): + └── Layer 3: QUALIA - qualitative experience + └── Layer 4: VOLITION - decision/intention + └── Layer 5: GESTALT - pattern recognition + └── Layer 6: META - self-reflection +``` + +### Efficiency + +``` +Traditional: Update 1M fingerprints for learning +Scent: Update 1 L2 node (affects 107 GB) + +One scent update = millions of fingerprints affected. +Cognition at the right level of abstraction. +``` + +### Example: Interest Update + +```rust +fn update_interest(&mut self, category_scent: &[u8; 5], plasticity: f32) { + let chunk = self.find_chunk_by_scent(category_scent); + self.headers[chunk].plasticity = plasticity; + // Done. 27 TB of content now weighted differently. + // No leaf updates. O(1). 
+} +``` + +### Example: Decision Propagation + +```rust +fn decide(&mut self, l1: u8, l2: u8, decision: Decision) { + // Mark decision at L2 (affects 107 GB) + self.l2_headers[l1][l2].decision = decision.code(); + self.l2_headers[l1][l2].last_access = now(); + + // Gestalt sees pattern across L2 nodes + if self.detect_pattern(&self.l2_headers[l1]) { + self.headers[l1].arousal += 0.1; // L1 activation + } +} +``` + +--- + +## 5. Storage Architecture + +### Lance Integration + +``` +┌─────────────────────────────────────────────────────────────┐ +│ LADYBUG LAYER │ +│ │ +│ 64-bit CAM index + scent hierarchy + cognition markers │ +│ Immutable Rust semantics │ +│ SIMD operations on Arrow buffers │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ LANCE/ARROW │ +│ │ +│ Columnar storage, free append │ +│ Transparent compression (we don't care how) │ +│ Zero-copy reads │ +│ We use it, don't fight it │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Schema + +``` +thoughts.lance: +├── id: Utf8 +├── content: Utf8 +├── fingerprint: FixedSizeBinary(1250) ← 10K bits +├── freq: Float32 ← NARS truth value +├── conf: Float32 ← NARS truth value +├── style: UInt16 ← thinking style type ID +└── layer: UInt8 ← consciousness layer + +edges.lance: +├── source_fp: FixedSizeBinary(1250) +├── target_fp: FixedSizeBinary(1250) +├── relation: UInt16 ← edge type ID +├── freq: Float32 +└── conf: Float32 + +scent_index.lbug: +├── headers: [ChunkHeader; 256] +└── l2_headers: [[ChunkHeader; 256]; 256] (optional, for >100TB) +``` + +--- + +## 6. Immutability + +Rust enforces at compile time. + +```rust +pub struct LadybugIndex { + buckets: Box<[Box<[Entry]>]>, // No Vec, no mutation + scents: Box<[[u8; 5]; 256]>, // Frozen after build +} + +impl LadybugIndex { + // Only &self methods exist. No &mut self. + pub fn get(&self, ...) -> Option { ... 
} + + // Append = build new index, atomic swap + pub fn append(&self, additions: IndexBuilder) -> Self { ... } +} +``` + +### COW Semantics + +``` +Write: Build new index from old + additions +Swap: Atomic pointer update +Reads: Continue on old until swap completes +Old: Dropped when last reader finishes +``` + +--- + +## 7. Query Flow + +### Full Example: "Find all Siamese cat videos" + +``` +Input: query fingerprint (10K bits from "Siamese cat" embedding) + +Step 1: Extract query scent (5 bytes) + → ~10 ns + +Step 2: L1 scan (1.25 KB, 256 scents) + → 3 buckets match: 0x4A, 0x7F, 0xB2 + → ~50 ns + +Step 3: L2 scan (3 × 1.25 KB = 3.75 KB) + → 5 sub-buckets match total + → ~150 ns + +Step 4: SIMD Hamming on 5 leaf buckets + → ~500K fingerprints (not 5.6 trillion) + → ~10 ms + +Total: ~10 ms for 7 PB corpus +Without scent: ~hours +``` + +--- + +## 8. Operations Summary + +| Operation | Complexity | Notes | +|-----------|------------|-------| +| Lookup by fingerprint | O(1) | Bucket + SIMD scan | +| Append | O(1) | Write fp + update header | +| Scent scan (per level) | O(1) | 1.25 KB, L1 cache | +| Resonance search | O(matching buckets) | Scent eliminates 95-99% | +| Cognition update | O(1) | Update scent node, affects TB | +| Defragmentation | O(n) | Reorder fps, update headers | +| Index rebuild | O(n) | COW, readers unaffected | + +--- + +## 9. Design Principles + +1. **Fingerprint = Address** + Content addressable. No separate index structure. + +2. **Headers are Free** + Metadata costs nothing. The fingerprint is the footprint. + +3. **Scent ≠ Compression** + Scent is organizational. All 10K bits preserved. + +4. **Cognition at Scent Level** + Layers 3-6 operate on hierarchy, not leaves. + +5. **Familiar Surface** + SQL, Cypher, Hamming all work. Same underlying op. + +6. **Alien Speed** + SIMD on Arrow. No tree traversal. L1-resident scent index. + +7. **Immutable** + Rust enforces. COW for updates. No runtime checks. + +8. 
**Lance Underneath** + Don't reinvent storage. Use what works. + +--- + +## 10. Future Extensions + +### BTR Compression Mode + +For books/scientific reasoning where structure > resonance: + +``` +32-bit key: chunk(8) + suffix(24) +Codebook built in second pass +Defragmentation by fingerprint prefix +``` + +### Distributed Scent + +``` +Node 1: Buckets 0x00-0x3F (25%) +Node 2: Buckets 0x40-0x7F (25%) +Node 3: Buckets 0x80-0xBF (25%) +Node 4: Buckets 0xC0-0xFF (25%) + +Query: Broadcast scent match → route to matching nodes only +``` + +### Temporal Scent + +``` +scent + timestamp → "what did Siamese cats mean in 2024?" +Versioned scent hierarchy for memory archaeology +``` + +--- + +## License + +Apache-2.0 + +## Repository + +https://github.com/AdaWorldAPI/ladybug-rs diff --git a/docs/SCENT_INDEX.md b/docs/SCENT_INDEX.md new file mode 100644 index 0000000..11b33f8 --- /dev/null +++ b/docs/SCENT_INDEX.md @@ -0,0 +1,449 @@ +# Ladybug Scent Index + +## Overview + +Hierarchical content-addressable memory using **scent nodes** for petabyte-scale resonance search. + +``` +Query: "Siamese looking cat videos" in 7 PB +Time: ~100 nanoseconds to eliminate 99.997% of corpus +``` + +## Core Insight + +Similar content → similar fingerprint → same bucket → same scent. + +**Scent = compressed representative of a bucket.** + +Instead of tree traversal (pointer chasing, cache misses), we scan 1.25 KB of scents. Fits in L1 cache. One SIMD pass. Done. 
+ +--- + +## Architecture + +### Single Layer (up to ~7 TB) + +``` +┌─────────────────────────────────────────────────────────┐ +│ SCENT INDEX (1.25 KB) │ +│ │ +│ [scent_00][scent_01][scent_02]...[scent_FF] │ +│ 5 bytes each × 256 = 1280 bytes │ +│ Entire index fits in L1 cache │ +│ │ +│ SIMD: compare query scent against all 256 │ +│ Result: bitmask of matching chunks │ +│ Time: ~50 nanoseconds │ +└─────────────────────────────────────────────────────────┘ + ↓ + matching chunks only + ↓ +┌─────────────────────────────────────────────────────────┐ +│ CHUNK BUCKETS (256 total) │ +│ │ +│ bucket[0x00]: [fp₀][fp₁]...[fpₙ] │ +│ bucket[0x01]: [fp₀][fp₁]...[fpₘ] │ +│ ... │ +│ bucket[0xFF]: [fp₀][fp₁]...[fpₖ] │ +│ │ +│ Full 10K-bit fingerprints (1250 bytes each) │ +│ SIMD Hamming only on matched buckets │ +└─────────────────────────────────────────────────────────┘ +``` + +### Hierarchical (petabyte scale) + +``` +┌─────────────────────────────────────────────────────────┐ +│ L1 SCENTS (1.25 KB) │ +│ │ +│ 256 scents, each covers ~27 TB │ +│ "What general category?" │ +│ │ +│ Time: ~50 ns │ +└─────────────────────────────────────────────────────────┘ + ↓ + matching L1 buckets (e.g., 0x4A) + ↓ +┌─────────────────────────────────────────────────────────┐ +│ L2 SCENTS (1.25 KB per L1 bucket) │ +│ │ +│ 256 scents within 0x4A, each covers ~107 GB │ +│ "What specific subcategory?" 
│ +│ │ +│ Time: ~50 ns │ +└─────────────────────────────────────────────────────────┘ + ↓ + matching L2 buckets (e.g., 0x4A:0x12) + ↓ +┌─────────────────────────────────────────────────────────┐ +│ LEAF FINGERPRINTS │ +│ │ +│ Full SIMD Hamming on ~107 GB instead of 7 PB │ +│ 99.997% of corpus never touched │ +└─────────────────────────────────────────────────────────┘ +``` + +### Scale Table + +| Depth | Buckets | Scent Index | Coverage per Leaf | +|-------|-----------|-------------|-------------------| +| 1 | 256 | 1.25 KB | 27 TB | +| 2 | 65,536 | 320 KB | 107 GB | +| 3 | 16.7M | 80 MB | 420 MB | +| 4 | 4.3B | 20 GB | 1.6 MB | + +Add layers as corpus grows. Same pattern at each level. + +--- + +## Data Structures + +### Chunk Header + +```rust +#[repr(C)] +struct ChunkHeader { + // Addressing + chunk_id: u8, + offset: u64, // Start position in data file + count: u32, // Number of fingerprints in chunk + + // Scent (free metadata) + scent: [u8; 5], // Compressed representative (40 bits) + + // Cognitive markers (for Ada) + plasticity: f32, // Learning rate for this region + decision: u8, // Cached decision/classification + last_access: u64, // For LRU / attention tracking +} +// Size: 32 bytes per header +// 256 headers = 8 KB total (scents embedded within) +``` + +### Scent Extraction + +```rust +/// Extract 5-byte scent from 1250-byte fingerprint +fn extract_scent(fp: &[u8; 1250]) -> [u8; 5] { + // Option A: First 5 bytes (locality-preserving) + [fp[0], fp[1], fp[2], fp[3], fp[4]] + + // Option B: XOR-fold (captures global structure) + let mut scent = [0u8; 5]; + for chunk in fp.chunks(5) { + for (i, &b) in chunk.iter().enumerate() { + scent[i % 5] ^= b; + } + } + scent + + // Option C: Learned projection (trained on corpus) + // project_matrix.dot(fp)[0..5] +} +``` + +### Hierarchical Index + +```rust +struct ScentIndex { + depth: usize, + l1: [ChunkHeader; 256], + l2: Option>, // 65536 if needed + l3: Option<...>, // Lazily allocated +} + +impl ScentIndex { + 
fn find(&self, query_fp: &[u8; 1250], threshold: f32) -> Vec { + let query_scent = extract_scent(query_fp); + + // L1: Always scan (1.25 KB) + let l1_matches = self.scan_scents(&self.l1, &query_scent, threshold); + + if self.l2.is_none() { + // Single layer: scan matching L1 buckets directly + return self.scan_buckets(&l1_matches, query_fp, threshold); + } + + // L2: Scan within matching L1 buckets + let l2_matches: Vec<(u8, u8)> = l1_matches.iter() + .flat_map(|&l1| { + self.scan_scents(&self.l2[l1], &query_scent, threshold) + .map(move |l2| (l1, l2)) + }) + .collect(); + + // Scan leaf buckets + self.scan_leaf_buckets(&l2_matches, query_fp, threshold) + } + + #[inline] + fn scan_scents(&self, scents: &[ChunkHeader; 256], query: &[u8; 5], threshold: f32) -> impl Iterator { + // SIMD: Compare query against all 256 scents + // Returns chunk IDs where scent_distance < threshold + } +} +``` + +--- + +## Integration with LanceDB + +``` +┌─────────────────────────────────────────────────────────┐ +│ LADYBUG QUERY LAYER │ +│ │ +│ SQL / Cypher / Resonance / Hamming │ +│ Uses scent index for fast filtering │ +└─────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────┐ +│ SCENT INDEX (in memory) │ +│ │ +│ Hierarchical scent lookup │ +│ Returns: list of (chunk_id, offset) to scan │ +└─────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────┐ +│ LANCEDB / ARROW │ +│ │ +│ Columnar storage │ +│ Transparent compression │ +│ SIMD scan on fingerprint column │ +│ Free append │ +└─────────────────────────────────────────────────────────┘ +``` + +**LanceDB handles:** +- Storage, compression, append +- Arrow buffers for SIMD +- We don't reinvent this + +**Scent index adds:** +- Petabyte-scale filtering before SIMD scan +- Cognitive markers for Ada +- O(1) bucket addressing + +--- + +## Ada Cognitive Integration + +### Consciousness Layers → Scent Depth + 
+``` +┌─────────────────────────────────────────────────────────┐ +│ Layer 6 (META) │ L1 scent decisions │ +│ Layer 5 (GESTALT) │ L1-L2 reorganization │ +│ Layer 4 (VOLITION) │ L2 scent decisions │ +│ Layer 3 (QUALIA) │ L2-L3 plasticity │ +├─────────────────────────────────────────────────────────┤ +│ Layer 2 (BODY) │ Leaf fingerprints │ +│ Layer 1 (FELT_CORE) │ Leaf fingerprints │ +│ Layer 0 (SUBSTRATE) │ Leaf fingerprints │ +└─────────────────────────────────────────────────────────┘ +``` + +### Decision Propagation + +```rust +impl ScentIndex { + /// Update decision at scent level - affects millions of fingerprints + fn set_decision(&mut self, l1: u8, l2: Option, decision: u8) { + match l2 { + None => { + // L1 decision: affects ~27 TB + self.l1[l1 as usize].decision = decision; + } + Some(l2) => { + // L2 decision: affects ~107 GB + self.l2[l1 as usize][l2 as usize].decision = decision; + } + } + // O(1) - no leaf updates needed + } + + /// Update plasticity (learning rate) for a region + fn set_plasticity(&mut self, l1: u8, l2: Option, plasticity: f32) { + // Same pattern - O(1) update, affects entire subtree + } + + /// Query with cognitive filtering + fn cognitive_search(&self, query_fp: &[u8; 1250], min_plasticity: f32) -> Vec { + let query_scent = extract_scent(query_fp); + + // Only search buckets where learning is active + let active_l1: Vec = self.l1.iter() + .enumerate() + .filter(|(_, h)| h.plasticity >= min_plasticity) + .filter(|(_, h)| scent_matches(&h.scent, &query_scent)) + .map(|(i, _)| i as u8) + .collect(); + + self.scan_buckets(&active_l1, query_fp, threshold) + } +} +``` + +### Thinking at Scale + +``` +Without scent nodes: + "Update interest in cat videos" + → Modify 5.6 trillion leaf entries + → Hours of processing + +With scent nodes: + "Update interest in cat videos" + → Find L1 scent for "cat videos" (0x4A) + → self.l1[0x4A].plasticity = 0.9 + → Done. O(1). 27 TB affected instantly. 
+``` + +Ada doesn't think about individual fingerprints. Ada thinks about **regions of concept-space** represented by scent nodes. + +--- + +## Append Behavior + +### Single Fingerprint Append + +```rust +fn append(&mut self, fp: &[u8; 1250]) -> u64 { + let chunk = fp[0]; // First byte determines L1 bucket + let offset = self.data.append(fp); // Lance handles storage + + // Update header (free) + self.l1[chunk as usize].count += 1; + + // Optionally update scent (rolling average or periodic rebuild) + self.maybe_update_scent(chunk, fp); + + offset +} +``` + +### Scent Maintenance + +``` +Option A: Fixed scents (assigned at bucket creation) + - Simplest + - May drift as content evolves + +Option B: Rolling update + - scent = ewma(scent, new_fp_scent, α) + - Adapts to content changes + - Cheap: just XOR and shift + +Option C: Periodic rebuild + - Every N appends, recompute scent from samples + - Most accurate + - Can run in background +``` + +--- + +## Performance + +### Search: 7 PB Corpus + +| Step | Data Touched | Time | +|------|-------------|------| +| L1 scent scan | 1.25 KB | ~50 ns | +| L2 scent scan | 1.25 KB × ~3 matches | ~150 ns | +| Leaf SIMD scan | ~300 GB (0.003% of corpus) | ~seconds | + +**Total: 99.997% eliminated in ~200 nanoseconds.** + +### Comparison: Tree vs Scent + +| Approach | Operations | Cache Behavior | Time | +|----------|-----------|----------------|------| +| B-tree (8 levels) | 8 pointer chases | 8 potential misses | ~800 ns | +| Scent (2 levels) | 2 flat scans | L1 cache hits | ~100 ns | + +**Scent wins by 8x while being simpler.** + +### Memory Footprint + +| Component | Size | Location | +|-----------|------|----------| +| L1 scents | 1.25 KB | L1 cache | +| L1 headers | 8 KB | L2 cache | +| L2 scents (if needed) | 320 KB | L3 cache | +| Fingerprints | N × 1250 bytes | Disk/Lance | + +**Index overhead: <1 MB for petabyte-scale corpus.** + +--- + +## File Format + +### Header File (`.scent`) + +``` +Magic: "SCNT" +Version: u32 
+Depth: u8 +L1 Headers: [ChunkHeader; 256] +L2 Headers: [ChunkHeader; 65536] (if depth >= 2) +... +``` + +### Integration with Ladybug Index + +```rust +struct LadybugStore { + // CAM index: type(16) + prefix(48) → offset + cam: LadybugIndex, + + // Scent index: hierarchical filtering + scent: ScentIndex, + + // Storage: Lance/Arrow + lance: Dataset, +} + +impl LadybugStore { + fn resonance_search(&self, query_fp: &[u8; 1250], threshold: f32) -> Vec { + // Step 1: Scent filtering (nanoseconds) + let candidate_chunks = self.scent.find(query_fp, threshold); + + // Step 2: SIMD Hamming on candidates only + let mut results = Vec::new(); + for chunk in candidate_chunks { + let batch = self.lance.scan_chunk(chunk); + results.extend(simd_hamming_filter(batch, query_fp, threshold)); + } + + results + } +} +``` + +--- + +## Summary + +``` +Scent Index = hierarchical locality hints + +- 1.25 KB per level +- O(1) bucket addressing +- SIMD-friendly flat scan +- No tree traversal +- No pointer chasing +- Fits in L1 cache + +Cognitive Integration: +- Decisions at scent level = affect millions of entries +- Plasticity markers = learning regions of concept-space +- Ada thinks in scents, not fingerprints + +Scale: +- Single layer: ~7 TB +- Two layers: ~1.8 PB +- Three layers: ~460 PB +- Add layers as needed, same pattern +``` + +**Fingerprint IS the content. Scent IS the address. Headers ARE free.** diff --git a/src/core/scent.rs b/src/core/scent.rs new file mode 100644 index 0000000..876ccea --- /dev/null +++ b/src/core/scent.rs @@ -0,0 +1,645 @@ +//! Scent Index - Hierarchical Content-Addressable Filtering +//! +//! Petabyte-scale resonance search via scent nodes. +//! +//! Query: "Siamese cat videos" in 7 PB +//! Time: ~100 ns to eliminate 99.997% of corpus +//! +//! See docs/SCENT_INDEX.md for full architecture. 
+ +use std::path::Path; +use std::fs::File; +use std::io::{BufReader, BufWriter, Read, Write}; + +/// Fingerprint size: 10K bits = 1250 bytes +pub const FP_BYTES: usize = 1250; + +/// Scent size: 5 bytes = 40 bits +pub const SCENT_BYTES: usize = 5; + +/// Buckets per level +pub const BUCKETS: usize = 256; + +/// Chunk header with embedded scent and cognitive markers +#[derive(Clone, Copy, Debug)] +#[repr(C)] +pub struct ChunkHeader { + /// Chunk ID (0-255) + pub chunk_id: u8, + /// Start offset in data file + pub offset: u64, + /// Number of fingerprints in chunk + pub count: u32, + /// Compressed representative (scent) + pub scent: [u8; SCENT_BYTES], + /// Learning rate for this region (Ada cognitive) + pub plasticity: f32, + /// Cached decision/classification (Ada cognitive) + pub decision: u8, + /// Last access timestamp (attention tracking) + pub last_access: u64, +} + +impl ChunkHeader { + pub fn new(chunk_id: u8) -> Self { + Self { + chunk_id, + offset: 0, + count: 0, + scent: [0; SCENT_BYTES], + plasticity: 1.0, + decision: 0, + last_access: 0, + } + } +} + +impl Default for ChunkHeader { + fn default() -> Self { + Self::new(0) + } +} + +/// Extract 5-byte scent from full fingerprint +#[inline] +pub fn extract_scent(fp: &[u8]) -> [u8; SCENT_BYTES] { + if fp.len() < SCENT_BYTES { + let mut scent = [0u8; SCENT_BYTES]; + scent[..fp.len()].copy_from_slice(fp); + return scent; + } + + // XOR-fold: captures global structure in 5 bytes + let mut scent = [0u8; SCENT_BYTES]; + for (i, &b) in fp.iter().enumerate() { + scent[i % SCENT_BYTES] ^= b; + } + scent +} + +/// Compute scent distance (Hamming on 40 bits) +#[inline] +pub fn scent_distance(a: &[u8; SCENT_BYTES], b: &[u8; SCENT_BYTES]) -> u32 { + let mut dist = 0u32; + for i in 0..SCENT_BYTES { + dist += (a[i] ^ b[i]).count_ones(); + } + dist +} + +/// Check if scents match within threshold +#[inline] +pub fn scent_matches(a: &[u8; SCENT_BYTES], b: &[u8; SCENT_BYTES], threshold: u32) -> bool { + scent_distance(a, 
b) <= threshold +} + +/// Single-level scent index (up to ~7 TB) +pub struct ScentIndexL1 { + pub headers: Box<[ChunkHeader; BUCKETS]>, +} + +impl ScentIndexL1 { + /// Create empty L1 index + pub fn new() -> Self { + let headers: [ChunkHeader; BUCKETS] = std::array::from_fn(|i| ChunkHeader::new(i as u8)); + Self { + headers: Box::new(headers), + } + } + + /// Extract scents-only view (1.25 KB, L1 cache friendly) + pub fn scents(&self) -> [[u8; SCENT_BYTES]; BUCKETS] { + std::array::from_fn(|i| self.headers[i].scent) + } + + /// Find matching chunks via scent scan + pub fn find_chunks(&self, query_scent: &[u8; SCENT_BYTES], threshold: u32) -> Vec { + self.headers + .iter() + .filter(|h| h.count > 0) + .filter(|h| scent_matches(&h.scent, query_scent, threshold)) + .map(|h| h.chunk_id) + .collect() + } + + /// Find chunks filtered by plasticity (cognitive search) + pub fn find_chunks_plastic( + &self, + query_scent: &[u8; SCENT_BYTES], + threshold: u32, + min_plasticity: f32, + ) -> Vec { + self.headers + .iter() + .filter(|h| h.count > 0) + .filter(|h| h.plasticity >= min_plasticity) + .filter(|h| scent_matches(&h.scent, query_scent, threshold)) + .map(|h| h.chunk_id) + .collect() + } + + /// Assign fingerprint to chunk (returns chunk ID) + #[inline] + pub fn assign(&self, fp: &[u8]) -> u8 { + // First byte of fingerprint = chunk ID (locality preserving) + fp[0] + } + + /// Update chunk on append + pub fn on_append(&mut self, chunk: u8, fp: &[u8], offset: u64) { + let h = &mut self.headers[chunk as usize]; + + if h.count == 0 { + h.offset = offset; + h.scent = extract_scent(fp); + } else { + // Rolling scent update (EWMA) + let new_scent = extract_scent(fp); + for i in 0..SCENT_BYTES { + h.scent[i] = ((h.scent[i] as u16 * 15 + new_scent[i] as u16) / 16) as u8; + } + } + + h.count += 1; + h.last_access = timestamp(); + } + + /// Set decision for a chunk (O(1), affects millions of fps) + pub fn set_decision(&mut self, chunk: u8, decision: u8) { + self.headers[chunk as 
usize].decision = decision; + } + + /// Set plasticity for a chunk (O(1), affects millions of fps) + pub fn set_plasticity(&mut self, chunk: u8, plasticity: f32) { + self.headers[chunk as usize].plasticity = plasticity; + } + + /// Get chunk statistics + pub fn stats(&self) -> ScentStats { + let active = self.headers.iter().filter(|h| h.count > 0).count(); + let total_fps: u64 = self.headers.iter().map(|h| h.count as u64).sum(); + let avg_plasticity: f32 = self.headers.iter() + .filter(|h| h.count > 0) + .map(|h| h.plasticity) + .sum::() / active.max(1) as f32; + + ScentStats { + depth: 1, + active_buckets: active, + total_fingerprints: total_fps, + avg_plasticity, + } + } +} + +impl Default for ScentIndexL1 { + fn default() -> Self { + Self::new() + } +} + +/// Two-level scent index (up to ~1.8 PB) +pub struct ScentIndexL2 { + pub l1: ScentIndexL1, + pub l2: Box<[ScentIndexL1; BUCKETS]>, +} + +impl ScentIndexL2 { + pub fn new() -> Self { + Self { + l1: ScentIndexL1::new(), + l2: Box::new(std::array::from_fn(|_| ScentIndexL1::new())), + } + } + + /// Find matching (l1, l2) pairs + pub fn find_chunks( + &self, + query_scent: &[u8; SCENT_BYTES], + threshold: u32, + ) -> Vec<(u8, u8)> { + let l1_matches = self.l1.find_chunks(query_scent, threshold); + + l1_matches + .iter() + .flat_map(|&l1| { + self.l2[l1 as usize] + .find_chunks(query_scent, threshold) + .into_iter() + .map(move |l2| (l1, l2)) + }) + .collect() + } + + /// Assign fingerprint to (l1, l2) bucket + pub fn assign(&self, fp: &[u8]) -> (u8, u8) { + let l1 = fp[0]; + let l2 = fp[1]; + (l1, l2) + } + + /// Update on append + pub fn on_append(&mut self, fp: &[u8], offset: u64) { + let (l1, l2) = self.assign(fp); + + // Update L1 + self.l1.on_append(l1, fp, offset); + + // Update L2 + self.l2[l1 as usize].on_append(l2, fp, offset); + } + + /// Set decision at L1 level (affects ~27 TB) + pub fn set_decision_l1(&mut self, l1: u8, decision: u8) { + self.l1.set_decision(l1, decision); + } + + /// Set decision at 
L2 level (affects ~107 GB) + pub fn set_decision_l2(&mut self, l1: u8, l2: u8, decision: u8) { + self.l2[l1 as usize].set_decision(l2, decision); + } + + /// Set plasticity at L1 level + pub fn set_plasticity_l1(&mut self, l1: u8, plasticity: f32) { + self.l1.set_plasticity(l1, plasticity); + } + + /// Set plasticity at L2 level + pub fn set_plasticity_l2(&mut self, l1: u8, l2: u8, plasticity: f32) { + self.l2[l1 as usize].set_plasticity(l2, plasticity); + } +} + +impl Default for ScentIndexL2 { + fn default() -> Self { + Self::new() + } +} + +/// Unified scent index (auto-scales by depth) +pub enum ScentIndex { + L1(ScentIndexL1), + L2(ScentIndexL2), + // L3, L4 can be added as needed +} + +impl ScentIndex { + /// Create single-level index + pub fn new() -> Self { + ScentIndex::L1(ScentIndexL1::new()) + } + + /// Create two-level index + pub fn new_l2() -> Self { + ScentIndex::L2(ScentIndexL2::new()) + } + + /// Depth of index + pub fn depth(&self) -> usize { + match self { + ScentIndex::L1(_) => 1, + ScentIndex::L2(_) => 2, + } + } + + /// Find matching bucket addresses + pub fn find(&self, query_fp: &[u8], threshold: u32) -> Vec { + let query_scent = extract_scent(query_fp); + + match self { + ScentIndex::L1(idx) => { + idx.find_chunks(&query_scent, threshold) + .into_iter() + .map(|l1| BucketAddr::L1(l1)) + .collect() + } + ScentIndex::L2(idx) => { + idx.find_chunks(&query_scent, threshold) + .into_iter() + .map(|(l1, l2)| BucketAddr::L2(l1, l2)) + .collect() + } + } + } + + /// Update on append + pub fn on_append(&mut self, fp: &[u8], offset: u64) { + match self { + ScentIndex::L1(idx) => { + let chunk = idx.assign(fp); + idx.on_append(chunk, fp, offset); + } + ScentIndex::L2(idx) => { + idx.on_append(fp, offset); + } + } + } + + /// Get statistics + pub fn stats(&self) -> ScentStats { + match self { + ScentIndex::L1(idx) => idx.stats(), + ScentIndex::L2(idx) => { + let l1_stats = idx.l1.stats(); + let l2_total: u64 = idx.l2.iter().map(|l| 
// NOTE(review): this chunk opens mid-way through `stats()` — the method header
// lives in the previous chunk, so its tail is preserved verbatim below.
l.stats().total_fingerprints).sum();
                ScentStats {
                    depth: 2,
                    active_buckets: l1_stats.active_buckets * BUCKETS, // Approximate
                    total_fingerprints: l2_total,
                    avg_plasticity: l1_stats.avg_plasticity,
                }
            }
        }
    }

    // ========== Persistence ==========

    /// Serialize the index to `path`.
    ///
    /// On-disk layout: 4-byte magic `"SCNT"`, little-endian `u32` version (1),
    /// one `u8` depth, then the flattened header tables — the L1 table first,
    /// followed by each L2 table in bucket order for depth-2 indexes.
    pub fn save(&self, path: &Path) -> std::io::Result<()> {
        let file = File::create(path)?;
        let mut w = BufWriter::new(file);

        // Magic + version + depth
        w.write_all(b"SCNT")?;
        w.write_all(&1u32.to_le_bytes())?;
        w.write_all(&(self.depth() as u8).to_le_bytes())?;

        match self {
            ScentIndex::L1(idx) => {
                self.write_headers(&mut w, &idx.headers)?;
            }
            ScentIndex::L2(idx) => {
                self.write_headers(&mut w, &idx.l1.headers)?;
                for l2 in idx.l2.iter() {
                    self.write_headers(&mut w, &l2.headers)?;
                }
            }
        }

        // BufWriter's Drop swallows flush errors — flush explicitly.
        w.flush()
    }

    /// Deserialize an index previously written by [`ScentIndex::save`].
    ///
    /// # Errors
    /// Returns `InvalidData` for a bad magic number, an unsupported format
    /// version, or an unsupported depth; propagates underlying I/O errors.
    pub fn load(path: &Path) -> std::io::Result<Self> {
        let file = File::open(path)?;
        let mut r = BufReader::new(file);

        // Magic
        let mut magic = [0u8; 4];
        r.read_exact(&mut magic)?;
        if &magic != b"SCNT" {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "Invalid scent index magic",
            ));
        }

        // Version — reject anything we don't know how to parse instead of
        // silently decoding a future layout as v1.
        let mut version = [0u8; 4];
        r.read_exact(&mut version)?;
        if u32::from_le_bytes(version) != 1 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "Unsupported scent index version",
            ));
        }

        // Depth
        let mut depth = [0u8; 1];
        r.read_exact(&mut depth)?;

        match depth[0] {
            1 => {
                let mut idx = ScentIndexL1::new();
                Self::read_headers(&mut r, &mut idx.headers)?;
                Ok(ScentIndex::L1(idx))
            }
            2 => {
                let mut idx = ScentIndexL2::new();
                Self::read_headers(&mut r, &mut idx.l1.headers)?;
                for l2 in idx.l2.iter_mut() {
                    Self::read_headers(&mut r, &mut l2.headers)?;
                }
                Ok(ScentIndex::L2(idx))
            }
            _ => Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "Unsupported scent index depth",
            )),
        }
    }

    /// Write one `BUCKETS`-entry header table.
    /// Field order and widths must stay in lockstep with `read_headers`.
    fn write_headers<W: Write>(
        &self,
        w: &mut W,
        headers: &[ChunkHeader; BUCKETS],
    ) -> std::io::Result<()> {
        for h in headers.iter() {
            w.write_all(&[h.chunk_id])?;
            w.write_all(&h.offset.to_le_bytes())?;
            w.write_all(&h.count.to_le_bytes())?;
            w.write_all(&h.scent)?;
            w.write_all(&h.plasticity.to_le_bytes())?;
            w.write_all(&[h.decision])?;
            w.write_all(&h.last_access.to_le_bytes())?;
        }
        Ok(())
    }

    /// Read one `BUCKETS`-entry header table written by `write_headers`,
    /// overwriting `headers` in place.
    fn read_headers<R: Read>(
        r: &mut R,
        headers: &mut [ChunkHeader; BUCKETS],
    ) -> std::io::Result<()> {
        for h in headers.iter_mut() {
            let mut buf1 = [0u8; 1];
            let mut buf4 = [0u8; 4];
            let mut buf8 = [0u8; 8];

            r.read_exact(&mut buf1)?;
            h.chunk_id = buf1[0];

            r.read_exact(&mut buf8)?;
            h.offset = u64::from_le_bytes(buf8);

            r.read_exact(&mut buf4)?;
            h.count = u32::from_le_bytes(buf4);

            r.read_exact(&mut h.scent)?;

            r.read_exact(&mut buf4)?;
            h.plasticity = f32::from_le_bytes(buf4);

            r.read_exact(&mut buf1)?;
            h.decision = buf1[0];

            r.read_exact(&mut buf8)?;
            h.last_access = u64::from_le_bytes(buf8);
        }
        Ok(())
    }
}

impl Default for ScentIndex {
    fn default() -> Self {
        Self::new()
    }
}

/// Bucket address (supports any depth)
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BucketAddr {
    L1(u8),
    L2(u8, u8),
    L3(u8, u8, u8),
}

impl BucketAddr {
    /// Flatten to u32 for hashing/comparison.
    /// Coarser levels occupy the higher bits, so flattened values sort by
    /// bucket path.
    pub fn flatten(&self) -> u32 {
        match self {
            BucketAddr::L1(a) => *a as u32,
            BucketAddr::L2(a, b) => ((*a as u32) << 8) | (*b as u32),
            BucketAddr::L3(a, b, c) => ((*a as u32) << 16) | ((*b as u32) << 8) | (*c as u32),
        }
    }
}

/// Statistics
#[derive(Debug)]
pub struct ScentStats {
    pub depth: usize,
    pub active_buckets: usize,
    pub total_fingerprints: u64,
    pub avg_plasticity: f32,
}

/// Current timestamp (milliseconds since the Unix epoch; 0 if the system
/// clock is set before the epoch).
fn timestamp() -> u64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_millis() as u64)
        .unwrap_or(0)
}

// ========== SIMD Optimized Scent Scan ==========

#[cfg(target_arch = "x86_64")]
mod simd {
    use super::*;

    /// SIMD-optimized scent scan (AVX2)
    /// Compares query against 256 scents, returns matching chunk IDs
    #[cfg(target_feature = "avx2")]
    pub fn find_chunks_simd(
        scents: &[[u8; SCENT_BYTES]; BUCKETS],
        query: &[u8; SCENT_BYTES],
        threshold: u32,
    ) -> Vec<u8> {
        // For now, fall back to scalar
        // TODO: Implement AVX2 version
        scents
            .iter()
            .enumerate()
            .filter(|(_, s)| scent_distance(s, query) <= threshold)
            .map(|(i, _)| i as u8)
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Deterministic test fingerprint: byte i is `seed + i` (wrapping), so the
    /// first byte — and therefore the L1 bucket — equals `seed`.
    fn make_fp(seed: u8) -> [u8; FP_BYTES] {
        let mut fp = [0u8; FP_BYTES];
        for (i, b) in fp.iter_mut().enumerate() {
            *b = seed.wrapping_add(i as u8);
        }
        fp
    }

    #[test]
    fn test_extract_scent() {
        let fp = make_fp(42);
        let scent = extract_scent(&fp);
        assert_eq!(scent.len(), SCENT_BYTES);
    }

    #[test]
    fn test_scent_distance() {
        let a = [0xFF, 0xFF, 0xFF, 0xFF, 0xFF];
        let b = [0xFF, 0xFF, 0xFF, 0xFF, 0xFF];
        assert_eq!(scent_distance(&a, &b), 0);

        let c = [0x00, 0x00, 0x00, 0x00, 0x00];
        assert_eq!(scent_distance(&a, &c), 40); // All 40 bits differ
    }

    #[test]
    fn test_l1_append_find() {
        let mut idx = ScentIndexL1::new();

        let fp1 = make_fp(0x42);
        let fp2 = make_fp(0x42); // Same bucket
        let fp3 = make_fp(0x99); // Different bucket

        idx.on_append(0x42, &fp1, 0);
        idx.on_append(0x42, &fp2, 1250);
        idx.on_append(0x99, &fp3, 2500);

        assert_eq!(idx.headers[0x42].count, 2);
        assert_eq!(idx.headers[0x99].count, 1);

        // Find should return bucket 0x42 for similar query
        let query = make_fp(0x42);
        let matches = idx.find_chunks(&extract_scent(&query), 10);
        assert!(matches.contains(&0x42));
    }

    #[test]
    fn test_l2_append_find() {
        let mut idx = ScentIndexL2::new();

        let fp = make_fp(0x42);
        idx.on_append(&fp, 0);

        let (l1, _l2) = idx.assign(&fp);
        assert_eq!(l1, 0x42);

        let matches = idx.find_chunks(&extract_scent(&fp), 10);
        assert!(!matches.is_empty());
    }

    #[test]
    fn test_cognitive_markers() {
        let mut idx = ScentIndexL1::new();

        let fp = make_fp(0x10);
        idx.on_append(0x10, &fp, 0);

        // Set plasticity
        idx.set_plasticity(0x10, 0.5);
        assert_eq!(idx.headers[0x10].plasticity, 0.5);

        // Set decision
        idx.set_decision(0x10, 42);
        assert_eq!(idx.headers[0x10].decision, 42);

        // Search with plasticity filter
        let matches = idx.find_chunks_plastic(&extract_scent(&fp), 10, 0.3);
        assert!(matches.contains(&0x10));

        let no_matches = idx.find_chunks_plastic(&extract_scent(&fp), 10, 0.9);
        assert!(!no_matches.contains(&0x10));
    }

    #[test]
    fn test_persistence() {
        let mut idx = ScentIndex::new();

        let fp = make_fp(0x55);
        idx.on_append(&fp, 12345);

        let tmp = tempfile::NamedTempFile::new().unwrap();
        idx.save(tmp.path()).unwrap();

        let loaded = ScentIndex::load(tmp.path()).unwrap();

        assert_eq!(loaded.depth(), 1);
        assert_eq!(loaded.stats().total_fingerprints, 1);
    }

    #[test]
    fn test_bucket_addr_flatten() {
        assert_eq!(BucketAddr::L1(0x42).flatten(), 0x42);
        assert_eq!(BucketAddr::L2(0x12, 0x34).flatten(), 0x1234);
        assert_eq!(BucketAddr::L3(0x12, 0x34, 0x56).flatten(), 0x123456);
    }
}