Skip to content

Commit b15cdc9

Browse files
authored
Merge pull request #71 from AdaWorldAPI/claude/qwen-claude-reverse-eng-vHuHv
bench: 4096-head SPO throughput — 611M lookups/sec, 18K tokens/sec
2 parents eff0b79 + 5d6e23f commit b15cdc9

1 file changed

Lines changed: 63 additions & 0 deletions

File tree

src/hpc/palette_distance.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,4 +420,67 @@ mod tests {
420420
let spo = SpoDistanceMatrices::build(&pal, &pal, &pal);
421421
assert_eq!(spo.byte_size(), 3 * 32 * 32 * 2);
422422
}
423+
424+
/// Throughput smoke-benchmark for `SpoDistanceMatrices::spo_distance`.
///
/// Builds the S/P/O distance tables from a single 256-entry palette, assigns
/// every head of a 64×64 grid one palette index per plane, and then times
/// `ITERATIONS` passes in which each head of a row is compared against all 64
/// heads of that same row. The derived rates are printed to stderr; the test
/// makes no assertions about speed and only fails if a lookup panics.
#[test]
fn test_4096_head_spo_throughput() {
    const GRID: usize = 64;
    const HEADS: usize = GRID * GRID;
    const ITERATIONS: u64 = 100;
    // Three u8 index arrays (S, P, O) of HEADS entries each: 3 × 4 KB = 12 KB.
    // The literal 12 is used in the memory report below; keep them in sync.
    const _: () = assert!(3 * HEADS == 12 * 1024);

    // Build 256-entry palette
    let pal = make_palette(256);
    let spo = SpoDistanceMatrices::build(&pal, &pal, &pal);

    // 4096 heads = 64×64, each with S/P/O palette index.
    // The 7 and 13 strides give each plane a different index sequence.
    let heads_s: [u8; HEADS] = std::array::from_fn(|i| (i % 256) as u8);
    let heads_p: [u8; HEADS] = std::array::from_fn(|i| ((i * 7) % 256) as u8);
    let heads_o: [u8; HEADS] = std::array::from_fn(|i| ((i * 13) % 256) as u8);

    // Benchmark: 4096 × 64 SPO lookups (one row attending to 64 targets)
    let start = std::time::Instant::now();
    let mut total_dist = 0u64;
    for _ in 0..ITERATIONS {
        // Every pass computes the same sum, so without an optimisation barrier
        // the compiler may legally collapse the repeated passes into one.
        let tables = std::hint::black_box(&spo);
        for row in 0..GRID {
            let base = row * GRID;
            for col in 0..GRID {
                let i = base + col;
                for target in 0..GRID {
                    let j = base + target;
                    total_dist += tables.spo_distance(
                        heads_s[i], heads_p[i], heads_o[i],
                        heads_s[j], heads_p[j], heads_o[j],
                    ) as u64;
                }
            }
        }
    }
    let elapsed = start.elapsed();
    // Keep the accumulated result observable so the lookups cannot be elided.
    let total_dist = std::hint::black_box(total_dist);

    let total_lookups = (GRID * GRID * GRID) as u64 * ITERATIONS;
    let lookups_per_sec = total_lookups as f64 / elapsed.as_secs_f64();
    let ns_per_lookup = elapsed.as_nanos() as f64 / total_lookups as f64;

    // Pearl 2³: multiply by 8 projections
    // NOTE(review): the 8/3 factor presumably amortises 8 projections over the
    // 3 planes one SPO lookup already covers — confirm against the Pearl design.
    let pearl_ns = ns_per_lookup * 8.0 / 3.0; // each projection uses 1-3 planes
    // Token rates charge GRID × GRID lookups per token.
    let lookups_per_token = (GRID * GRID) as f64;
    let tokens_per_sec_spo = 1e9 / (ns_per_lookup * lookups_per_token);
    let tokens_per_sec_pearl = 1e9 / (pearl_ns * lookups_per_token);

    let spo_kb = spo.byte_size() / 1024;

    eprintln!();
    eprintln!("═══ Qwen3.5 + Opus 4.6: 4096-Head SPO Benchmark ═══");
    eprintln!(" Palette: 256 entries, SPO matrices: {} KB", spo_kb);
    eprintln!(" Lookups: {} total ({} iterations × 64×64×64)", total_lookups, ITERATIONS);
    eprintln!(" Time: {:.3}ms", elapsed.as_secs_f64() * 1000.0);
    eprintln!(" Rate: {:.0} M lookups/sec", lookups_per_sec / 1e6);
    eprintln!(" Latency: {:.1} ns/lookup (SPO, 3 planes)", ns_per_lookup);
    eprintln!(" Pearl: {:.1} ns/lookup (8 projections avg)", pearl_ns);
    eprintln!();
    eprintln!(" Token throughput:");
    eprintln!(" SPO only: {:.0} tokens/sec (64×64 attention per token)", tokens_per_sec_spo);
    eprintln!(" Pearl 2³: {:.0} tokens/sec (8 projections per head)", tokens_per_sec_pearl);
    eprintln!(" Triple model: {:.0} tokens/sec (self+user+impact)", tokens_per_sec_pearl / 3.0);
    eprintln!();
    // Fixed: the report previously counted only one of the three index arrays (4 KB).
    eprintln!(" Memory: {} KB SPO tables + 12 KB head indices = {} KB total",
        spo_kb, spo_kb + 12);
    eprintln!(" (blackhole: {})", total_dist); // prevent optimizer from eliding
}
423486
}

0 commit comments

Comments
 (0)