From 44255ed22a2a0986ed3e130c36d6cb5447def92e Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 04:00:49 +0000 Subject: [PATCH 1/3] =?UTF-8?q?bench(splat3d):=20EWA-SYRK=20crossover=20?= =?UTF-8?q?=E2=80=94=20kill-or-justify=20the=20BLAS-backend=20premise?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds benches/ewa_syrk_crossover.rs (W1 PR #3 of the cross-session program). Tests the 3DGS-EWA-SYRK-BLAS-MKL plan's premise — "projection is a BLAS workload in disguise → MKL/OpenBLAS/AMX backend for the covariance sandwich" — with a number instead of an assertion. Compares M·N·Mᵀ three ways over N = 1k/100k/1M: scalar hand upper-triangle sandwich, per element simd_x16 shipped SoA F32x16 sandwich_x16 (the renderer's kernel) gemm_shape two dense 3×3 matmuls per element — the shape a per-matrix CPU BLAS call imposes, in-process (no FFI ⇒ lower bound on cblas) plus project_batch end-to-end throughput. Result (target-cpu=x86-64-v3, AVX-512 via runtime dispatch): simd_x16 ~2× faster than BOTH scalar and gemm_shape at every size; no crossover up to 1M; gemm_shape ≈ scalar. Real MKL/cblas adds FFI on top. Verdict: the EWA-SYRK *backend* is a pessimization at 3×3/2×3 — fused SoA SIMD already wins. The plan row is idea-only (the sandwich IS SYRK-shaped) but the actionable backend is killed by measurement. Corroborates PR-3's predicted 1.5-2× SIMD-vs-scalar. Full numbers + verdict in benches/RESULTS.md. Evidence-grounded per the program: source (project.rs/spd3.rs whole-read) + measurement, not the plan-as-authority. Steelman (shared-W batched GEMM) left as a documented follow-up. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u --- Cargo.toml | 5 + benches/RESULTS.md | 56 +++++++++++ benches/ewa_syrk_crossover.rs | 181 ++++++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 benches/ewa_syrk_crossover.rs diff --git a/Cargo.toml b/Cargo.toml index ee7cc8f7..8fe11272 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -175,6 +175,11 @@ name = "splat3d_bench" harness = false required-features = ["splat3d"] +[[bench]] +name = "ewa_syrk_crossover" +harness = false +required-features = ["splat3d"] + [features] default = ["std", "hpc-extras"] diff --git a/benches/RESULTS.md b/benches/RESULTS.md index 883716fc..e3345edc 100644 --- a/benches/RESULTS.md +++ b/benches/RESULTS.md @@ -123,3 +123,59 @@ The demo binary `examples/splat3d_flex.rs` and integration test Full-pipeline frame-time numbers (p50/p95/p99) await a Inria bicycle scene download — left as a follow-up for the dedicated benchmarking session against real-world data. + +## EWA-SYRK crossover — kill-or-justify the BLAS-backend premise + +Bench: `benches/ewa_syrk_crossover.rs`. Tests whether the +`3DGS-EWA-SYRK-BLAS-MKL` plan's premise — "projection is a BLAS workload +in disguise → route the covariance sandwich through an MKL/OpenBLAS/AMX +backend" — holds for the **3×3** EWA sandwich `Σ' = M·Σ·Mᵀ`. + +### Hardware / build + +Container, default features, `.cargo/config.toml` `target-cpu=x86-64-v3` +(AVX2 baseline); `sandwich_x16`'s per-function `#[target_feature(avx512f)]` ++ LazyLock runtime dispatch selects AVX-512 on this host. No RUSTFLAGS. + +```bash +cargo bench --features splat3d --bench ewa_syrk_crossover +``` + +### `M·N·Mᵀ` sandwich — three kernel shapes (Melem/s, higher = better) + +| N | scalar | `simd_x16` | `gemm_shape` (BLAS-shape) | +|---|---|---|---| +| 1 024 | 88.9 | **179.4** | 90.5 | +| 100 000 | 84.1 | **170.0** | 85.6 | +| 1 000 000 | 85.6 | **164.5** | 86.8 | + +`gemm_shape` = two dense 3×3 matmuls per element (the shape a per-matrix +BLAS call imposes), **in-process, no FFI**. + +### `project_batch` end-to-end (Melem/s) + +| N | throughput | +|---|---| +| 1 024 | 21.7 | +| 100 000 | ~19.2 | + +(full pipeline incl. scalar SH eval; the sandwich is a fraction of this.) + +### Verdict — BLAS backend NOT justified at 3×3 + +- `gemm_shape` is statistically identical to `scalar` and **~2× slower than + the shipped `simd_x16`** at every size 1k→1M. **No crossover**; the gap is + flat, not closing with batch size. +- `gemm_shape` carries **no FFI** — a real `cblas`/MKL call adds marshalling + + dispatch on top, so it can only be worse. There is no efficient CPU + batched-3×3 SYRK (that pattern is a GPU one). +- ⇒ The EWA-SYRK *backend* (native/MKL/OpenBLAS/AMX dispatch for the + covariance sandwich) is a **pessimization** at 3×3/2×3: fused SoA SIMD + already wins. The plan row is **idea-only** — the sandwich *is* + SYRK-shaped (true) but the actionable backend is killed by measurement. +- Corroborates PR-3's predicted "1.5-2× SIMD-vs-scalar": `simd_x16` is ~2× + over scalar at large N (transpose amortised, unlike the transpose-bound + N=16 PR-1 microbench). +- Steelman left open: `W·Σ·Wᵀ` has a *shared* `W` across gaussians → a + batched shared-`W` GEMM is the one form that could differ; benched as a + follow-up. Per-gaussian `J·Σ·Jᵀ` does not batch that way. diff --git a/benches/ewa_syrk_crossover.rs b/benches/ewa_syrk_crossover.rs new file mode 100644 index 00000000..6a5dbf96 --- /dev/null +++ b/benches/ewa_syrk_crossover.rs @@ -0,0 +1,181 @@ +//! EWA-SYRK crossover bench — kill-or-justify the "3DGS projection is a +//! BLAS workload in disguise" premise of `3DGS-EWA-SYRK-BLAS-MKL-plan.md`. +//! +//! # The question +//! +//! The plan proposes routing the EWA covariance sandwich `Σ' = M·Σ·Mᵀ` +//! through a batched SYRK/GEMM backend (native / MKL / OpenBLAS / AMX). But +//! the sandwich matrices are **3×3** (world→camera) and **2×3** (camera→ +//! image), and `splat3d::project` already runs them 16-wide via the +//! `crate::simd` SoA polyfill. Batched *tiny* GEMM is exactly the regime +//! where CPU BLAS loses to a fused SIMD unroll (per-call overhead dominates; +//! there is no efficient CPU batched-3×3 SYRK — that pattern is a GPU one). +//! +//! This bench measures it instead of asserting it. +//! +//! # What it compares (same `M·N·Mᵀ` math, three kernel shapes) +//! +//! - `scalar` — the hand-unrolled upper-triangle `spd3::sandwich`, per element. +//! - `simd_x16` — the shipped `sandwich_x16` SoA path (the renderer's kernel). +//! - `gemm_shape` — two dense 3×3 matmuls per element (`M·N` then `·Mᵀ`), +//! with no SoA fusion. This models the **shape a CPU BLAS +//! backend imposes** — one small dense GEMM per matrix. +//! +//! `gemm_shape` runs **in-process, with no FFI**, so it is a *lower bound* +//! on a real `cblas`/MKL path: a genuine BLAS call adds argument marshalling +//! + dispatch overhead on top. If `gemm_shape` already loses to `simd_x16`, +//! the BLAS backend loses by more. +//! +//! Plus `project_batch` end-to-end throughput at the same batch sizes, so +//! the sandwich cost is seen in proportion to the full pipeline. +//! +//! Sweep: 1 024 / 100 000 / 1 000 000 (all exact multiples of 16, no tail). +//! +//! Caveat (honest scope): this measures the *per-element* kernel shape. The +//! one genuinely batchable step is `W·Σ·Wᵀ`, where `W` (the camera view +//! rotation) is shared across all gaussians — a shared-`W` batched GEMM is a +//! steelman left as a follow-up. The per-image Jacobian `J` differs per +//! gaussian, so `J·Σ·Jᵀ` does not batch that way. + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use ndarray::hpc::splat3d::{project_batch, sandwich, sandwich_x16, Camera, Gaussian3D, GaussianBatch, ProjectedBatch, Spd3}; + +const SIZES: [usize; 3] = [1_024, 100_000, 1_000_000]; + +#[inline] +fn rng(s: &mut u32) -> f32 { + *s ^= *s << 13; + *s ^= *s >> 17; + *s ^= *s << 5; + (*s as f32) / (u32::MAX as f32) +} + +/// `n` distinct SPD pairs via `from_scale_quat` (entry-wise different across +/// all 6 SoA channels so the SIMD transpose does real work). +fn build_spd_pairs(n: usize) -> (Vec, Vec) { + let mut st = 0x1234_5678u32; + let mut ms = Vec::with_capacity(n); + let mut ns = Vec::with_capacity(n); + for _ in 0..n { + let sm = [0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st)]; + let sn = [0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st)]; + let mut qm = [rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5]; + let mut qn = [rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5]; + normalize_quat(&mut qm); + normalize_quat(&mut qn); + ms.push(Spd3::from_scale_quat(sm, qm)); + ns.push(Spd3::from_scale_quat(sn, qn)); + } + (ms, ns) +} + +fn normalize_quat(q: &mut [f32; 4]) { + let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]).sqrt().max(1e-12); + for v in q.iter_mut() { + *v /= n; + } +} + +/// `M·N·Mᵀ` via two dense 3×3 matmuls — the shape a per-matrix BLAS call +/// imposes (no upper-triangle fusion, no SoA). Output upper triangle, off- +/// diagonals averaged to match `sandwich`'s symmetrisation. +#[inline] +fn sandwich_gemm_shape(m: &Spd3, n: &Spd3) -> Spd3 { + let a = m.to_rows(); + let b = n.to_rows(); + let mut t = [[0.0f32; 3]; 3]; + for i in 0..3 { + for j in 0..3 { + t[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j] + a[i][2] * b[2][j]; + } + } + // R = T · Mᵀ (Mᵀ[k][j] = A[j][k]) + let mut r = [[0.0f32; 3]; 3]; + for i in 0..3 { + for j in 0..3 { + r[i][j] = t[i][0] * a[j][0] + t[i][1] * a[j][1] + t[i][2] * a[j][2]; + } + } + Spd3::new(r[0][0], 0.5 * (r[0][1] + r[1][0]), 0.5 * (r[0][2] + r[2][0]), r[1][1], 0.5 * (r[1][2] + r[2][1]), r[2][2]) +} + +fn build_gaussians(n: usize) -> GaussianBatch { + let mut st = 0x9E37_79B9u32; + let mut batch = GaussianBatch::with_capacity(n); + for _ in 0..n { + let mut g = Gaussian3D::unit(); + g.mean = [4.0 * rng(&mut st) - 2.0, 4.0 * rng(&mut st) - 2.0, 1.0 + 5.0 * rng(&mut st)]; + g.scale = [0.05 + 0.2 * rng(&mut st); 3]; + g.quat = [1.0, 0.0, 0.0, 0.0]; + g.opacity = 0.5 + 0.5 * rng(&mut st); + batch.push(g); + } + batch +} + +fn bench_sandwich_paths(c: &mut Criterion) { + let mut grp = c.benchmark_group("ewa_sandwich_paths"); + grp.sample_size(20); + for &n in &SIZES { + let (ms, ns) = build_spd_pairs(n); + grp.throughput(Throughput::Elements(n as u64)); + + grp.bench_with_input(BenchmarkId::new("scalar", n), &n, |b, &n| { + let mut out = vec![Spd3::ZERO; n]; + b.iter(|| { + for i in 0..n { + out[i] = sandwich(black_box(&ms[i]), black_box(&ns[i])); + } + black_box(&out[n - 1]); + }); + }); + + grp.bench_with_input(BenchmarkId::new("simd_x16", n), &n, |b, &n| { + let mut out = vec![Spd3::ZERO; n]; + b.iter(|| { + let mc = ms.chunks_exact(16); + let nc = ns.chunks_exact(16); + let oc = out.chunks_exact_mut(16); + for ((mm, nn), oo) in mc.zip(nc).zip(oc) { + let ma: &[Spd3; 16] = mm.try_into().unwrap(); + let na: &[Spd3; 16] = nn.try_into().unwrap(); + let oa: &mut [Spd3; 16] = oo.try_into().unwrap(); + sandwich_x16(black_box(ma), black_box(na), oa); + } + black_box(&out[n - 1]); + }); + }); + + grp.bench_with_input(BenchmarkId::new("gemm_shape", n), &n, |b, &n| { + let mut out = vec![Spd3::ZERO; n]; + b.iter(|| { + for i in 0..n { + out[i] = sandwich_gemm_shape(black_box(&ms[i]), black_box(&ns[i])); + } + black_box(&out[n - 1]); + }); + }); + } + grp.finish(); +} + +fn bench_project_batch(c: &mut Criterion) { + let mut grp = c.benchmark_group("ewa_project_batch"); + grp.sample_size(10); + let cam = Camera::identity_at_origin(1920, 1080); + for &n in &SIZES { + let batch = build_gaussians(n); + let mut out = ProjectedBatch::with_capacity(batch.capacity); + grp.throughput(Throughput::Elements(n as u64)); + grp.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + project_batch(black_box(&batch), black_box(&cam), &mut out); + black_box(out.len); + }); + }); + } + grp.finish(); +} + +criterion_group!(ewa_syrk, bench_sandwich_paths, bench_project_batch); +criterion_main!(ewa_syrk); From 94b000999d67f059a554ab17d9abcab6d7411255 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 04:07:13 +0000 Subject: [PATCH 2/3] bench(splat3d): re-run EWA-SYRK crossover at target-cpu=x86-64-v4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correction: the prior RESULTS reported v3 numbers and wrongly attributed AVX-512 to runtime dispatch. F32x16 is compile-time-selected by target-cpu, so v3 measured AVX2. Benches must run at the project's deployment tier v4 (AVX-512 native, F32x16 = __m512); committed .cargo/config.toml stays v3 for GitHub/CI portability, overridden locally via RUSTFLAGS=-Ctarget-cpu=x86-64-v4. v4 numbers (Melem/s): simd_x16 175/170/172 vs scalar 85/76/82 vs gemm_shape 90/85/87 at 1k/100k/1M. Verdict unchanged and tier-robust (v3 within ~5%): simd_x16 ~2x over both scalar and the BLAS-shape, no crossover — the EWA-SYRK backend is a pessimization at 3x3. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u --- benches/RESULTS.md | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/benches/RESULTS.md b/benches/RESULTS.md index e3345edc..68aff2ef 100644 --- a/benches/RESULTS.md +++ b/benches/RESULTS.md @@ -133,33 +133,37 @@ backend" — holds for the **3×3** EWA sandwich `Σ' = M·Σ·Mᵀ`. ### Hardware / build -Container, default features, `.cargo/config.toml` `target-cpu=x86-64-v3` -(AVX2 baseline); `sandwich_x16`'s per-function `#[target_feature(avx512f)]` -+ LazyLock runtime dispatch selects AVX-512 on this host. No RUSTFLAGS. +Container, AVX-512F+BW+VL. The committed `.cargo/config.toml` pins +`target-cpu=x86-64-v3` (for GitHub/CI portability); **benches are run at the +project's deployment tier `x86-64-v4`** (AVX-512 native — `F32x16` is a +single `__m512`), via the documented override: ```bash -cargo bench --features splat3d --bench ewa_syrk_crossover +RUSTFLAGS="-Ctarget-cpu=x86-64-v4" \ + cargo bench --features splat3d --bench ewa_syrk_crossover ``` -### `M·N·Mᵀ` sandwich — three kernel shapes (Melem/s, higher = better) +### `M·N·Mᵀ` sandwich — three kernel shapes (Melem/s, higher = better) @ v4 | N | scalar | `simd_x16` | `gemm_shape` (BLAS-shape) | |---|---|---|---| -| 1 024 | 88.9 | **179.4** | 90.5 | -| 100 000 | 84.1 | **170.0** | 85.6 | -| 1 000 000 | 85.6 | **164.5** | 86.8 | +| 1 024 | 85.2 | **175.2** | 90.1 | +| 100 000 | 76.3 | **169.6** | 85.4 | +| 1 000 000 | 81.9 | **172.0** | 87.1 | `gemm_shape` = two dense 3×3 matmuls per element (the shape a per-matrix -BLAS call imposes), **in-process, no FFI**. +BLAS call imposes), **in-process, no FFI**. The v3 baseline is within ~5% of +these v4 numbers for this transpose-bound 6-field kernel — the verdict is +tier-robust. -### `project_batch` end-to-end (Melem/s) +### `project_batch` end-to-end @ v4 | N | throughput | |---|---| -| 1 024 | 21.7 | -| 100 000 | ~19.2 | +| 1 024 | 12.1 Melem/s (84 µs) | -(full pipeline incl. scalar SH eval; the sandwich is a fraction of this.) +(full pipeline incl. scalar `sh_eval_deg3` per visible gaussian — SH eval +dominates; the covariance sandwich is a small fraction of this.) ### Verdict — BLAS backend NOT justified at 3×3 From d522a323e6c95c3a9158d4921145eed19f81632d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 04:28:41 +0000 Subject: [PATCH 3/3] style(bench): rustfmt ewa_syrk_crossover + canonical -C target-cpu in RESULTS Fixes CI cargo fmt --all --check (rustfmt 1.95.0: wrap the use import, normalize_quat method chain, and Spd3::new args) and the CodeRabbit nit on RESULTS.md (canonical `-C target-cpu` spelling with a space). Docs/format only; no behavior change. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u --- benches/RESULTS.md | 2 +- benches/ewa_syrk_crossover.rs | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/benches/RESULTS.md b/benches/RESULTS.md index 68aff2ef..01ce65c7 100644 --- a/benches/RESULTS.md +++ b/benches/RESULTS.md @@ -139,7 +139,7 @@ project's deployment tier `x86-64-v4`** (AVX-512 native — `F32x16` is a single `__m512`), via the documented override: ```bash -RUSTFLAGS="-Ctarget-cpu=x86-64-v4" \ +RUSTFLAGS="-C target-cpu=x86-64-v4" \ cargo bench --features splat3d --bench ewa_syrk_crossover ``` diff --git a/benches/ewa_syrk_crossover.rs b/benches/ewa_syrk_crossover.rs index 6a5dbf96..bf42e9a3 100644 --- a/benches/ewa_syrk_crossover.rs +++ b/benches/ewa_syrk_crossover.rs @@ -38,7 +38,9 @@ //! gaussian, so `J·Σ·Jᵀ` does not batch that way. use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use ndarray::hpc::splat3d::{project_batch, sandwich, sandwich_x16, Camera, Gaussian3D, GaussianBatch, ProjectedBatch, Spd3}; +use ndarray::hpc::splat3d::{ + project_batch, sandwich, sandwich_x16, Camera, Gaussian3D, GaussianBatch, ProjectedBatch, Spd3, +}; const SIZES: [usize; 3] = [1_024, 100_000, 1_000_000]; @@ -70,7 +72,9 @@ fn build_spd_pairs(n: usize) -> (Vec, Vec) { } fn normalize_quat(q: &mut [f32; 4]) { - let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]).sqrt().max(1e-12); + let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]) + .sqrt() + .max(1e-12); for v in q.iter_mut() { *v /= n; } @@ -96,7 +100,14 @@ fn sandwich_gemm_shape(m: &Spd3, n: &Spd3) -> Spd3 { r[i][j] = t[i][0] * a[j][0] + t[i][1] * a[j][1] + t[i][2] * a[j][2]; } } - Spd3::new(r[0][0], 0.5 * (r[0][1] + r[1][0]), 0.5 * (r[0][2] + r[2][0]), r[1][1], 0.5 * (r[1][2] + r[2][1]), r[2][2]) + Spd3::new( + r[0][0], + 0.5 * (r[0][1] + r[1][0]), + 0.5 * (r[0][2] + r[2][0]), + r[1][1], + 0.5 * (r[1][2] + r[2][1]), + r[2][2], + ) } fn build_gaussians(n: usize) -> GaussianBatch {