From 44255ed22a2a0986ed3e130c36d6cb5447def92e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 04:00:49 +0000
Subject: [PATCH 1/3] =?UTF-8?q?bench(splat3d):=20EWA-SYRK=20crossover=20?=
 =?UTF-8?q?=E2=80=94=20kill-or-justify=20the=20BLAS-backend=20premise?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds benches/ewa_syrk_crossover.rs (W1 PR #3 of the cross-session program).
Tests the 3DGS-EWA-SYRK-BLAS-MKL plan's premise — "projection is a BLAS
workload in disguise → MKL/OpenBLAS/AMX backend for the covariance sandwich"
— with a number instead of an assertion.

Compares M·N·Mᵀ three ways over N = 1k/100k/1M:
  scalar       hand upper-triangle sandwich, per element
  simd_x16     shipped SoA F32x16 sandwich_x16 (the renderer's kernel)
  gemm_shape   two dense 3×3 matmuls per element — the shape a per-matrix
               CPU BLAS call imposes, in-process (no FFI ⇒ lower bound on cblas)
plus project_batch end-to-end throughput.

Result (target-cpu=x86-64-v3, AVX-512 via runtime dispatch):
  simd_x16 ~2× faster than BOTH scalar and gemm_shape at every size; no
  crossover up to 1M; gemm_shape ≈ scalar. Real MKL/cblas adds FFI on top.

Verdict: the EWA-SYRK *backend* is a pessimization at 3×3/2×3 — fused SoA
SIMD already wins. The plan row is idea-only (the sandwich IS SYRK-shaped)
but the actionable backend is killed by measurement. Corroborates PR-3's
predicted 1.5-2× SIMD-vs-scalar. Full numbers + verdict in benches/RESULTS.md.

Evidence-grounded per the program: source (project.rs/spd3.rs whole-read) +
measurement, not the plan-as-authority. Steelman (shared-W batched GEMM)
left as a documented follow-up.

https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
---
 Cargo.toml                    |   5 +
 benches/RESULTS.md            |  56 +++++++++++
 benches/ewa_syrk_crossover.rs | 181 ++++++++++++++++++++++++++++++++++
 3 files changed, 242 insertions(+)
 create mode 100644 benches/ewa_syrk_crossover.rs

diff --git a/Cargo.toml b/Cargo.toml
index ee7cc8f7..8fe11272 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -175,6 +175,11 @@ name = "splat3d_bench"
 harness = false
 required-features = ["splat3d"]
 
+[[bench]]
+name = "ewa_syrk_crossover"
+harness = false
+required-features = ["splat3d"]
+
 [features]
 default = ["std", "hpc-extras"]
 
diff --git a/benches/RESULTS.md b/benches/RESULTS.md
index 883716fc..e3345edc 100644
--- a/benches/RESULTS.md
+++ b/benches/RESULTS.md
@@ -123,3 +123,59 @@ The demo binary `examples/splat3d_flex.rs` and integration test
 Full-pipeline frame-time numbers (p50/p95/p99) await a Inria bicycle
 scene download — left as a follow-up for the dedicated benchmarking
 session against real-world data.
+
+## EWA-SYRK crossover — kill-or-justify the BLAS-backend premise
+
+Bench: `benches/ewa_syrk_crossover.rs`. Tests whether the
+`3DGS-EWA-SYRK-BLAS-MKL` plan's premise — "projection is a BLAS workload
+in disguise → route the covariance sandwich through an MKL/OpenBLAS/AMX
+backend" — holds for the **3×3** EWA sandwich `Σ' = M·Σ·Mᵀ`.
+
+### Hardware / build
+
+Container, default features, `.cargo/config.toml` `target-cpu=x86-64-v3`
+(AVX2 baseline); `sandwich_x16`'s per-function `#[target_feature(avx512f)]`
++ LazyLock runtime dispatch selects AVX-512 on this host. No RUSTFLAGS.
+
+```bash
+cargo bench --features splat3d --bench ewa_syrk_crossover
+```
+
+### `M·N·Mᵀ` sandwich — three kernel shapes (Melem/s, higher = better)
+
+| N | scalar | `simd_x16` | `gemm_shape` (BLAS-shape) |
+|---|---|---|---|
+| 1 024 | 88.9 | **179.4** | 90.5 |
+| 100 000 | 84.1 | **170.0** | 85.6 |
+| 1 000 000 | 85.6 | **164.5** | 86.8 |
+
+`gemm_shape` = two dense 3×3 matmuls per element (the shape a per-matrix
+BLAS call imposes), **in-process, no FFI**.
+
+### `project_batch` end-to-end (Melem/s)
+
+| N | throughput |
+|---|---|
+| 1 024 | 21.7 |
+| 100 000 | ~19.2 |
+
+(full pipeline incl. scalar SH eval; the sandwich is a fraction of this.)
+
+### Verdict — BLAS backend NOT justified at 3×3
+
+- `gemm_shape` is statistically identical to `scalar` and **~2× slower than
+  the shipped `simd_x16`** at every size 1k→1M. **No crossover**; the gap is
+  flat, not closing with batch size.
+- `gemm_shape` carries **no FFI** — a real `cblas`/MKL call adds marshalling
+  + dispatch on top, so it can only be worse. There is no efficient CPU
+  batched-3×3 SYRK (that pattern is a GPU one).
+- ⇒ The EWA-SYRK *backend* (native/MKL/OpenBLAS/AMX dispatch for the
+  covariance sandwich) is a **pessimization** at 3×3/2×3: fused SoA SIMD
+  already wins. The plan row is **idea-only** — the sandwich *is*
+  SYRK-shaped (true) but the actionable backend is killed by measurement.
+- Corroborates PR-3's predicted "1.5-2× SIMD-vs-scalar": `simd_x16` is ~2×
+  over scalar at large N (transpose amortised, unlike the transpose-bound
+  N=16 PR-1 microbench).
+- Steelman left open: `W·Σ·Wᵀ` has a *shared* `W` across gaussians → a
+  batched shared-`W` GEMM is the one form that could differ; benched as a
+  follow-up. Per-gaussian `J·Σ·Jᵀ` does not batch that way.
diff --git a/benches/ewa_syrk_crossover.rs b/benches/ewa_syrk_crossover.rs
new file mode 100644
index 00000000..6a5dbf96
--- /dev/null
+++ b/benches/ewa_syrk_crossover.rs
@@ -0,0 +1,181 @@
+//! EWA-SYRK crossover bench — kill-or-justify the "3DGS projection is a
+//! BLAS workload in disguise" premise of `3DGS-EWA-SYRK-BLAS-MKL-plan.md`.
+//!
+//! # The question
+//!
+//! The plan proposes routing the EWA covariance sandwich `Σ' = M·Σ·Mᵀ`
+//! through a batched SYRK/GEMM backend (native / MKL / OpenBLAS / AMX). But
+//! the sandwich matrices are **3×3** (world→camera) and **2×3** (camera→
+//! image), and `splat3d::project` already runs them 16-wide via the
+//! `crate::simd` SoA polyfill. Batched *tiny* GEMM is exactly the regime
+//! where CPU BLAS loses to a fused SIMD unroll (per-call overhead dominates;
+//! there is no efficient CPU batched-3×3 SYRK — that pattern is a GPU one).
+//!
+//! This bench measures it instead of asserting it.
+//!
+//! # What it compares (same `M·N·Mᵀ` math, three kernel shapes)
+//!
+//! - `scalar`     — the hand-unrolled upper-triangle `spd3::sandwich`, per element.
+//! - `simd_x16`   — the shipped `sandwich_x16` SoA path (the renderer's kernel).
+//! - `gemm_shape` — two dense 3×3 matmuls per element (`M·N` then `·Mᵀ`),
+//!                  with no SoA fusion. This models the **shape a CPU BLAS
+//!                  backend imposes** — one small dense GEMM per matrix.
+//!
+//! `gemm_shape` runs **in-process, with no FFI**, so it is a *lower bound*
+//! on a real `cblas`/MKL path: a genuine BLAS call adds argument marshalling
+//! + dispatch overhead on top. If `gemm_shape` already loses to `simd_x16`,
+//! the BLAS backend loses by more.
+//!
+//! Plus `project_batch` end-to-end throughput at the same batch sizes, so
+//! the sandwich cost is seen in proportion to the full pipeline.
+//!
+//! Sweep: 1 024 / 100 000 / 1 000 000 (all exact multiples of 16, no tail).
+//!
+//! Caveat (honest scope): this measures the *per-element* kernel shape. The
+//! one genuinely batchable step is `W·Σ·Wᵀ`, where `W` (the camera view
+//! rotation) is shared across all gaussians — a shared-`W` batched GEMM is a
+//! steelman left as a follow-up. The per-image Jacobian `J` differs per
+//! gaussian, so `J·Σ·Jᵀ` does not batch that way.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use ndarray::hpc::splat3d::{project_batch, sandwich, sandwich_x16, Camera, Gaussian3D, GaussianBatch, ProjectedBatch, Spd3};
+
+const SIZES: [usize; 3] = [1_024, 100_000, 1_000_000];
+
+#[inline]
+fn rng(s: &mut u32) -> f32 {
+    *s ^= *s << 13;
+    *s ^= *s >> 17;
+    *s ^= *s << 5;
+    (*s as f32) / (u32::MAX as f32)
+}
+
+/// `n` distinct SPD pairs via `from_scale_quat` (entry-wise different across
+/// all 6 SoA channels so the SIMD transpose does real work).
+fn build_spd_pairs(n: usize) -> (Vec<Spd3>, Vec<Spd3>) {
+    let mut st = 0x1234_5678u32;
+    let mut ms = Vec::with_capacity(n);
+    let mut ns = Vec::with_capacity(n);
+    for _ in 0..n {
+        let sm = [0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st)];
+        let sn = [0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st), 0.3 + 1.5 * rng(&mut st)];
+        let mut qm = [rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5];
+        let mut qn = [rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5, rng(&mut st) - 0.5];
+        normalize_quat(&mut qm);
+        normalize_quat(&mut qn);
+        ms.push(Spd3::from_scale_quat(sm, qm));
+        ns.push(Spd3::from_scale_quat(sn, qn));
+    }
+    (ms, ns)
+}
+
+fn normalize_quat(q: &mut [f32; 4]) {
+    let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]).sqrt().max(1e-12);
+    for v in q.iter_mut() {
+        *v /= n;
+    }
+}
+
+/// `M·N·Mᵀ` via two dense 3×3 matmuls — the shape a per-matrix BLAS call
+/// imposes (no upper-triangle fusion, no SoA). Output upper triangle, off-
+/// diagonals averaged to match `sandwich`'s symmetrisation.
+#[inline]
+fn sandwich_gemm_shape(m: &Spd3, n: &Spd3) -> Spd3 {
+    let a = m.to_rows();
+    let b = n.to_rows();
+    let mut t = [[0.0f32; 3]; 3];
+    for i in 0..3 {
+        for j in 0..3 {
+            t[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j] + a[i][2] * b[2][j];
+        }
+    }
+    // R = T · Mᵀ  (Mᵀ[k][j] = A[j][k])
+    let mut r = [[0.0f32; 3]; 3];
+    for i in 0..3 {
+        for j in 0..3 {
+            r[i][j] = t[i][0] * a[j][0] + t[i][1] * a[j][1] + t[i][2] * a[j][2];
+        }
+    }
+    Spd3::new(r[0][0], 0.5 * (r[0][1] + r[1][0]), 0.5 * (r[0][2] + r[2][0]), r[1][1], 0.5 * (r[1][2] + r[2][1]), r[2][2])
+}
+
+fn build_gaussians(n: usize) -> GaussianBatch {
+    let mut st = 0x9E37_79B9u32;
+    let mut batch = GaussianBatch::with_capacity(n);
+    for _ in 0..n {
+        let mut g = Gaussian3D::unit();
+        g.mean = [4.0 * rng(&mut st) - 2.0, 4.0 * rng(&mut st) - 2.0, 1.0 + 5.0 * rng(&mut st)];
+        g.scale = [0.05 + 0.2 * rng(&mut st); 3];
+        g.quat = [1.0, 0.0, 0.0, 0.0];
+        g.opacity = 0.5 + 0.5 * rng(&mut st);
+        batch.push(g);
+    }
+    batch
+}
+
+fn bench_sandwich_paths(c: &mut Criterion) {
+    let mut grp = c.benchmark_group("ewa_sandwich_paths");
+    grp.sample_size(20);
+    for &n in &SIZES {
+        let (ms, ns) = build_spd_pairs(n);
+        grp.throughput(Throughput::Elements(n as u64));
+
+        grp.bench_with_input(BenchmarkId::new("scalar", n), &n, |b, &n| {
+            let mut out = vec![Spd3::ZERO; n];
+            b.iter(|| {
+                for i in 0..n {
+                    out[i] = sandwich(black_box(&ms[i]), black_box(&ns[i]));
+                }
+                black_box(&out[n - 1]);
+            });
+        });
+
+        grp.bench_with_input(BenchmarkId::new("simd_x16", n), &n, |b, &n| {
+            let mut out = vec![Spd3::ZERO; n];
+            b.iter(|| {
+                let mc = ms.chunks_exact(16);
+                let nc = ns.chunks_exact(16);
+                let oc = out.chunks_exact_mut(16);
+                for ((mm, nn), oo) in mc.zip(nc).zip(oc) {
+                    let ma: &[Spd3; 16] = mm.try_into().unwrap();
+                    let na: &[Spd3; 16] = nn.try_into().unwrap();
+                    let oa: &mut [Spd3; 16] = oo.try_into().unwrap();
+                    sandwich_x16(black_box(ma), black_box(na), oa);
+                }
+                black_box(&out[n - 1]);
+            });
+        });
+
+        grp.bench_with_input(BenchmarkId::new("gemm_shape", n), &n, |b, &n| {
+            let mut out = vec![Spd3::ZERO; n];
+            b.iter(|| {
+                for i in 0..n {
+                    out[i] = sandwich_gemm_shape(black_box(&ms[i]), black_box(&ns[i]));
+                }
+                black_box(&out[n - 1]);
+            });
+        });
+    }
+    grp.finish();
+}
+
+fn bench_project_batch(c: &mut Criterion) {
+    let mut grp = c.benchmark_group("ewa_project_batch");
+    grp.sample_size(10);
+    let cam = Camera::identity_at_origin(1920, 1080);
+    for &n in &SIZES {
+        let batch = build_gaussians(n);
+        let mut out = ProjectedBatch::with_capacity(batch.capacity);
+        grp.throughput(Throughput::Elements(n as u64));
+        grp.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| {
+            b.iter(|| {
+                project_batch(black_box(&batch), black_box(&cam), &mut out);
+                black_box(out.len);
+            });
+        });
+    }
+    grp.finish();
+}
+
+criterion_group!(ewa_syrk, bench_sandwich_paths, bench_project_batch);
+criterion_main!(ewa_syrk);

From 94b000999d67f059a554ab17d9abcab6d7411255 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 04:07:13 +0000
Subject: [PATCH 2/3] bench(splat3d): re-run EWA-SYRK crossover at
 target-cpu=x86-64-v4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Correction: the prior RESULTS reported v3 numbers and wrongly attributed
AVX-512 to runtime dispatch. F32x16 is compile-time-selected by target-cpu,
so v3 measured AVX2. Benches must run at the project's deployment tier v4
(AVX-512 native, F32x16 = __m512); committed .cargo/config.toml stays v3 for
GitHub/CI portability, overridden locally via RUSTFLAGS=-Ctarget-cpu=x86-64-v4.

v4 numbers (Melem/s): simd_x16 175/170/172 vs scalar 85/76/82 vs gemm_shape
90/85/87 at 1k/100k/1M. Verdict unchanged and tier-robust (v3 within ~5%):
simd_x16 ~2x over both scalar and the BLAS-shape, no crossover — the
EWA-SYRK backend is a pessimization at 3x3.

https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
---
 benches/RESULTS.md | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/benches/RESULTS.md b/benches/RESULTS.md
index e3345edc..68aff2ef 100644
--- a/benches/RESULTS.md
+++ b/benches/RESULTS.md
@@ -133,33 +133,37 @@ backend" — holds for the **3×3** EWA sandwich `Σ' = M·Σ·Mᵀ`.
 
 ### Hardware / build
 
-Container, default features, `.cargo/config.toml` `target-cpu=x86-64-v3`
-(AVX2 baseline); `sandwich_x16`'s per-function `#[target_feature(avx512f)]`
-+ LazyLock runtime dispatch selects AVX-512 on this host. No RUSTFLAGS.
+Container, AVX-512F+BW+VL. The committed `.cargo/config.toml` pins
+`target-cpu=x86-64-v3` (for GitHub/CI portability); **benches are run at the
+project's deployment tier `x86-64-v4`** (AVX-512 native — `F32x16` is a
+single `__m512`), via the documented override:
 
 ```bash
-cargo bench --features splat3d --bench ewa_syrk_crossover
+RUSTFLAGS="-Ctarget-cpu=x86-64-v4" \
+  cargo bench --features splat3d --bench ewa_syrk_crossover
 ```
 
-### `M·N·Mᵀ` sandwich — three kernel shapes (Melem/s, higher = better)
+### `M·N·Mᵀ` sandwich — three kernel shapes (Melem/s, higher = better) @ v4
 
 | N | scalar | `simd_x16` | `gemm_shape` (BLAS-shape) |
 |---|---|---|---|
-| 1 024 | 88.9 | **179.4** | 90.5 |
-| 100 000 | 84.1 | **170.0** | 85.6 |
-| 1 000 000 | 85.6 | **164.5** | 86.8 |
+| 1 024 | 85.2 | **175.2** | 90.1 |
+| 100 000 | 76.3 | **169.6** | 85.4 |
+| 1 000 000 | 81.9 | **172.0** | 87.1 |
 
 `gemm_shape` = two dense 3×3 matmuls per element (the shape a per-matrix
-BLAS call imposes), **in-process, no FFI**.
+BLAS call imposes), **in-process, no FFI**. The v3 baseline is within ~5% of
+these v4 numbers for this transpose-bound 6-field kernel — the verdict is
+tier-robust.
 
-### `project_batch` end-to-end (Melem/s)
+### `project_batch` end-to-end @ v4
 
 | N | throughput |
 |---|---|
-| 1 024 | 21.7 |
-| 100 000 | ~19.2 |
+| 1 024 | 12.1 Melem/s (84 µs) |
 
-(full pipeline incl. scalar SH eval; the sandwich is a fraction of this.)
+(full pipeline incl. scalar `sh_eval_deg3` per visible gaussian — SH eval
+dominates; the covariance sandwich is a small fraction of this.)
 
 ### Verdict — BLAS backend NOT justified at 3×3
 

From d522a323e6c95c3a9158d4921145eed19f81632d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 04:28:41 +0000
Subject: [PATCH 3/3] style(bench): rustfmt ewa_syrk_crossover + canonical -C
 target-cpu in RESULTS

Fixes CI cargo fmt --all --check (rustfmt 1.95.0: wrap the use import,
normalize_quat method chain, and Spd3::new args) and the CodeRabbit nit on
RESULTS.md (canonical `-C target-cpu` spelling with a space). Docs/format
only; no behavior change.

https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
---
 benches/RESULTS.md            |  2 +-
 benches/ewa_syrk_crossover.rs | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/benches/RESULTS.md b/benches/RESULTS.md
index 68aff2ef..01ce65c7 100644
--- a/benches/RESULTS.md
+++ b/benches/RESULTS.md
@@ -139,7 +139,7 @@ project's deployment tier `x86-64-v4`** (AVX-512 native — `F32x16` is a
 single `__m512`), via the documented override:
 
 ```bash
-RUSTFLAGS="-Ctarget-cpu=x86-64-v4" \
+RUSTFLAGS="-C target-cpu=x86-64-v4" \
   cargo bench --features splat3d --bench ewa_syrk_crossover
 ```
 
diff --git a/benches/ewa_syrk_crossover.rs b/benches/ewa_syrk_crossover.rs
index 6a5dbf96..bf42e9a3 100644
--- a/benches/ewa_syrk_crossover.rs
+++ b/benches/ewa_syrk_crossover.rs
@@ -38,7 +38,9 @@
 //! gaussian, so `J·Σ·Jᵀ` does not batch that way.
 
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
-use ndarray::hpc::splat3d::{project_batch, sandwich, sandwich_x16, Camera, Gaussian3D, GaussianBatch, ProjectedBatch, Spd3};
+use ndarray::hpc::splat3d::{
+    project_batch, sandwich, sandwich_x16, Camera, Gaussian3D, GaussianBatch, ProjectedBatch, Spd3,
+};
 
 const SIZES: [usize; 3] = [1_024, 100_000, 1_000_000];
 
@@ -70,7 +72,9 @@ fn build_spd_pairs(n: usize) -> (Vec<Spd3>, Vec<Spd3>) {
 }
 
 fn normalize_quat(q: &mut [f32; 4]) {
-    let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]).sqrt().max(1e-12);
+    let n = (q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3])
+        .sqrt()
+        .max(1e-12);
     for v in q.iter_mut() {
         *v /= n;
     }
@@ -96,7 +100,14 @@ fn sandwich_gemm_shape(m: &Spd3, n: &Spd3) -> Spd3 {
             r[i][j] = t[i][0] * a[j][0] + t[i][1] * a[j][1] + t[i][2] * a[j][2];
         }
     }
-    Spd3::new(r[0][0], 0.5 * (r[0][1] + r[1][0]), 0.5 * (r[0][2] + r[2][0]), r[1][1], 0.5 * (r[1][2] + r[2][1]), r[2][2])
+    Spd3::new(
+        r[0][0],
+        0.5 * (r[0][1] + r[1][0]),
+        0.5 * (r[0][2] + r[2][0]),
+        r[1][1],
+        0.5 * (r[1][2] + r[2][1]),
+        r[2][2],
+    )
 }
 
 fn build_gaussians(n: usize) -> GaussianBatch {