Merge pull request #176 from AdaWorldAPI/claude/pr-x-phase3-neon-tiers

AdaWorldAPI · web-flow · commit 96d01ce9e2d3 · 2026-05-20T16:16:26.000+02:00
feat(simd): Phase 3 scaffold — NEON tier flavors (baseline / dotprod / bf16)
diff --git a/.cargo/config-apple-m2.toml b/.cargo/config-apple-m2.toml
@@ -0,0 +1,20 @@
+[build]
+# Apple M2 / M3 / M4 — ARMv8.6-A+ with BF16, dotprod, fp16, i8mm.
+# Use with:
+#   cargo --config .cargo/config-apple-m2.toml build --target=aarch64-apple-darwin
+#
+# Targets the BF16 tier — see `src/simd_neon_bf16.rs` for the silicon
+# table, runtime detection (`sysctl hw.optional.arm.FEAT_BF16`), the
+# BFMMLA / BFDOT intrinsic family, and the asm-byte fallback path that
+# stable Rust 1.95 must use until `vbfdotq_f32` stabilizes (issue
+# #117222).
+#
+# Also works on:
+#   - Apple M3 (target-cpu=apple-m3) — same ARMv8.6-A baseline
+#   - Apple M4 — adds SVE2, can override with -Ctarget-cpu=apple-m4
+#   - Snapdragon X Elite / X Plus on macOS-like targets (use cortex-x4)
+#
+# DOES NOT target Apple M1 — M1 is ARMv8.5-A and lacks BF16. M1 should
+# use the dotprod tier (config-pi5.toml-shaped, target-cpu=apple-m1).
+[target.aarch64-apple-darwin]
+rustflags = ["-Ctarget-cpu=apple-m2", "-Ctarget-feature=+bf16,+dotprod,+fp16,+i8mm"]
diff --git a/.cargo/config-graviton.toml b/.cargo/config-graviton.toml
@@ -0,0 +1,19 @@
+[build]
+# AWS Graviton 3 / 3E / 4 (Neoverse V1 / V2) — ARMv8.4-A+ with BF16
+# (V1: optional, V2: mandatory) + SVE / SVE2.
+# Use with:
+#   cargo --config .cargo/config-graviton.toml build --target=aarch64-unknown-linux-gnu
+#
+# Targets the BF16 tier — see `src/simd_neon_bf16.rs`. Graviton 3 (V1)
+# also adds SVE 256-bit; Graviton 4 (V2) adds SVE2 + BFMMLA + i8mm.
+#
+# Also works on:
+#   - Cortex-X3 / X4 / X925 generic Linux servers
+#   - Ampere Altra (V1-class — same baseline)
+#   - NVIDIA Grace (V2 — same as Graviton 4)
+#
+# For ARMv9 cores with SVE2 you may want a separate config-sve2.toml
+# later that adds `+sve2` and routes through a future
+# `src/simd_neon_sve2.rs` (not in Phase 3 scope).
+[target.aarch64-unknown-linux-gnu]
+rustflags = ["-Ctarget-cpu=neoverse-v2", "-Ctarget-feature=+bf16,+dotprod,+fp16,+i8mm"]
diff --git a/.cargo/config-pi5.toml b/.cargo/config-pi5.toml
@@ -0,0 +1,15 @@
+[build]
+# Raspberry Pi 5 (BCM2712, Cortex-A76) — ARMv8.2-A with dotprod + fp16.
+# Use with:
+#   cargo --config .cargo/config-pi5.toml build --target=aarch64-unknown-linux-gnu
+#
+# Targets the dotprod/fp16 tier — see `src/simd_neon_dotprod.rs` for the
+# silicon table, runtime detection, and stub map. Also works on:
+#   - Orange Pi 5 (Rockchip RK3588, Cortex-A76)
+#   - Anything reporting `Features: ... asimddp asimdhp ...` in
+#     /proc/cpuinfo without `bf16`.
+#
+# For Apple M2+ / Snapdragon X / Graviton 4, use config-apple-m2.toml
+# (BF16 tier — see src/simd_neon_bf16.rs).
+[target.aarch64-unknown-linux-gnu]
+rustflags = ["-Ctarget-cpu=cortex-a76", "-Ctarget-feature=+dotprod,+fp16"]
diff --git a/src/lib.rs b/src/lib.rs
@@ -266,6 +266,24 @@ pub mod simd_amx;
 #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
 pub mod simd_neon;
 
+// NEON tier scaffolds — Phase 3 of the SIMD integration plan
+// (.claude/knowledge/simd-dispatch-architecture.md § 6).
+//
+// Each file documents the silicon, the runtime + compile-time detection
+// path, and stubs out the F16 / BF16 wrappers with intrinsic maps for
+// future implementation. Current state: scaffolds only — the actual
+// NEON code still lives in `simd_neon.rs::aarch64_simd` and gets
+// migrated tier-by-tier as the Phase 3 sprints land.
+#[cfg(all(target_arch = "aarch64", feature = "std"))]
+#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
+pub mod simd_neon_baseline;
+#[cfg(all(target_arch = "aarch64", feature = "std"))]
+#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
+pub mod simd_neon_dotprod;
+#[cfg(all(target_arch = "aarch64", feature = "std"))]
+#[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
+pub mod simd_neon_bf16;
+
 #[cfg(feature = "std")]
 #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)]
 pub mod simd_wasm;
diff --git a/src/simd_neon_baseline.rs b/src/simd_neon_baseline.rs
@@ -0,0 +1,81 @@
+//! NEON baseline tier — ARMv8.0-A `+neon` only.
+//!
+//! # Silicon
+//!
+//! Every aarch64 CPU since ARMv8.0-A ships NEON unconditionally. This
+//! tier is the lowest common denominator — the floor every other tier
+//! builds on. Concretely:
+//!
+//! - **Raspberry Pi 3** (BCM2837, Cortex-A53) — ARMv8.0-A
+//! - **Raspberry Pi 4** (BCM2711, Cortex-A72) — ARMv8.0-A
+//! - **Pi CM3 / CM4 (4 GB)** — same A72 silicon
+//! - Anything reporting `Features: ... asimd ...` in `/proc/cpuinfo`
+//!   without `asimddp`, `asimdfhm`, `asimdhp`, or `bf16`.
+//!
+//! # What you get
+//!
+//! Native 128-bit lanes: `float32x4_t`, `float64x2_t`, `int8x16_t`,
+//! `uint8x16_t`, `int16x8_t`, `uint16x8_t`, `int32x4_t`, `int64x2_t`,
+//! `uint32x4_t`, `uint64x2_t`. Standard NEON arithmetic — `vaddq_*`,
+//! `vsubq_*`, `vmulq_*`, `vfmaq_*`, `vminq_*`, `vmaxq_*`, gather /
+//! scatter via `vld1q_*` / `vst1q_*`, lane select, reduce.
+//!
+//! # What you do NOT get
+//!
+//! - **dotprod** (SDOT/UDOT) → see `simd_neon_dotprod.rs`
+//! - **fp16 arithmetic** (`vfmlaq_f16`, `vaddq_f16`) → see
+//!   `simd_neon_dotprod.rs`
+//! - **bf16** (`vbfdotq_f32`, `vbfmlalbq_f32`) → see `simd_neon_bf16.rs`
+//! - **SVE2** (variable-length vectors) → not in any current tier file.
+//!
+//! # 512-bit composed wrappers
+//!
+//! `crate::simd::F32x16` / `U8x64` etc. on aarch64 compose 4× 128-bit
+//! NEON registers into one logical wrapper, e.g.
+//! `pub struct F32x16(pub [float32x4_t; 4])`. The four loads/stores
+//! pipeline well on dual-issue cores (A72, A76, M-series).
+//!
+//! # Cargo config
+//!
+//! No special flags needed. `aarch64-unknown-linux-gnu` / `aarch64-
+//! apple-darwin` already enable NEON. Pi 3/4 cross-builds:
+//! `cargo build --target=aarch64-unknown-linux-gnu` from any host.
+//!
+//! # Status
+//!
+//! Scaffold only — placeholder for Phase 3 implementation. The actual
+//! 128-bit native wrappers (I8x16, I16x8, U8x16, U16x8, U32x4, U64x2,
+//! I32x4, I64x2) currently live in `src/simd_neon.rs::aarch64_simd`.
+//! That code moves here once the tier split lands.
+//!
+//! Composed 512-bit wrappers (`F32x16` = `[float32x4_t; 4]` etc.) for
+//! the 8 missing int types (U8x64, I8x64, I16x32, I32x16, I64x8,
+//! U16x32, U32x16, U64x8) are TODO — currently dispatched to
+//! `simd_scalar.rs` via the `scalar::*` fallback at `simd.rs:1593-95`.
+
+#![cfg(all(target_arch = "aarch64", feature = "std"))]
+
+// TODO(Phase-3): move the existing `pub mod aarch64_simd` block from
+// `src/simd_neon.rs` (lines 463-1126 of master @ 3c20392f) into this
+// file. Then re-export from `simd_neon.rs` for backwards compatibility
+// during the migration window. Same pattern as Phase 4 used to extract
+// `simd_scalar.rs` via `#[path]` declaration.
+
+// TODO(Phase-3): add the 8 missing 512-bit composed wrappers as
+// `[neon_native; 4]`. Apply the `avx2_int_type!`-equivalent macro
+// pattern to generate them mechanically — name it `neon_int_type!` and
+// keep the API surface identical to the AVX-512 / AVX2 / nightly arms.
+//
+//   neon_int_type!(U8x64,  u8,  64, uint8x16_t, vaddq_u8,  vsubq_u8);
+//   neon_int_type!(I8x64,  i8,  64, int8x16_t,  vaddq_s8,  vsubq_s8);
+//   neon_int_type!(U16x32, u16, 32, uint16x8_t, vaddq_u16, vsubq_u16);
+//   neon_int_type!(I16x32, i16, 32, int16x8_t,  vaddq_s16, vsubq_s16);
+//   neon_int_type!(U32x16, u32, 16, uint32x4_t, vaddq_u32, vsubq_u32);
+//   neon_int_type!(I32x16, i32, 16, int32x4_t,  vaddq_s32, vsubq_s32);
+//   neon_int_type!(U64x8,  u64, 8,  uint64x2_t, vaddq_u64, vsubq_u64);
+//   neon_int_type!(I64x8,  i64, 8,  int64x2_t,  vaddq_s64, vsubq_s64);
+
+// TODO(Phase-3): copy the existing F32x16 / F64x8 paired-load impls
+// from `src/simd_neon.rs::aarch64_simd::{F32x16, F64x8, F32Mask16,
+// F64Mask8}` here. They already use the composed `[float32x4_t; 4]` /
+// `[float64x2_t; 4]` layout this tier expects.
diff --git a/src/simd_neon_bf16.rs b/src/simd_neon_bf16.rs
@@ -0,0 +1,204 @@
+//! NEON + BF16 tier — ARMv8.6-A `+bf16` (or ARMv8.2-A + optional `+bf16`).
+//!
+//! Builds on `simd_neon_dotprod.rs`. Adds the BF16 instruction family:
+//! BFDOT, BFMMLA, BFMLALB, BFMLALT, BFCVT. These are the bf16 cousins
+//! of dotprod — same 4× int8 throughput shape, but for the half-the-
+//! width bfloat16 type that LLM inference standardized on.
+//!
+//! # Silicon
+//!
+//! - **Apple M2 / M3 / M4** (Avalanche/Blizzard, Everest/Sawtooth,
+//!   Tupai/Donan) — ARMv8.6-A+. BF16 always on. `sysctl
+//!   hw.optional.arm.FEAT_BF16` returns 1. M1 does NOT have BF16 — it's
+//!   ARMv8.5-A.
+//! - **Snapdragon X Elite / X Plus** (Cortex-X4/X3 cores, Oryon
+//!   prime) — ARMv8.7-A. BF16 always on.
+//! - **Cortex-A510 / A520 / A710 / A720 / X2 / X3 / X4 / X925** —
+//!   ARMv9.0-A+. BF16 always on.
+//! - **NVIDIA Grace** (Neoverse V2) — ARMv9-A. BF16 on.
+//! - **AWS Graviton 3 / 3E / 4** (Neoverse V1/V2) — V1 added BF16 as
+//!   optional ARMv8.4-A extension; V2 makes it mandatory.
+//! - **Ampere One (M-series)** — ARMv8.6-A+. BF16 on.
+//!
+//! # NOT in this tier
+//!
+//! - Apple M1 (ARMv8.5-A, no BF16) — falls back to `simd_neon_dotprod.rs`
+//! - Raspberry Pi 5 (Cortex-A76, ARMv8.2-A, no BF16) — `simd_neon_dotprod.rs`
+//! - Any Pi 3/4 / Cortex-A53/A72 — `simd_neon_baseline.rs`
+//!
+//! # How to detect at runtime
+//!
+//! - **Linux**: `/proc/cpuinfo` Features line should show `bf16`.
+//!   `getauxval(AT_HWCAP2) & HWCAP2_BF16` (bit 14).
+//!   `std::arch::is_aarch64_feature_detected!("bf16")` — recommended.
+//! - **macOS**: `sysctl hw.optional.arm.FEAT_BF16` → `1` means yes.
+//!   On M2+ it's always 1; on M1 it's 0.
+//! - **Windows ARM64**: `IsProcessorFeaturePresent(PF_ARM_V83_BF16)`
+//!   (constant added in Win11 24H2 SDK).
+//!
+//! # How to detect at compile time
+//!
+//! Cargo config flags:
+//! - `-Ctarget-feature=+bf16` — enables BF16 intrinsics + cfg gate.
+//! - `-Ctarget-cpu=apple-m2` — implies bf16 + everything else.
+//! - `-Ctarget-cpu=neoverse-v2` — Graviton 4 baseline.
+//! - `-Ctarget-cpu=cortex-x4` — Snapdragon X Elite / Cortex-X4 cores.
+//!
+//! Inside Rust:
+//!
+//! ```ignore
+//! #[cfg(all(target_arch = "aarch64", target_feature = "bf16"))]
+//! pub use crate::simd_neon_bf16::{BF16x8, BF16x16, bfdot, bfmmla};
+//! ```
+//!
+//! # What you get
+//!
+//! ## BF16 dot-product / matrix-multiply
+//!
+//! - `vbfdotq_f32(acc, a, b)` — 2×(2×bf16·2×bf16) → 2×f32, accumulated
+//!   into 4×f32 register. The bf16 analogue of `vdotq_s32`.
+//! - `vbfmmlaq_f32(acc, a, b)` — 2×2 outer product BFMMLA. The crown
+//!   jewel for transformer GEMM — accumulates a full 2×2 f32 tile per
+//!   instruction. 8 bf16 mults + 4 f32 adds per cycle on M2.
+//! - `vbfmlalbq_f32` / `vbfmlaltq_f32` — bottom / top half multiply-
+//!   accumulate, lane-by-lane variant of BFDOT.
+//! - `vbfmlalbq_laned_f32` — broadcast one lane across all bf16
+//!   multiplications. Useful for matvec.
+//!
+//! ## BF16 conversion
+//!
+//! - `vcvt_bf16_f32` / `vcvtq_low_bf16_f32` / `vcvtq_high_bf16_f32` —
+//!   pack 4×f32 → 4×bf16. Hardware rounding (no manual RNE needed
+//!   like the AVX-512BF16 `_mm512_cvtne2ps_pbh` path in
+//!   `simd_avx512.rs`).
+//! - Scalar f32 ↔ bf16: trivial high-16-bit slice (the scalar paths in
+//!   `src/simd.rs:1604-1626` work everywhere, including this tier).
+//!
+//! # Composed wrapper shapes
+//!
+//! - `BF16x8` = `bfloat16x8_t` — native 128-bit register, 8 bf16 lanes.
+//!   Matches AVX-512BF16 `BF16x8 = __m128bh` in shape.
+//! - `BF16x16` = `[bfloat16x8_t; 2]` — two 128-bit registers, 16 bf16
+//!   lanes. Matches AVX-512BF16 `BF16x16 = __m256bh` in shape.
+//!
+//! # Cargo configs
+//!
+//! ```toml
+//! # .cargo/config-apple-m2.toml — Apple M2/M3/M4
+//! [build]
+//! target = "aarch64-apple-darwin"
+//! [target.aarch64-apple-darwin]
+//! rustflags = ["-Ctarget-cpu=apple-m2", "-Ctarget-feature=+bf16,+dotprod,+fp16"]
+//! ```
+//!
+//! ```toml
+//! # .cargo/config-graviton.toml — AWS Graviton 3/4
+//! [build]
+//! target = "aarch64-unknown-linux-gnu"
+//! [target.aarch64-unknown-linux-gnu]
+//! rustflags = ["-Ctarget-cpu=neoverse-v2", "-Ctarget-feature=+bf16"]
+//! ```
+//!
+//! ```toml
+//! # .cargo/config-snapdragon-x.toml — Snapdragon X Elite (Win/Linux)
+//! [build]
+//! target = "aarch64-pc-windows-msvc"   # or aarch64-unknown-linux-gnu
+//! rustflags = ["-Ctarget-cpu=cortex-x4", "-Ctarget-feature=+bf16,+i8mm"]
+//! ```
+//!
+//! # Stable-Rust constraint
+//!
+//! Same as the FP16 tier: `bfloat16x8_t` exists in `core::arch::aarch64`
+//! on stable, but the intrinsics (`vbfdotq_f32`, `vbfmmlaq_f32`, ...)
+//! are nightly-only (issue #117222). Two paths on stable 1.95:
+//!
+//!   1. **asm! byte encoding** — same pattern as `src/simd_amx.rs`
+//!      uses for AMX. Example:
+//!      ```ignore
+//!      // BFDOT v0.4s, v1.8h, v2.8h
+//!      asm!(".inst 0x4e41ec00", inout("v0") acc, in("v1") a, in("v2") b);
+//!      // BFMMLA v0.4s, v1.8h, v2.8h
+//!      asm!(".inst 0x6e42ec01", inout("v0") acc, in("v1") a, in("v2") b);
+//!      ```
+//!      Verify the encoding with `aarch64-linux-gnu-objdump --disassemble`
+//!      on a reference compile.
+//!   2. **Round-trip through f32** — convert bf16 → f32 (scalar bit-
+//!      shift), use the existing `vfmaq_f32` from baseline NEON, convert
+//!      back. Loses the 4× throughput; only as a correctness anchor for
+//!      the asm path.
+//!
+//! Path (1) is the only one worth shipping. The asm-byte fallback IS
+//! how `simd_amx.rs` ships AMX on stable Rust today — same pattern.
+
+#![cfg(all(target_arch = "aarch64", feature = "std"))]
+
+// ─── BF16 stubs ──────────────────────────────────────────────────────
+
+/// Placeholder for the BF16 8-lane native wrapper.
+///
+/// Real implementation: `pub struct BF16x8(pub bfloat16x8_t)`. API
+/// surface mirrors `simd_avx512::BF16x8`:
+/// - `splat(bits: u16) -> Self` (broadcast bf16 bit pattern across 8 lanes)
+/// - `from_slice(s: &[u16]) -> Self` (load 8 raw bf16 bits as u16s)
+/// - `to_array(self) -> [u16; 8]`
+/// - `dot_f32(self, other: Self, acc: F32x4) -> F32x4` — wraps BFDOT
+/// - `cvt_to_f32_lo(self) -> F32x4`, `cvt_to_f32_hi(self) -> F32x4`
+///
+/// Without `target_feature = "bf16"`, this falls back to round-trip
+/// through f32 (slow). With the feature on, it uses asm-byte BFDOT.
+pub struct BF16x8Stub;
+
+/// Placeholder for the BF16 16-lane composed wrapper.
+///
+/// Real implementation: `pub struct BF16x16(pub [bfloat16x8_t; 2])`.
+/// API mirror of `simd_avx512::BF16x16`. The 16-lane variant is the
+/// natural width for matmul tile rows in transformer attention.
+pub struct BF16x16Stub;
+
+impl BF16x8Stub {
+    pub fn unimplemented() -> ! {
+        unimplemented!(
+            "BF16x8 NEON bf16-tier implementation TODO. See \
+             src/simd_neon_bf16.rs module docs for the BFDOT / BFMMLA \
+             asm-byte encoding (stable Rust 1.95 can't reach the \
+             nightly-only vbfdotq_f32 intrinsic). Reference: \
+             src/simd_amx.rs's `.byte` pattern."
+        )
+    }
+}
+
+impl BF16x16Stub {
+    pub fn unimplemented() -> ! {
+        unimplemented!(
+            "BF16x16 NEON bf16-tier implementation TODO. Two-half \
+             composed wrapper [bfloat16x8_t; 2] — see module docs."
+        )
+    }
+}
+
+// ─── BFMMLA: the prize intrinsic ─────────────────────────────────────
+//
+// BFMMLA is the most important instruction this tier unlocks. It
+// computes a 2×2 outer-product matrix multiply of bf16 inputs,
+// accumulating into a 2×2 f32 tile. One instruction = 8 bf16 mults +
+// 4 f32 adds. On Apple M2 the throughput is ~32 GFLOP/s per core in
+// bf16-matmul-bound kernels.
+//
+// Encoding for `BFMMLA Vd.4s, Vn.8h, Vm.8h`: 0x6e40_ec00 | (Vm << 16)
+// | (Vn << 5) | Vd. Use a `bfmmla!` macro to emit the asm-byte for any
+// (acc, a, b) v-register triple.
+//
+// TODO(Phase-3): implement `bfmmla(acc: F32x4, a: BF16x8, b: BF16x8)
+// -> F32x4` as the primary export. The rest of the BF16 API builds on
+// it (BFDOT is BFMMLA's diagonal, BFMLALB/T are its half-slices).
+
+// ─── BFDOT: same shape as DotProd, but bf16 ──────────────────────────
+//
+// Where `vdotq_s32(acc, a, b)` does 4×(4×i8·4×i8) → 4×i32, BFDOT does
+// 2×(2×bf16·2×bf16) → 2×f32 accumulated into 4×f32. The bf16 analogue
+// is HALF the lane count per output (2 vs 4) because bf16 is twice as
+// wide as i8.
+//
+// TODO(Phase-3): implement `bfdot(acc: F32x4, a: BF16x8, b: BF16x8)
+// -> F32x4`. Asm-byte for `BFDOT Vd.4s, Vn.8h, Vm.8h`:
+//   0x4e40_ec00 | (Vm << 16) | (Vn << 5) | Vd
diff --git a/src/simd_neon_dotprod.rs b/src/simd_neon_dotprod.rs