Merge pull request #173 from AdaWorldAPI/claude/pr-x-phase2-nightly-dispatch

AdaWorldAPI · web-flow · commit e9a4d7348cd7 · 2026-05-20T15:15:03.000+02:00
feat(simd): Phase 2 — wire simd_nightly into crate::simd::* dispatch + matrix fix
diff --git a/.claude/knowledge/simd-dispatch-architecture.md b/.claude/knowledge/simd-dispatch-architecture.md
@@ -128,26 +128,36 @@ chooses the source; the cargo config chooses how `simd.rs` chooses.
 
 ## 4. Parity matrix — typed lane primitives per backend
 
-Legend: ✅ native, 🟡 composed wrapper (two-half / four-quarter), 🔵
-scalar polyfill via `core::simd`, ❌ missing, ⛔ N/A for this arch.
+Legend: ✅ native, 🟡 composed wrapper (two-half / four-quarter), 🟠
+scalar polyfill (struct exists with full API but storage is `[$elem;
+$lanes]` — no SIMD execution), 🔵 portable-SIMD polyfill via
+`core::simd`, ❌ missing, ⛔ N/A for this arch.
+
+(Reality check 2026-05-20: many AVX2 int rows previously marked ❌ are
+actually 🟠 — `simd_avx2.rs` ships them via the `avx2_int_type!` macro
+as scalar-storage structs that match the AVX-512 API surface. The
+arithmetic is plain Rust under the hood; only the FLOAT wrappers in
+this column are true two-half SIMD composites. Filling in real AVX2
+vectorization for the int wrappers is its own piece of tech debt
+tracked as TD-SIMD-3.)
 
 | Lane type | `simd_avx512` (v4) | `simd_avx2` (v3) | `simd_neon` (aarch64) | `simd_nightly` | `scalar` |
 |---|---|---|---|---|---|
 | `F32x16` | ✅ `__m512` | 🟡 `(f32x8, f32x8)` | 🟡 `[float32x4_t; 4]` | 🔵 `core::simd::f32x16` | ✅ `[f32; 16]` |
 | `F32x8` | ✅ `__m256` | ❌ | ⛔ | 🔵 | ✅ |
 | `F64x8` | ✅ `__m512d` | 🟡 `(f64x4, f64x4)` | 🟡 `[float64x2_t; 4]` | 🔵 | ✅ |
 | `F64x4` | ✅ `__m256d` | ❌ | ⛔ | 🔵 | ✅ |
-| `U8x64` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
+| `U8x64` | ✅ `__m512i` | 🟠 `[u8; 64]` polyfill | ❌ | 🔵 | ✅ |
 | `U8x32` | ✅ `__m256i` | ✅ `__m256i` | ❌ | 🔵 | ✅ |
-| `U16x32` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `U32x16` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `U64x8` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I8x32` | ✅ `__m256i` | ❌ | ❌ | 🔵 | ✅ |
-| `I8x64` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I16x16` | ✅ `__m256i` | ❌ | ❌ | 🔵 | ✅ |
-| `I16x32` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I32x16` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I64x8` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
+| `U16x32` | ✅ `__m512i` | 🟠 `[u16; 32]` polyfill | ❌ | 🔵 | ✅ |
+| `U32x16` | ✅ `__m512i` | 🟠 `[u32; 16]` polyfill | ❌ | 🔵 | ✅ |
+| `U64x8` | ✅ `__m512i` | 🟠 `[u64; 8]` polyfill | ❌ | 🔵 | ✅ |
+| `I8x32` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ |
+| `I8x64` | ✅ `__m512i` | 🟠 `[i8; 64]` polyfill | ❌ | 🔵 | ✅ |
+| `I16x16` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ |
+| `I16x32` | ✅ `__m512i` | 🟠 `[i16; 32]` polyfill | ❌ | 🔵 | ✅ |
+| `I32x16` | ✅ `__m512i` | 🟠 `[i32; 16]` polyfill | ❌ | 🔵 | ✅ |
+| `I64x8` | ✅ `__m512i` | 🟠 `[i64; 8]` polyfill | ❌ | 🔵 | ✅ |
 | `BF16x8` | ✅ `__m128bh` | ❌ | ❌ | 🔵 | ✅ |
 | `BF16x16` | ✅ `__m256bh` | ❌ | ❌ | 🔵 | ✅ |
 | `F16x16` | ❌ | 🟡 `F16Scaler` (scalar) | ❌ | 🔵 | ✅ |
diff --git a/src/simd.rs b/src/simd.rs
@@ -210,23 +210,22 @@ pub const PREFERRED_I16_LANES: usize = 16;
 //   * aarch64 → simd_neon backend.
 //   * everything else (wasm32, riscv, etc.) → scalar fallback.
 
-// Note on the `nightly-simd` feature: it adds the `crate::simd_nightly`
-// module (a portable-simd backend wrapping `core::simd`) but does NOT
-// replace the intrinsics dispatch below. The polyfill ships full
-// type-parity with production (PR #146): 24 types covering F32x8/16,
-// F64x4/8, BF16x8/16, F16x16, I8x32/64, I16x16/32, I32x16, I64x8,
-// U8x32/64, U16x32, U32x8/16, U64x4/8, plus the F32/F64 mask types —
-// matches the 24 types defined in `simd_avx2.rs` + `simd_avx512.rs`.
-// Consumers who want miri-runnable SIMD code import from `simd_nightly`
-// explicitly today (e.g. `use ndarray::simd_nightly::F32x16`).
-//
-// The remaining work for Miri-clean coverage of `hpc::*` is wiring this
-// file's `pub use crate::simd_{avx512,avx2,neon}::*` re-exports to
-// route through `simd_nightly` under `cfg(miri)`. Once that lands,
-// every `use crate::simd::F32x16` call site becomes miri-checkable
-// without source changes. The polyfill itself is no longer the bottleneck.
+// Nightly-simd dispatch — when `feature = "nightly-simd"` is on, the
+// `crate::simd_nightly` portable backend (wrapping `core::simd::*`)
+// REPLACES the intrinsics arms below. This is a compile-time-dispatch
+// choice: opt in via `cargo +nightly --features nightly-simd ...` and
+// the same `use crate::simd::F32x16` call sites become miri-runnable.
+// No target_arch constraint — `core::simd` is portable, so this arm
+// is the one true backend on wasm32 / riscv / aarch64 / x86_64 alike
+// as soon as `nightly-simd` is on.
+#[cfg(feature = "nightly-simd")]
+pub use crate::simd_nightly::{
+    f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u16x32, u32x16, u32x8, u64x4, u64x8,
+    u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8,
+    I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64,
+};
 
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))]
 pub use crate::simd_avx512::{
     f32x16,
     f32x8,
@@ -276,7 +275,7 @@ pub use crate::simd_avx512::{bf16_to_f32_batch, bf16_to_f32_scalar, f32_to_bf16_
 #[cfg(target_arch = "x86_64")]
 pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne};
 // BF16 SIMD types only available when avx512bf16 is enabled at compile time
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16", not(feature = "nightly-simd")))]
 pub use crate::simd_avx512::{BF16x16, BF16x8};
 
 // AVX2 baseline arm — selected by the `x86-64-v3` cargo default. The
@@ -290,10 +289,18 @@ pub use crate::simd_avx512::{BF16x16, BF16x8};
 // `RUSTFLAGS="-D warnings"` env, which overrides our v3 config.toml,
 // landing on x86-64 baseline → the previous tighter `avx2` predicate
 // left no matching arm).
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+#[cfg(all(
+    target_arch = "x86_64",
+    not(target_feature = "avx512f"),
+    not(feature = "nightly-simd")
+))]
 pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32};
 
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+#[cfg(all(
+    target_arch = "x86_64",
+    not(target_feature = "avx512f"),
+    not(feature = "nightly-simd")
+))]
 pub use crate::simd_avx2::{
     f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32,
     I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
@@ -304,14 +311,14 @@ pub use crate::simd_avx2::{
 // AVX2 ops, and on AVX-512 builds it's the half-register companion to
 // U8x64. Lives in simd_avx2.rs (single source of truth) and is re-exported
 // from both tier branches.
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(target_arch = "x86_64", not(feature = "nightly-simd")))]
 pub use crate::simd_avx2::{u8x32, U8x32};
 
 // ============================================================================
 // Non-x86: scalar fallback types with identical API
 // ============================================================================
 
-#[cfg(not(target_arch = "x86_64"))]
+#[cfg(all(not(target_arch = "x86_64"), not(feature = "nightly-simd")))]
 pub(crate) mod scalar {
     use core::fmt;
     use core::ops::{
@@ -1587,15 +1594,19 @@ pub(crate) mod scalar {
 // in simd_neon::aarch64_simd (verified 2026-04-30, agent A7 — burn parity item 9).
 // Integer + 256-bit float types still come from the scalar fallback; they're
 // not on the critical path for f32 BLAS-1 / VML kernels.
-#[cfg(target_arch = "aarch64")]
+#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
 pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8};
-#[cfg(target_arch = "aarch64")]
+#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
 pub use scalar::{
     f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64,
 };
 
 // Other non-x86 targets (wasm, riscv, etc.): full scalar fallback.
-#[cfg(all(not(target_arch = "x86_64"), not(target_arch = "aarch64")))]
+#[cfg(all(
+    not(target_arch = "x86_64"),
+    not(target_arch = "aarch64"),
+    not(feature = "nightly-simd")
+))]
 pub use scalar::{
     f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16,
     F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64,
diff --git a/src/simd_nightly/mod.rs b/src/simd_nightly/mod.rs
@@ -43,3 +43,43 @@ pub use i_word_types::{I16x16, I16x32, I32x16, I64x8};
 pub use masks::{F32Mask16, F32Mask8, F64Mask4, F64Mask8};
 pub use u8_types::{U8x32, U8x64};
 pub use u_word_types::{U16x32, U32x16, U32x8, U64x4, U64x8};
+
+// Lowercase aliases — match the std::simd convention used by
+// `simd_avx2.rs`, `simd_avx512.rs`, and the scalar fallback in
+// `simd_scalar.rs`. Consumer docs and downstream code import names like
+// `crate::simd::f32x16`; without these aliases, enabling `nightly-simd`
+// would silently break those imports (codex P1 on PR #173).
+#[allow(non_camel_case_types)]
+pub type f32x16 = F32x16;
+#[allow(non_camel_case_types)]
+pub type f32x8 = F32x8;
+#[allow(non_camel_case_types)]
+pub type f64x8 = F64x8;
+#[allow(non_camel_case_types)]
+pub type f64x4 = F64x4;
+#[allow(non_camel_case_types)]
+pub type u8x64 = U8x64;
+#[allow(non_camel_case_types)]
+pub type u8x32 = U8x32;
+#[allow(non_camel_case_types)]
+pub type u16x32 = U16x32;
+#[allow(non_camel_case_types)]
+pub type u32x16 = U32x16;
+#[allow(non_camel_case_types)]
+pub type u32x8 = U32x8;
+#[allow(non_camel_case_types)]
+pub type u64x8 = U64x8;
+#[allow(non_camel_case_types)]
+pub type u64x4 = U64x4;
+#[allow(non_camel_case_types)]
+pub type i8x64 = I8x64;
+#[allow(non_camel_case_types)]
+pub type i8x32 = I8x32;
+#[allow(non_camel_case_types)]
+pub type i16x32 = I16x32;
+#[allow(non_camel_case_types)]
+pub type i16x16 = I16x16;
+#[allow(non_camel_case_types)]
+pub type i32x16 = I32x16;
+#[allow(non_camel_case_types)]
+pub type i64x8 = I64x8;