Skip to content

Commit e9a4d73

Browse files
authored
Merge pull request #173 from AdaWorldAPI/claude/pr-x-phase2-nightly-dispatch
feat(simd): Phase 2 — wire simd_nightly into crate::simd::* dispatch + matrix fix
2 parents a18366a + a7ca029 commit e9a4d73

3 files changed

Lines changed: 97 additions & 36 deletions

File tree

.claude/knowledge/simd-dispatch-architecture.md

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -128,26 +128,36 @@ chooses the source; the cargo config chooses how `simd.rs` chooses.
128128

129129
## 4. Parity matrix — typed lane primitives per backend
130130

131-
Legend: ✅ native, 🟡 composed wrapper (two-half / four-quarter), 🔵
132-
scalar polyfill via `core::simd`, ❌ missing, ⛔ N/A for this arch.
131+
Legend: ✅ native, 🟡 composed wrapper (two-half / four-quarter), 🟠
132+
scalar polyfill (struct exists with full API but storage is `[$elem;
133+
$lanes]` — no SIMD execution), 🔵 portable-SIMD polyfill via
134+
`core::simd`, ❌ missing, ⛔ N/A for this arch.
135+
136+
(Reality check 2026-05-20: many AVX2 int rows previously marked ❌ are
137+
actually 🟠 — `simd_avx2.rs` ships them via the `avx2_int_type!` macro
138+
as scalar-storage structs that match the AVX-512 API surface. The
139+
arithmetic is plain Rust under the hood; only the FLOAT wrappers in
140+
this column are true two-half SIMD composites. Filling in real AVX2
141+
vectorization for the int wrappers is its own piece of tech debt
142+
tracked as TD-SIMD-3.)
133143

134144
| Lane type | `simd_avx512` (v4) | `simd_avx2` (v3) | `simd_neon` (aarch64) | `simd_nightly` | `scalar` |
135145
|---|---|---|---|---|---|
136146
| `F32x16` |`__m512` | 🟡 `(f32x8, f32x8)` | 🟡 `[float32x4_t; 4]` | 🔵 `core::simd::f32x16` |`[f32; 16]` |
137147
| `F32x8` |`__m256` ||| 🔵 ||
138148
| `F64x8` |`__m512d` | 🟡 `(f64x4, f64x4)` | 🟡 `[float64x2_t; 4]` | 🔵 ||
139149
| `F64x4` |`__m256d` ||| 🔵 ||
140-
| `U8x64` |`__m512i` | || 🔵 ||
150+
| `U8x64` |`__m512i` | 🟠 `[u8; 64]` polyfill || 🔵 ||
141151
| `U8x32` |`__m256i` |`__m256i` || 🔵 ||
142-
| `U16x32` |`__m512i` | || 🔵 ||
143-
| `U32x16` |`__m512i` | || 🔵 ||
144-
| `U64x8` |`__m512i` | || 🔵 ||
145-
| `I8x32` |`__m256i` | || 🔵 ||
146-
| `I8x64` |`__m512i` | || 🔵 ||
147-
| `I16x16` |`__m256i` | || 🔵 ||
148-
| `I16x32` |`__m512i` | || 🔵 ||
149-
| `I32x16` |`__m512i` | || 🔵 ||
150-
| `I64x8` |`__m512i` | || 🔵 ||
152+
| `U16x32` |`__m512i` | 🟠 `[u16; 32]` polyfill || 🔵 ||
153+
| `U32x16` |`__m512i` | 🟠 `[u32; 16]` polyfill || 🔵 ||
154+
| `U64x8` |`__m512i` | 🟠 `[u64; 8]` polyfill || 🔵 ||
155+
| `I8x32` |`__m256i` | `__m256i` (in `simd_avx512`) || 🔵 ||
156+
| `I8x64` |`__m512i` | 🟠 `[i8; 64]` polyfill || 🔵 ||
157+
| `I16x16` |`__m256i` | `__m256i` (in `simd_avx512`) || 🔵 ||
158+
| `I16x32` |`__m512i` | 🟠 `[i16; 32]` polyfill || 🔵 ||
159+
| `I32x16` |`__m512i` | 🟠 `[i32; 16]` polyfill || 🔵 ||
160+
| `I64x8` |`__m512i` | 🟠 `[i64; 8]` polyfill || 🔵 ||
151161
| `BF16x8` |`__m128bh` ||| 🔵 ||
152162
| `BF16x16` |`__m256bh` ||| 🔵 ||
153163
| `F16x16` || 🟡 `F16Scaler` (scalar) || 🔵 ||

src/simd.rs

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -210,23 +210,22 @@ pub const PREFERRED_I16_LANES: usize = 16;
210210
// * aarch64 → simd_neon backend.
211211
// * everything else (wasm32, riscv, etc.) → scalar fallback.
212212

213-
// Note on the `nightly-simd` feature: it adds the `crate::simd_nightly`
214-
// module (a portable-simd backend wrapping `core::simd`) but does NOT
215-
// replace the intrinsics dispatch below. The polyfill ships full
216-
// type-parity with production (PR #146): 24 types covering F32x8/16,
217-
// F64x4/8, BF16x8/16, F16x16, I8x32/64, I16x16/32, I32x16, I64x8,
218-
// U8x32/64, U16x32, U32x8/16, U64x4/8, plus the F32/F64 mask types —
219-
// matches the 24 types defined in `simd_avx2.rs` + `simd_avx512.rs`.
220-
// Consumers who want miri-runnable SIMD code import from `simd_nightly`
221-
// explicitly today (e.g. `use ndarray::simd_nightly::F32x16`).
222-
//
223-
// The remaining work for Miri-clean coverage of `hpc::*` is wiring this
224-
// file's `pub use crate::simd_{avx512,avx2,neon}::*` re-exports to
225-
// route through `simd_nightly` under `cfg(miri)`. Once that lands,
226-
// every `use crate::simd::F32x16` call site becomes miri-checkable
227-
// without source changes. The polyfill itself is no longer the bottleneck.
213+
// Nightly-simd dispatch — when `feature = "nightly-simd"` is on, the
214+
// `crate::simd_nightly` portable backend (wrapping `core::simd::*`)
215+
// REPLACES the lane-type re-export arms below (the x86_64 `f32_to_bf16_*_rne` helpers are still re-exported from `simd_avx512`). This is a compile-time-dispatch
216+
// choice: opt in via `cargo +nightly <subcommand> --features nightly-simd` and
217+
// the same `use crate::simd::F32x16` call sites become miri-runnable.
218+
// No target_arch constraint — `core::simd` is portable, so this arm
219+
// is the one true backend on wasm32 / riscv / aarch64 / x86_64 alike
220+
// as soon as `nightly-simd` is on.
221+
#[cfg(feature = "nightly-simd")]
222+
pub use crate::simd_nightly::{
223+
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u16x32, u32x16, u32x8, u64x4, u64x8,
224+
u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8,
225+
I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64,
226+
};
228227

229-
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
228+
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))]
230229
pub use crate::simd_avx512::{
231230
f32x16,
232231
f32x8,
@@ -276,7 +275,7 @@ pub use crate::simd_avx512::{bf16_to_f32_batch, bf16_to_f32_scalar, f32_to_bf16_
276275
#[cfg(target_arch = "x86_64")]
277276
pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne};
278277
// BF16 SIMD types only available when avx512bf16 is enabled at compile time
279-
#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
278+
#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16", not(feature = "nightly-simd")))]
280279
pub use crate::simd_avx512::{BF16x16, BF16x8};
281280

282281
// AVX2 baseline arm — selected by the `x86-64-v3` cargo default. The
@@ -290,10 +289,18 @@ pub use crate::simd_avx512::{BF16x16, BF16x8};
290289
// `RUSTFLAGS="-D warnings"` env, which overrides our v3 config.toml,
291290
// landing on x86-64 baseline → the previous tighter `avx2` predicate
292291
// left no matching arm).
293-
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
292+
#[cfg(all(
293+
target_arch = "x86_64",
294+
not(target_feature = "avx512f"),
295+
not(feature = "nightly-simd")
296+
))]
294297
pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32};
295298

296-
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
299+
#[cfg(all(
300+
target_arch = "x86_64",
301+
not(target_feature = "avx512f"),
302+
not(feature = "nightly-simd")
303+
))]
297304
pub use crate::simd_avx2::{
298305
f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32,
299306
I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
@@ -304,14 +311,14 @@ pub use crate::simd_avx2::{
304311
// AVX2 ops, and on AVX-512 builds it's the half-register companion to
305312
// U8x64. Lives in simd_avx2.rs (single source of truth) and is re-exported
306313
// from both tier branches.
307-
#[cfg(target_arch = "x86_64")]
314+
#[cfg(all(target_arch = "x86_64", not(feature = "nightly-simd")))]
308315
pub use crate::simd_avx2::{u8x32, U8x32};
309316

310317
// ============================================================================
311318
// Non-x86: scalar fallback types with identical API
312319
// ============================================================================
313320

314-
#[cfg(not(target_arch = "x86_64"))]
321+
#[cfg(all(not(target_arch = "x86_64"), not(feature = "nightly-simd")))]
315322
pub(crate) mod scalar {
316323
use core::fmt;
317324
use core::ops::{
@@ -1587,15 +1594,19 @@ pub(crate) mod scalar {
15871594
// in simd_neon::aarch64_simd (verified 2026-04-30, agent A7 — burn parity item 9).
15881595
// Integer + 256-bit float types still come from the scalar fallback; they're
15891596
// not on the critical path for f32 BLAS-1 / VML kernels.
1590-
#[cfg(target_arch = "aarch64")]
1597+
#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
15911598
pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8};
1592-
#[cfg(target_arch = "aarch64")]
1599+
#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
15931600
pub use scalar::{
15941601
f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64,
15951602
};
15961603

15971604
// Other non-x86 targets (wasm, riscv, etc.): full scalar fallback.
1598-
#[cfg(all(not(target_arch = "x86_64"), not(target_arch = "aarch64")))]
1605+
#[cfg(all(
1606+
not(target_arch = "x86_64"),
1607+
not(target_arch = "aarch64"),
1608+
not(feature = "nightly-simd")
1609+
))]
15991610
pub use scalar::{
16001611
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16,
16011612
F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64,

src/simd_nightly/mod.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,43 @@ pub use i_word_types::{I16x16, I16x32, I32x16, I64x8};
4343
pub use masks::{F32Mask16, F32Mask8, F64Mask4, F64Mask8};
4444
pub use u8_types::{U8x32, U8x64};
4545
pub use u_word_types::{U16x32, U32x16, U32x8, U64x4, U64x8};
46+
47+
// Lowercase aliases — match the std::simd convention used by
48+
// `simd_avx2.rs`, `simd_avx512.rs`, and the scalar fallback in
49+
// `simd.rs` (`mod scalar`). Consumer docs and downstream code import names like
50+
// `crate::simd::f32x16`; without these aliases, enabling `nightly-simd`
51+
// would silently break those imports (codex P1 on PR #173).
52+
#[allow(non_camel_case_types)]
53+
pub type f32x16 = F32x16;
54+
#[allow(non_camel_case_types)]
55+
pub type f32x8 = F32x8;
56+
#[allow(non_camel_case_types)]
57+
pub type f64x8 = F64x8;
58+
#[allow(non_camel_case_types)]
59+
pub type f64x4 = F64x4;
60+
#[allow(non_camel_case_types)]
61+
pub type u8x64 = U8x64;
62+
#[allow(non_camel_case_types)]
63+
pub type u8x32 = U8x32;
64+
#[allow(non_camel_case_types)]
65+
pub type u16x32 = U16x32;
66+
#[allow(non_camel_case_types)]
67+
pub type u32x16 = U32x16;
68+
#[allow(non_camel_case_types)]
69+
pub type u32x8 = U32x8;
70+
#[allow(non_camel_case_types)]
71+
pub type u64x8 = U64x8;
72+
#[allow(non_camel_case_types)]
73+
pub type u64x4 = U64x4;
74+
#[allow(non_camel_case_types)]
75+
pub type i8x64 = I8x64;
76+
#[allow(non_camel_case_types)]
77+
pub type i8x32 = I8x32;
78+
#[allow(non_camel_case_types)]
79+
pub type i16x32 = I16x32;
80+
#[allow(non_camel_case_types)]
81+
pub type i16x16 = I16x16;
82+
#[allow(non_camel_case_types)]
83+
pub type i32x16 = I32x16;
84+
#[allow(non_camel_case_types)]
85+
pub type i64x8 = I64x8;

0 commit comments

Comments
 (0)