Skip to content

Commit 2ef97c0

Browse files
committed
feat(simd): missing-lanes sweep — U16x16/U32x8/U64x4/I32x8/I64x4 across all backends
PR #178's matrix audit surfaced five 256-bit int lane types that were either entirely missing or stranded in `simd_nightly` only. Adds them across every backend so `crate::simd::{U16x16, U32x8, U64x4, I32x8, I64x4}` resolves uniformly on v3 / v4 / native / nightly / scalar / aarch64 paths. `src/simd_avx2.rs` + 5× `avx2_int_type!` instantiations producing scalar-storage `[$elem; $lanes]` polyfills (align 64). Same macro pattern as the existing 512-bit polyfills (U8x64, U16x32, …). Native AVX2 `__m256i` upgrades are TD-SIMD-3. + 5× lowercase aliases (`u16x16 = U16x16`, etc.) matching the std::simd convention used by every other lane type in the file. `src/simd_scalar.rs` + 5× `impl_int_type!` instantiations mirroring the AVX2 polyfills above. Consumers on non-x86/non-aarch64 (wasm32, riscv, thumb) reach the same type names through `crate::simd::*`. + Lowercase aliases. `src/simd_avx512.rs` + Re-export of the new types from `simd_avx2` so the v4 dispatch arm in `simd.rs` can surface them without forking the macro into this file. Both files are already gated on `target_arch = "x86_64"`, so the re-export is cheap. Native `__m256i` upgrades here are TD-SIMD-3 (same story as the v3 polyfills). `src/simd_nightly/u_word_types.rs` + `U16x16` wrapper backed by `core::simd::u16x16`. Same API surface as the existing 32-/16-/8-lane wrappers — splat, from_slice, from_array, to_array, copy_to_slice, reduce_{sum,min,max}, simd_min/max, cmpeq_mask, cmpgt_mask, Default. `src/simd_nightly/i_word_types.rs` + `I32x8` and `I64x4` wrappers backed by `core::simd::{i32x8, i64x4}`. Same API surface as siblings; PartialEq via array compare. `src/simd_nightly/mod.rs` + Re-exports for the three new types + lowercase aliases. `src/simd.rs` + All 5 dispatch arms (nightly, v4, v3, aarch64, scalar fallback) updated to surface the new types through `crate::simd::*`. `.claude/knowledge/simd-dispatch-architecture.md` + Parity matrix updated — the five rows previously marked ❌ across most backends now show 🟠 polyfill (v3, v4-via-v3, scalar) / 🔵 (nightly via `core::simd`). Verified: `cargo check` clean under default v3 features and under `-Ctarget-cpu=x86-64-v4` (via `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS` + explicit `--target` so build scripts don't SIGILL on non-AVX-512 runners — same pattern as the tier4-avx512-check job).
1 parent 2f096d3 commit 2ef97c0

8 files changed

Lines changed: 354 additions & 17 deletions

File tree

.claude/knowledge/simd-dispatch-architecture.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,19 +150,19 @@ tracked as TD-SIMD-3.)
150150
| `U8x64` |`__m512i` | 🟠 `[u8; 64]` polyfill || 🔵 ||
151151
| `U8x32` |`__m256i` |`__m256i` || 🔵 ||
152152
| `U16x32` |`__m512i` | 🟠 `[u16; 32]` polyfill || 🔵 ||
153-
| `U16x16` | ||| | |
153+
| `U16x16` | 🟠 (via `simd_avx2`) | 🟠 `[u16; 16]` polyfill || 🔵 `core::simd::u16x16` | 🟠 |
154154
| `U32x16` |`__m512i` | 🟠 `[u32; 16]` polyfill || 🔵 ||
155-
| `U32x8` | ||| 🔵 `core::simd::u32x8` | |
155+
| `U32x8` | 🟠 (via `simd_avx2`) | 🟠 `[u32; 8]` polyfill || 🔵 `core::simd::u32x8` | 🟠 |
156156
| `U64x8` |`__m512i` | 🟠 `[u64; 8]` polyfill || 🔵 ||
157-
| `U64x4` | ||| 🔵 `core::simd::u64x4` | |
157+
| `U64x4` | 🟠 (via `simd_avx2`) | 🟠 `[u64; 4]` polyfill || 🔵 `core::simd::u64x4` | 🟠 |
158158
| `I8x32` |`__m256i` |`__m256i` (in `simd_avx512`) || 🔵 ||
159159
| `I8x64` |`__m512i` | 🟠 `[i8; 64]` polyfill || 🔵 ||
160160
| `I16x16` |`__m256i` |`__m256i` (in `simd_avx512`) || 🔵 ||
161161
| `I16x32` |`__m512i` | 🟠 `[i16; 32]` polyfill || 🔵 ||
162162
| `I32x16` |`__m512i` | 🟠 `[i32; 16]` polyfill || 🔵 ||
163-
| `I32x8` | ||| | |
163+
| `I32x8` | 🟠 (via `simd_avx2`) | 🟠 `[i32; 8]` polyfill || 🔵 `core::simd::i32x8` | 🟠 |
164164
| `I64x8` |`__m512i` | 🟠 `[i64; 8]` polyfill || 🔵 ||
165-
| `I64x4` | ||| | |
165+
| `I64x4` | 🟠 (via `simd_avx2`) | 🟠 `[i64; 4]` polyfill || 🔵 `core::simd::i64x4` | 🟠 |
166166
| `BF16x8` |`__m128bh` ||| 🔵 ||
167167
| `BF16x16` |`__m256bh` ||| 🔵 ||
168168
| `F16x16` || 🟠 `[u16; 16]` polyfill (`simd_half`) | 🟠 `[u16; 16]` polyfill (`simd_half`) | 🔵 ||

src/simd.rs

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -220,9 +220,10 @@ pub const PREFERRED_I16_LANES: usize = 16;
220220
// as soon as `nightly-simd` is on.
221221
#[cfg(feature = "nightly-simd")]
222222
pub use crate::simd_nightly::{
223-
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u16x32, u32x16, u32x8, u64x4, u64x8,
224-
u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8,
225-
I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64,
223+
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u16x32, u32x16,
224+
u32x8, u64x4, u64x8, u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8,
225+
F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8, I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4,
226+
U64x8, U8x32, U8x64,
226227
};
227228

228229
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))]
@@ -234,10 +235,15 @@ pub use crate::simd_avx512::{
234235
i16x16,
235236
i16x32,
236237
i32x16,
238+
i32x8,
239+
i64x4,
237240
i64x8,
238241
i8x32,
239242
i8x64,
243+
u16x16,
240244
u32x16,
245+
u32x8,
246+
u64x4,
241247
u64x8,
242248
u8x64,
243249
F32Mask16,
@@ -251,11 +257,18 @@ pub use crate::simd_avx512::{
251257
I16x16,
252258
I16x32,
253259
I32x16,
260+
// 256-bit int polyfills surfaced 2026-05-20 (re-exported from
261+
// `simd_avx2` via `simd_avx512`'s re-export at line ~2260).
262+
I32x8,
263+
I64x4,
254264
I64x8,
255265
I8x32,
256266
I8x64,
267+
U16x16,
257268
U16x32,
258269
U32x16,
270+
U32x8,
271+
U64x4,
259272
U64x8,
260273
U8x64,
261274
};
@@ -302,8 +315,9 @@ pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16,
302315
not(feature = "nightly-simd")
303316
))]
304317
pub use crate::simd_avx2::{
305-
f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32,
306-
I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
318+
f32x16, f64x8, i16x32, i32x16, i32x8, i64x4, i64x8, i8x64, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32Mask16,
319+
F32x16, F64Mask8, F64x8, I16x32, I32x16, I32x8, I64x4, I64x8, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8,
320+
U8x64,
307321
};
308322

309323
// U8x32 — native AVX2 byte width (one __m256i = 32 bytes). Available on
@@ -335,7 +349,8 @@ pub(crate) mod scalar;
335349
pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8};
336350
#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
337351
pub use scalar::{
338-
f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64,
352+
f32x8, f64x4, i32x16, i32x8, i64x4, i64x8, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32x8, F64x4, I32x16, I32x8,
353+
I64x4, I64x8, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64,
339354
};
340355

341356
// Other non-x86 targets (wasm, riscv, etc.): full scalar fallback.
@@ -345,8 +360,9 @@ pub use scalar::{
345360
not(feature = "nightly-simd")
346361
))]
347362
pub use scalar::{
348-
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16,
349-
F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64,
363+
f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u32x16, u32x8,
364+
u64x4, u64x8, u8x64, F32Mask16, F32x16, F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8,
365+
I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64,
350366
};
351367

352368
// Scalar BF16 conversion — always available on all platforms

src/simd_avx2.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1542,6 +1542,19 @@ avx2_int_type!(U16x32, u16, 32, 0u16);
15421542
avx2_int_type!(U32x16, u32, 16, 0u32);
15431543
avx2_int_type!(U64x8, u64, 8, 0u64);
15441544

1545+
// 256-bit int lanes — scalar polyfills filling the gap surfaced by the
1546+
// 2026-05-20 matrix audit. None of these had wrappers anywhere except
1547+
// for `U32x8` / `U64x4` in `simd_nightly`. Adding `U16x16`, `U32x8`,
1548+
// `U64x4`, `I32x8`, `I64x4` here mirrors the existing 512-bit polyfill
1549+
// pattern (`[$elem; $lanes]` storage, align 64). Native AVX2 `__m256i`
1550+
// upgrades for these are TD-SIMD-3 (the same fold-into-real-SIMD task
1551+
// already tracked for the 512-bit polyfills above).
1552+
avx2_int_type!(U16x16, u16, 16, 0u16);
1553+
avx2_int_type!(U32x8, u32, 8, 0u32);
1554+
avx2_int_type!(U64x4, u64, 4, 0u64);
1555+
avx2_int_type!(I32x8, i32, 8, 0i32);
1556+
avx2_int_type!(I64x4, i64, 4, 0i64);
1557+
15451558
// Extra methods for U16x32 (widen/narrow, shift, multiply) — AVX2 scalar fallback.
15461559
impl U16x32 {
15471560
#[inline(always)]
@@ -2266,6 +2279,19 @@ pub type i8x64 = I8x64;
22662279
#[allow(non_camel_case_types)]
22672280
pub type i16x32 = I16x32;
22682281

2282+
// Lowercase aliases for the 256-bit polyfills added in the 2026-05-20
2283+
// missing-lanes sweep.
2284+
#[allow(non_camel_case_types)]
2285+
pub type u16x16 = U16x16;
2286+
#[allow(non_camel_case_types)]
2287+
pub type u32x8 = U32x8;
2288+
#[allow(non_camel_case_types)]
2289+
pub type u64x4 = U64x4;
2290+
#[allow(non_camel_case_types)]
2291+
pub type i32x8 = I32x8;
2292+
#[allow(non_camel_case_types)]
2293+
pub type i64x4 = I64x4;
2294+
22692295
#[cfg(test)]
22702296
mod tests {
22712297
use super::*;

src/simd_avx512.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2256,6 +2256,14 @@ pub type i16x32 = I16x32;
22562256
#[allow(non_camel_case_types)]
22572257
pub type i16x16 = I16x16;
22582258

2259+
// 256-bit int lanes — added 2026-05-20 missing-lanes sweep. These types
2260+
// don't have native `__m256i` wrappers in this module yet; re-exported
2261+
// from `simd_avx2.rs` (where they live as scalar-storage polyfills via
2262+
// the `avx2_int_type!` macro) so the v4 dispatch arm in `simd.rs` can
2263+
// surface them through `crate::simd::*` with the same names the v3 arm
2264+
// uses. Native AVX2 `__m256i` upgrades for these are TD-SIMD-3.
2265+
pub use crate::simd_avx2::{i32x8, i64x4, u16x16, u32x8, u64x4, I32x8, I64x4, U16x16, U32x8, U64x4};
2266+
22592267
// ============================================================================
22602268
// BF16 conversion wrappers — AVX-512 BF16 hardware instructions
22612269
// ============================================================================

src/simd_nightly/i_word_types.rs

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
use core::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd};
55
use core::simd::num::SimdInt;
6-
use core::simd::{i16x16, i16x32, i32x16, i64x8};
6+
use core::simd::{i16x16, i16x32, i32x16, i32x8, i64x4, i64x8};
77

88
// ════════════════════════════════════════════════════════════════════
99
// I16x16 — 16-lane signed 16-bit integer
@@ -428,3 +428,177 @@ impl core::fmt::Display for I64x8 {
428428
write!(f, "I64x8({:?})", &self.to_array()[..])
429429
}
430430
}
431+
432+
// ════════════════════════════════════════════════════════════════════
433+
// I32x8 — 8-lane i32 (256-bit, added 2026-05-20 missing-lanes sweep)
434+
// ════════════════════════════════════════════════════════════════════
435+
436+
/// 8-lane `i32` SIMD vector backed by `core::simd::i32x8`.
437+
///
438+
/// API mirrors `simd_avx512::I32x16` at half-width. Miri-executable.
439+
#[derive(Copy, Clone, Debug)]
440+
#[repr(transparent)]
441+
pub struct I32x8(pub i32x8);
442+
443+
impl I32x8 {
444+
pub const LANES: usize = 8;
445+
446+
#[inline(always)]
447+
pub fn splat(v: i32) -> Self {
448+
Self(i32x8::splat(v))
449+
}
450+
451+
#[inline(always)]
452+
pub fn from_slice(s: &[i32]) -> Self {
453+
assert!(s.len() >= 8, "I32x8::from_slice needs >=8 elements");
454+
Self(i32x8::from_slice(s))
455+
}
456+
457+
#[inline(always)]
458+
pub fn from_array(arr: [i32; 8]) -> Self {
459+
Self(i32x8::from_array(arr))
460+
}
461+
462+
#[inline(always)]
463+
pub fn to_array(self) -> [i32; 8] {
464+
self.0.to_array()
465+
}
466+
467+
#[inline(always)]
468+
pub fn copy_to_slice(self, s: &mut [i32]) {
469+
assert!(s.len() >= 8, "I32x8::copy_to_slice needs >=8 elements");
470+
self.0.copy_to_slice(s);
471+
}
472+
473+
#[inline(always)]
474+
pub fn reduce_sum(self) -> i32 {
475+
self.0.reduce_sum()
476+
}
477+
#[inline(always)]
478+
pub fn reduce_min(self) -> i32 {
479+
self.0.reduce_min()
480+
}
481+
#[inline(always)]
482+
pub fn reduce_max(self) -> i32 {
483+
self.0.reduce_max()
484+
}
485+
486+
#[inline(always)]
487+
pub fn simd_min(self, other: Self) -> Self {
488+
Self(self.0.simd_min(other.0))
489+
}
490+
#[inline(always)]
491+
pub fn simd_max(self, other: Self) -> Self {
492+
Self(self.0.simd_max(other.0))
493+
}
494+
495+
#[inline(always)]
496+
pub fn cmpeq_mask(self, other: Self) -> u8 {
497+
self.0.simd_eq(other.0).to_bitmask() as u8
498+
}
499+
#[inline(always)]
500+
pub fn cmpgt_mask(self, other: Self) -> u8 {
501+
self.0.simd_gt(other.0).to_bitmask() as u8
502+
}
503+
}
504+
505+
impl Default for I32x8 {
506+
#[inline(always)]
507+
fn default() -> Self {
508+
Self::splat(0)
509+
}
510+
}
511+
512+
impl PartialEq for I32x8 {
513+
#[inline(always)]
514+
fn eq(&self, other: &Self) -> bool {
515+
self.to_array() == other.to_array()
516+
}
517+
}
518+
519+
// ════════════════════════════════════════════════════════════════════
520+
// I64x4 — 4-lane i64 (256-bit, added 2026-05-20 missing-lanes sweep)
521+
// ════════════════════════════════════════════════════════════════════
522+
523+
/// 4-lane `i64` SIMD vector backed by `core::simd::i64x4`.
524+
///
525+
/// API mirrors `simd_avx512::I64x8` at half-width. Miri-executable.
526+
#[derive(Copy, Clone, Debug)]
527+
#[repr(transparent)]
528+
pub struct I64x4(pub i64x4);
529+
530+
impl I64x4 {
531+
pub const LANES: usize = 4;
532+
533+
#[inline(always)]
534+
pub fn splat(v: i64) -> Self {
535+
Self(i64x4::splat(v))
536+
}
537+
538+
#[inline(always)]
539+
pub fn from_slice(s: &[i64]) -> Self {
540+
assert!(s.len() >= 4, "I64x4::from_slice needs >=4 elements");
541+
Self(i64x4::from_slice(s))
542+
}
543+
544+
#[inline(always)]
545+
pub fn from_array(arr: [i64; 4]) -> Self {
546+
Self(i64x4::from_array(arr))
547+
}
548+
549+
#[inline(always)]
550+
pub fn to_array(self) -> [i64; 4] {
551+
self.0.to_array()
552+
}
553+
554+
#[inline(always)]
555+
pub fn copy_to_slice(self, s: &mut [i64]) {
556+
assert!(s.len() >= 4, "I64x4::copy_to_slice needs >=4 elements");
557+
self.0.copy_to_slice(s);
558+
}
559+
560+
#[inline(always)]
561+
pub fn reduce_sum(self) -> i64 {
562+
self.0.reduce_sum()
563+
}
564+
#[inline(always)]
565+
pub fn reduce_min(self) -> i64 {
566+
self.0.reduce_min()
567+
}
568+
#[inline(always)]
569+
pub fn reduce_max(self) -> i64 {
570+
self.0.reduce_max()
571+
}
572+
573+
#[inline(always)]
574+
pub fn simd_min(self, other: Self) -> Self {
575+
Self(self.0.simd_min(other.0))
576+
}
577+
#[inline(always)]
578+
pub fn simd_max(self, other: Self) -> Self {
579+
Self(self.0.simd_max(other.0))
580+
}
581+
582+
#[inline(always)]
583+
pub fn cmpeq_mask(self, other: Self) -> u8 {
584+
self.0.simd_eq(other.0).to_bitmask() as u8
585+
}
586+
#[inline(always)]
587+
pub fn cmpgt_mask(self, other: Self) -> u8 {
588+
self.0.simd_gt(other.0).to_bitmask() as u8
589+
}
590+
}
591+
592+
impl Default for I64x4 {
593+
#[inline(always)]
594+
fn default() -> Self {
595+
Self::splat(0)
596+
}
597+
}
598+
599+
impl PartialEq for I64x4 {
600+
#[inline(always)]
601+
fn eq(&self, other: &Self) -> bool {
602+
self.to_array() == other.to_array()
603+
}
604+
}

src/simd_nightly/mod.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@ pub use f16_types::F16x16;
3939
pub use f32_types::{F32x16, F32x8};
4040
pub use f64_types::{F64x4, F64x8};
4141
pub use i8_types::{I8x32, I8x64};
42-
pub use i_word_types::{I16x16, I16x32, I32x16, I64x8};
42+
pub use i_word_types::{I16x16, I16x32, I32x16, I32x8, I64x4, I64x8};
4343
pub use masks::{F32Mask16, F32Mask8, F64Mask4, F64Mask8};
4444
pub use u8_types::{U8x32, U8x64};
45-
pub use u_word_types::{U16x32, U32x16, U32x8, U64x4, U64x8};
45+
pub use u_word_types::{U16x16, U16x32, U32x16, U32x8, U64x4, U64x8};
4646

4747
// Lowercase aliases — match the std::simd convention used by
4848
// `simd_avx2.rs`, `simd_avx512.rs`, and the scalar fallback in
@@ -83,3 +83,10 @@ pub type i16x16 = I16x16;
8383
pub type i32x16 = I32x16;
8484
#[allow(non_camel_case_types)]
8585
pub type i64x8 = I64x8;
86+
// 256-bit aliases for the missing-lanes sweep (2026-05-20).
87+
#[allow(non_camel_case_types)]
88+
pub type u16x16 = U16x16;
89+
#[allow(non_camel_case_types)]
90+
pub type i32x8 = I32x8;
91+
#[allow(non_camel_case_types)]
92+
pub type i64x4 = I64x4;

0 commit comments

Comments
 (0)