feat(simd): missing-lanes sweep — U16x16/U32x8/U64x4/I32x8/I64x4 across all backends

claude · claude · commit 2ef97c03977f · 2026-05-20T16:57:44.000Z
PR #178's matrix audit surfaced five 256-bit int lane types that were either entirely missing or stranded in `simd_nightly` only. Adds them across every backend so `crate::simd::{U16x16, U32x8, U64x4, I32x8, I64x4}` resolves uniformly on v3 / v4 / native / nightly / scalar / aarch64 paths. `src/simd_avx2.rs` + 5× `avx2_int_type!` instantiations producing scalar-storage `[$elem; $lanes]` polyfills (align 64). Same macro pattern as the existing 512-bit polyfills (U8x64, U16x32, …). Native AVX2 `__m256i` upgrades are TD-SIMD-3. + 5× lowercase aliases (`u16x16 = U16x16`, etc.) matching the std::simd convention used by every other lane type in the file. `src/simd_scalar.rs` + 5× `impl_int_type!` instantiations mirroring the AVX2 polyfills above. Consumers on non-x86/non-aarch64 (wasm32, riscv, thumb) reach the same type names through `crate::simd::*`. + Lowercase aliases. `src/simd_avx512.rs` + Re-export of the new types from `simd_avx2` so the v4 dispatch arm in `simd.rs` can surface them without forking the macro into this file. Both files are already gated on `target_arch = "x86_64"`, so the re-export is cheap. Native `__m256i` upgrades here are TD-SIMD-3 (same story as the v3 polyfills). `src/simd_nightly/u_word_types.rs` + `U16x16` wrapper backed by `core::simd::u16x16`. Same API surface as the existing 32-/16-/8-lane wrappers — splat, from_slice, from_array, to_array, copy_to_slice, reduce_{sum,min,max}, simd_min/max, cmpeq_mask, cmpgt_mask, Default. `src/simd_nightly/i_word_types.rs` + `I32x8` and `I64x4` wrappers backed by `core::simd::{i32x8, i64x4}`. Same API surface as siblings; PartialEq via array compare. `src/simd_nightly/mod.rs` + Re-exports for the three new types + lowercase aliases. `src/simd.rs` + All 5 dispatch arms (nightly, v4, v3, aarch64, scalar fallback) updated to surface the new types through `crate::simd::*`. `.claude/knowledge/simd-dispatch-architecture.md` + Parity matrix updated — the five rows previously marked ❌ across most backends now show 🟠 polyfill (v3, v4-via-v3, scalar) / 🔵 (nightly via `core::simd`). Verified: `cargo check` clean under default v3 features and under `-Ctarget-cpu=x86-64-v4` (via `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS` + explicit `--target` so build scripts don't SIGILL on non-AVX-512 runners — same pattern as the tier4-avx512-check job).
diff --git a/.claude/knowledge/simd-dispatch-architecture.md b/.claude/knowledge/simd-dispatch-architecture.md
@@ -150,19 +150,19 @@ tracked as TD-SIMD-3.)
 | `U8x64` | ✅ `__m512i` | 🟠 `[u8; 64]` polyfill | ❌ | 🔵 | ✅ |
 | `U8x32` | ✅ `__m256i` | ✅ `__m256i` | ❌ | 🔵 | ✅ |
 | `U16x32` | ✅ `__m512i` | 🟠 `[u16; 32]` polyfill | ❌ | 🔵 | ✅ |
-| `U16x16` | ❌ | ❌ | ❌ | ❌ | ❌ |
+| `U16x16` | 🟠 (via `simd_avx2`) | 🟠 `[u16; 16]` polyfill | ❌ | 🔵 `core::simd::u16x16` | 🟠 |
 | `U32x16` | ✅ `__m512i` | 🟠 `[u32; 16]` polyfill | ❌ | 🔵 | ✅ |
-| `U32x8` | ❌ | ❌ | ❌ | 🔵 `core::simd::u32x8` | ❌ |
+| `U32x8` | 🟠 (via `simd_avx2`) | 🟠 `[u32; 8]` polyfill | ❌ | 🔵 `core::simd::u32x8` | 🟠 |
 | `U64x8` | ✅ `__m512i` | 🟠 `[u64; 8]` polyfill | ❌ | 🔵 | ✅ |
-| `U64x4` | ❌ | ❌ | ❌ | 🔵 `core::simd::u64x4` | ❌ |
+| `U64x4` | 🟠 (via `simd_avx2`) | 🟠 `[u64; 4]` polyfill | ❌ | 🔵 `core::simd::u64x4` | 🟠 |
 | `I8x32` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ |
 | `I8x64` | ✅ `__m512i` | 🟠 `[i8; 64]` polyfill | ❌ | 🔵 | ✅ |
 | `I16x16` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ |
 | `I16x32` | ✅ `__m512i` | 🟠 `[i16; 32]` polyfill | ❌ | 🔵 | ✅ |
 | `I32x16` | ✅ `__m512i` | 🟠 `[i32; 16]` polyfill | ❌ | 🔵 | ✅ |
-| `I32x8` | ❌ | ❌ | ❌ | ❌ | ❌ |
+| `I32x8` | 🟠 (via `simd_avx2`) | 🟠 `[i32; 8]` polyfill | ❌ | 🔵 `core::simd::i32x8` | 🟠 |
 | `I64x8` | ✅ `__m512i` | 🟠 `[i64; 8]` polyfill | ❌ | 🔵 | ✅ |
-| `I64x4` | ❌ | ❌ | ❌ | ❌ | ❌ |
+| `I64x4` | 🟠 (via `simd_avx2`) | 🟠 `[i64; 4]` polyfill | ❌ | 🔵 `core::simd::i64x4` | 🟠 |
 | `BF16x8` | ✅ `__m128bh` | ❌ | ❌ | 🔵 | ✅ |
 | `BF16x16` | ✅ `__m256bh` | ❌ | ❌ | 🔵 | ✅ |
 | `F16x16` | ❌ | 🟠 `[u16; 16]` polyfill (`simd_half`) | 🟠 `[u16; 16]` polyfill (`simd_half`) | 🔵 | ✅ |
diff --git a/src/simd.rs b/src/simd.rs
@@ -220,9 +220,10 @@ pub const PREFERRED_I16_LANES: usize = 16;
 // as soon as `nightly-simd` is on.
 #[cfg(feature = "nightly-simd")]
 pub use crate::simd_nightly::{
-    f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u16x32, u32x16, u32x8, u64x4, u64x8,
-    u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8,
-    I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64,
+    f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u16x32, u32x16,
+    u32x8, u64x4, u64x8, u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8,
+    F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8, I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4,
+    U64x8, U8x32, U8x64,
 };
 
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))]
@@ -234,10 +235,15 @@ pub use crate::simd_avx512::{
     i16x16,
     i16x32,
     i32x16,
+    i32x8,
+    i64x4,
     i64x8,
     i8x32,
     i8x64,
+    u16x16,
     u32x16,
+    u32x8,
+    u64x4,
     u64x8,
     u8x64,
     F32Mask16,
@@ -251,11 +257,18 @@ pub use crate::simd_avx512::{
     I16x16,
     I16x32,
     I32x16,
+    // 256-bit int polyfills surfaced 2026-05-20 (re-exported from
+    // `simd_avx2` via `simd_avx512`'s re-export at line ~2260).
+    I32x8,
+    I64x4,
     I64x8,
     I8x32,
     I8x64,
+    U16x16,
     U16x32,
     U32x16,
+    U32x8,
+    U64x4,
     U64x8,
     U8x64,
 };
@@ -302,8 +315,9 @@ pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16,
     not(feature = "nightly-simd")
 ))]
 pub use crate::simd_avx2::{
-    f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32,
-    I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
+    f32x16, f64x8, i16x32, i32x16, i32x8, i64x4, i64x8, i8x64, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32Mask16,
+    F32x16, F64Mask8, F64x8, I16x32, I32x16, I32x8, I64x4, I64x8, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8,
+    U8x64,
 };
 
 // U8x32 — native AVX2 byte width (one __m256i = 32 bytes). Available on
@@ -335,7 +349,8 @@ pub(crate) mod scalar;
 pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8};
 #[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
 pub use scalar::{
-    f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64,
+    f32x8, f64x4, i32x16, i32x8, i64x4, i64x8, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32x8, F64x4, I32x16, I32x8,
+    I64x4, I64x8, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64,
 };
 
 // Other non-x86 targets (wasm, riscv, etc.): full scalar fallback.
@@ -345,8 +360,9 @@ pub use scalar::{
     not(feature = "nightly-simd")
 ))]
 pub use scalar::{
-    f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16,
-    F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64,
+    f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u32x16, u32x8,
+    u64x4, u64x8, u8x64, F32Mask16, F32x16, F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8,
+    I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64,
 };
 
 // Scalar BF16 conversion — always available on all platforms
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
@@ -1542,6 +1542,19 @@ avx2_int_type!(U16x32, u16, 32, 0u16);
 avx2_int_type!(U32x16, u32, 16, 0u32);
 avx2_int_type!(U64x8, u64, 8, 0u64);
 
+// 256-bit int lanes — scalar polyfills filling the gap surfaced by the
+// 2026-05-20 matrix audit. None of these had wrappers anywhere except
+// for `U32x8` / `U64x4` in `simd_nightly`. Adding `U16x16`, `U32x8`,
+// `U64x4`, `I32x8`, `I64x4` here mirrors the existing 512-bit polyfill
+// pattern (`[$elem; $lanes]` storage, align 64). Native AVX2 `__m256i`
+// upgrades for these are TD-SIMD-3 (the same fold-into-real-SIMD task
+// already tracked for the 512-bit polyfills above).
+avx2_int_type!(U16x16, u16, 16, 0u16);
+avx2_int_type!(U32x8, u32, 8, 0u32);
+avx2_int_type!(U64x4, u64, 4, 0u64);
+avx2_int_type!(I32x8, i32, 8, 0i32);
+avx2_int_type!(I64x4, i64, 4, 0i64);
+
 // Extra methods for U16x32 (widen/narrow, shift, multiply) — AVX2 scalar fallback.
 impl U16x32 {
     #[inline(always)]
@@ -2266,6 +2279,19 @@ pub type i8x64 = I8x64;
 #[allow(non_camel_case_types)]
 pub type i16x32 = I16x32;
 
+// Lowercase aliases for the 256-bit polyfills added in the 2026-05-20
+// missing-lanes sweep.
+#[allow(non_camel_case_types)]
+pub type u16x16 = U16x16;
+#[allow(non_camel_case_types)]
+pub type u32x8 = U32x8;
+#[allow(non_camel_case_types)]
+pub type u64x4 = U64x4;
+#[allow(non_camel_case_types)]
+pub type i32x8 = I32x8;
+#[allow(non_camel_case_types)]
+pub type i64x4 = I64x4;
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
@@ -2256,6 +2256,14 @@ pub type i16x32 = I16x32;
 #[allow(non_camel_case_types)]
 pub type i16x16 = I16x16;
 
+// 256-bit int lanes — added 2026-05-20 missing-lanes sweep. These types
+// don't have native `__m256i` wrappers in this module yet; re-exported
+// from `simd_avx2.rs` (where they live as scalar-storage polyfills via
+// the `avx2_int_type!` macro) so the v4 dispatch arm in `simd.rs` can
+// surface them through `crate::simd::*` with the same names the v3 arm
+// uses. Native AVX2 `__m256i` upgrades for these are TD-SIMD-3.
+pub use crate::simd_avx2::{i32x8, i64x4, u16x16, u32x8, u64x4, I32x8, I64x4, U16x16, U32x8, U64x4};
+
 // ============================================================================
 // BF16 conversion wrappers — AVX-512 BF16 hardware instructions
 // ============================================================================
diff --git a/src/simd_nightly/i_word_types.rs b/src/simd_nightly/i_word_types.rs
@@ -3,7 +3,7 @@
 
 use core::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd};
 use core::simd::num::SimdInt;
-use core::simd::{i16x16, i16x32, i32x16, i64x8};
+use core::simd::{i16x16, i16x32, i32x16, i32x8, i64x4, i64x8};
 
 // ════════════════════════════════════════════════════════════════════
 // I16x16 — 16-lane signed 16-bit integer
@@ -428,3 +428,177 @@ impl core::fmt::Display for I64x8 {
         write!(f, "I64x8({:?})", &self.to_array()[..])
     }
 }
+
+// ════════════════════════════════════════════════════════════════════
+// I32x8 — 8-lane i32 (256-bit, added 2026-05-20 missing-lanes sweep)
+// ════════════════════════════════════════════════════════════════════
+
+/// 8-lane `i32` SIMD vector backed by `core::simd::i32x8`.
+///
+/// API mirrors `simd_avx512::I32x16` at half-width. Miri-executable.
+#[derive(Copy, Clone, Debug)]
+#[repr(transparent)]
+pub struct I32x8(pub i32x8);
+
+impl I32x8 {
+    pub const LANES: usize = 8;
+
+    #[inline(always)]
+    pub fn splat(v: i32) -> Self {
+        Self(i32x8::splat(v))
+    }
+
+    #[inline(always)]
+    pub fn from_slice(s: &[i32]) -> Self {
+        assert!(s.len() >= 8, "I32x8::from_slice needs >=8 elements");
+        Self(i32x8::from_slice(s))
+    }
+
+    #[inline(always)]
+    pub fn from_array(arr: [i32; 8]) -> Self {
+        Self(i32x8::from_array(arr))
+    }
+
+    #[inline(always)]
+    pub fn to_array(self) -> [i32; 8] {
+        self.0.to_array()
+    }
+
+    #[inline(always)]
+    pub fn copy_to_slice(self, s: &mut [i32]) {
+        assert!(s.len() >= 8, "I32x8::copy_to_slice needs >=8 elements");
+        self.0.copy_to_slice(s);
+    }
+
+    #[inline(always)]
+    pub fn reduce_sum(self) -> i32 {
+        self.0.reduce_sum()
+    }
+    #[inline(always)]
+    pub fn reduce_min(self) -> i32 {
+        self.0.reduce_min()
+    }
+    #[inline(always)]
+    pub fn reduce_max(self) -> i32 {
+        self.0.reduce_max()
+    }
+
+    #[inline(always)]
+    pub fn simd_min(self, other: Self) -> Self {
+        Self(self.0.simd_min(other.0))
+    }
+    #[inline(always)]
+    pub fn simd_max(self, other: Self) -> Self {
+        Self(self.0.simd_max(other.0))
+    }
+
+    #[inline(always)]
+    pub fn cmpeq_mask(self, other: Self) -> u8 {
+        self.0.simd_eq(other.0).to_bitmask() as u8
+    }
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u8 {
+        self.0.simd_gt(other.0).to_bitmask() as u8
+    }
+}
+
+impl Default for I32x8 {
+    #[inline(always)]
+    fn default() -> Self {
+        Self::splat(0)
+    }
+}
+
+impl PartialEq for I32x8 {
+    #[inline(always)]
+    fn eq(&self, other: &Self) -> bool {
+        self.to_array() == other.to_array()
+    }
+}
+
+// ════════════════════════════════════════════════════════════════════
+// I64x4 — 4-lane i64 (256-bit, added 2026-05-20 missing-lanes sweep)
+// ════════════════════════════════════════════════════════════════════
+
+/// 4-lane `i64` SIMD vector backed by `core::simd::i64x4`.
+///
+/// API mirrors `simd_avx512::I64x8` at half-width. Miri-executable.
+#[derive(Copy, Clone, Debug)]
+#[repr(transparent)]
+pub struct I64x4(pub i64x4);
+
+impl I64x4 {
+    pub const LANES: usize = 4;
+
+    #[inline(always)]
+    pub fn splat(v: i64) -> Self {
+        Self(i64x4::splat(v))
+    }
+
+    #[inline(always)]
+    pub fn from_slice(s: &[i64]) -> Self {
+        assert!(s.len() >= 4, "I64x4::from_slice needs >=4 elements");
+        Self(i64x4::from_slice(s))
+    }
+
+    #[inline(always)]
+    pub fn from_array(arr: [i64; 4]) -> Self {
+        Self(i64x4::from_array(arr))
+    }
+
+    #[inline(always)]
+    pub fn to_array(self) -> [i64; 4] {
+        self.0.to_array()
+    }
+
+    #[inline(always)]
+    pub fn copy_to_slice(self, s: &mut [i64]) {
+        assert!(s.len() >= 4, "I64x4::copy_to_slice needs >=4 elements");
+        self.0.copy_to_slice(s);
+    }
+
+    #[inline(always)]
+    pub fn reduce_sum(self) -> i64 {
+        self.0.reduce_sum()
+    }
+    #[inline(always)]
+    pub fn reduce_min(self) -> i64 {
+        self.0.reduce_min()
+    }
+    #[inline(always)]
+    pub fn reduce_max(self) -> i64 {
+        self.0.reduce_max()
+    }
+
+    #[inline(always)]
+    pub fn simd_min(self, other: Self) -> Self {
+        Self(self.0.simd_min(other.0))
+    }
+    #[inline(always)]
+    pub fn simd_max(self, other: Self) -> Self {
+        Self(self.0.simd_max(other.0))
+    }
+
+    #[inline(always)]
+    pub fn cmpeq_mask(self, other: Self) -> u8 {
+        self.0.simd_eq(other.0).to_bitmask() as u8
+    }
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u8 {
+        self.0.simd_gt(other.0).to_bitmask() as u8
+    }
+}
+
+impl Default for I64x4 {
+    #[inline(always)]
+    fn default() -> Self {
+        Self::splat(0)
+    }
+}
+
+impl PartialEq for I64x4 {
+    #[inline(always)]
+    fn eq(&self, other: &Self) -> bool {
+        self.to_array() == other.to_array()
+    }
+}
diff --git a/src/simd_nightly/mod.rs b/src/simd_nightly/mod.rs
@@ -39,10 +39,10 @@ pub use f16_types::F16x16;
 pub use f32_types::{F32x16, F32x8};
 pub use f64_types::{F64x4, F64x8};
 pub use i8_types::{I8x32, I8x64};
-pub use i_word_types::{I16x16, I16x32, I32x16, I64x8};
+pub use i_word_types::{I16x16, I16x32, I32x16, I32x8, I64x4, I64x8};
 pub use masks::{F32Mask16, F32Mask8, F64Mask4, F64Mask8};
 pub use u8_types::{U8x32, U8x64};
-pub use u_word_types::{U16x32, U32x16, U32x8, U64x4, U64x8};
+pub use u_word_types::{U16x16, U16x32, U32x16, U32x8, U64x4, U64x8};
 
 // Lowercase aliases — match the std::simd convention used by
 // `simd_avx2.rs`, `simd_avx512.rs`, and the scalar fallback in
@@ -83,3 +83,10 @@ pub type i16x16 = I16x16;
 pub type i32x16 = I32x16;
 #[allow(non_camel_case_types)]
 pub type i64x8 = I64x8;
+// 256-bit aliases for the missing-lanes sweep (2026-05-20).
+#[allow(non_camel_case_types)]
+pub type u16x16 = U16x16;
+#[allow(non_camel_case_types)]
+pub type i32x8 = I32x8;
+#[allow(non_camel_case_types)]
+pub type i64x4 = I64x4;
diff --git a/src/simd_nightly/u_word_types.rs b/src/simd_nightly/u_word_types.rs
diff --git a/src/simd_scalar.rs b/src/simd_scalar.rs