feat(simd_int_ops): MX-T1a — lift add_i8 / sub_i8 / add_i16 to polyfilled lanes

claude · claude · commit b5bca4ecca00 · 2026-05-20T23:47:37.000Z
Phase 1 of the per-CPU integration plan: the integer-elementwise slice ops in simd_int_ops were uniformly scalar on every CPU, even though the polyfilled I8x64 / I16x32 lanes already exist and are SIMD-backed on every backend. This commit routes the three ops through the polyfill.

Per-backend dispatch follows the existing min_i8 / max_i8 template:

  x86_64  → I8x64 / I16x32 (AVX-512BW _mm512_add_epi8 zmm / AVX2 polyfill of I8x64 as 2×__m256i on v3 builds)
  aarch64 → I8x16 / I16x8 (NEON vaddq_s8 / vaddq_s16)
  other   → scalar wrapping loop (unchanged)

Wrapping arithmetic is preserved on every path: _mm512_add_epi8 and vaddq_s8 are bit-for-bit equivalent to i8::wrapping_add, so the existing tests (add_i8_matches_scalar_for_tail_lengths covering lengths 0/1/32/63/64/65/127/128/129/256) verify correctness across the cfg chain. No new tests are needed — the parity-against-scalar sweep already exercises every boundary.

Verification:

* default v3 build (uses AVX2 polyfill of I8x64): 15 simd_int_ops tests pass; 2087 lib tests pass; clippy -D warnings clean.
* cascadelake config (native _mm512_add_epi8 / _mm512_add_epi16): 15 simd_int_ops tests pass.
* sapphirerapids config: NOT verified — the dev-runtime CPU on this host advertises only avx512_vnni in /proc/cpuinfo (no AMX / BF16 / FP16), so SPR-targeted binaries SIGILL on UNRELATED pre-existing tests like min_max_i8_boundary_values. The SPR config's correctness needs verification on real SPR silicon.

Companion matrix entries flipped:

  C. simd_int_ops →
    row `add_i8`  : ⚠️ scalar 🚨 → ✅ I8x64/I8x16
    row `sub_i8`  : ⚠️ scalar 🚨 → ✅ I8x64/I8x16
    row `add_i16` : ⚠️ scalar 🚨 → ✅ I16x32/I16x8

Remaining Phase 1 work in simd_int_ops:

MX-T1b — `dot_i8` / `dot_i16` require a widening-multiply-add polyfill primitive (i8×i8 → i32 via VPMADDUBSW + horizontal add on x86, vmlal_s16 + vaddv_s32 on NEON). The widening-multiply primitive doesn't yet exist on the polyfilled types; promoting these ops without it would force per-arch intrinsics into simd_int_ops, violating the agnostic-surface principle. Deferred to the polyfill-primitive PR.

https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
diff --git a/src/simd_int_ops.rs b/src/simd_int_ops.rs
@@ -19,30 +19,162 @@
 
 /// Element-wise `dst[i] += src[i]` (wrapping i8 add).
 ///
-/// Panics if `dst.len() != src.len()`.
+/// Dispatches to the widest available SIMD lane:
+///
+/// | Backend    | Lane    | Per-iteration intrinsic |
+/// |------------|---------|-------------------------|
+/// | x86_64     | `I8x64` | `_mm512_add_epi8` zmm (AVX-512-BW) / 2× `_mm256_add_epi8` ymm (AVX2 polyfill of `I8x64`) |
+/// | aarch64    | `I8x16` | `vaddq_s8` × N                                |
+/// | other      | scalar  | `i8::wrapping_add` lane-by-lane               |
+///
+/// Wrapping arithmetic. Panics if `dst.len() != src.len()`.
 #[inline]
 pub fn add_i8(dst: &mut [i8], src: &[i8]) {
     assert_eq!(dst.len(), src.len(), "add_i8: length mismatch");
-    for i in 0..dst.len() {
-        dst[i] = dst[i].wrapping_add(src[i]);
+    let n = dst.len();
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        use crate::simd::I8x64;
+        const L: usize = 64;
+        let chunks = n / L;
+        for c in 0..chunks {
+            let off = c * L;
+            let d = I8x64::from_slice(&dst[off..]);
+            let s = I8x64::from_slice(&src[off..]);
+            let arr = (d + s).to_array();
+            dst[off..off + L].copy_from_slice(&arr);
+        }
+        for i in (chunks * L)..n {
+            dst[i] = dst[i].wrapping_add(src[i]);
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        use crate::simd_neon::I8x16;
+        const L: usize = 16;
+        let chunks = n / L;
+        for c in 0..chunks {
+            let off = c * L;
+            let d = I8x16::from_slice(&dst[off..]);
+            let s = I8x16::from_slice(&src[off..]);
+            let arr = d.add(s).to_array();
+            dst[off..off + L].copy_from_slice(&arr);
+        }
+        for i in (chunks * L)..n {
+            dst[i] = dst[i].wrapping_add(src[i]);
+        }
+    }
+
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    {
+        for i in 0..n {
+            dst[i] = dst[i].wrapping_add(src[i]);
+        }
     }
 }
 
 /// Element-wise `dst[i] -= src[i]` (wrapping i8 sub).
+///
+/// Dispatches the same way as [`add_i8`] (zmm AVX-512-BW / ymm AVX2 /
+/// 128-bit NEON / scalar), using the polyfilled lane's subtraction.
+/// Wrapping arithmetic. Panics if `dst.len() != src.len()`.
 #[inline]
 pub fn sub_i8(dst: &mut [i8], src: &[i8]) {
     assert_eq!(dst.len(), src.len(), "sub_i8: length mismatch");
-    for i in 0..dst.len() {
-        dst[i] = dst[i].wrapping_sub(src[i]);
+    let n = dst.len();
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        use crate::simd::I8x64;
+        const L: usize = 64;
+        let chunks = n / L;
+        for c in 0..chunks {
+            let off = c * L;
+            let d = I8x64::from_slice(&dst[off..]);
+            let s = I8x64::from_slice(&src[off..]);
+            let arr = (d - s).to_array();
+            dst[off..off + L].copy_from_slice(&arr);
+        }
+        for i in (chunks * L)..n {
+            dst[i] = dst[i].wrapping_sub(src[i]);
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        use crate::simd_neon::I8x16;
+        const L: usize = 16;
+        let chunks = n / L;
+        for c in 0..chunks {
+            let off = c * L;
+            let d = I8x16::from_slice(&dst[off..]);
+            let s = I8x16::from_slice(&src[off..]);
+            let arr = d.sub(s).to_array();
+            dst[off..off + L].copy_from_slice(&arr);
+        }
+        for i in (chunks * L)..n {
+            dst[i] = dst[i].wrapping_sub(src[i]);
+        }
+    }
+
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    {
+        for i in 0..n {
+            dst[i] = dst[i].wrapping_sub(src[i]);
+        }
     }
 }
 
 /// Element-wise `dst[i] += src[i]` (wrapping i16 add).
+///
+/// Dispatches to `I16x32` on x86_64 (`_mm512_add_epi16` under AVX-512-BW, polyfilled otherwise),
+/// `I16x8` (`vaddq_s16`) on aarch64, scalar elsewhere. Panics if `dst.len() != src.len()`.
 #[inline]
 pub fn add_i16(dst: &mut [i16], src: &[i16]) {
     assert_eq!(dst.len(), src.len(), "add_i16: length mismatch");
-    for i in 0..dst.len() {
-        dst[i] = dst[i].wrapping_add(src[i]);
+    let n = dst.len();
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        use crate::simd::I16x32;
+        const L: usize = 32;
+        let chunks = n / L;
+        for c in 0..chunks {
+            let off = c * L;
+            let d = I16x32::from_slice(&dst[off..]);
+            let s = I16x32::from_slice(&src[off..]);
+            let arr = (d + s).to_array();
+            dst[off..off + L].copy_from_slice(&arr);
+        }
+        for i in (chunks * L)..n {
+            dst[i] = dst[i].wrapping_add(src[i]);
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        use crate::simd_neon::I16x8;
+        const L: usize = 8;
+        let chunks = n / L;
+        for c in 0..chunks {
+            let off = c * L;
+            let d = I16x8::from_slice(&dst[off..]);
+            let s = I16x8::from_slice(&src[off..]);
+            let arr = d.add(s).to_array();
+            dst[off..off + L].copy_from_slice(&arr);
+        }
+        for i in (chunks * L)..n {
+            dst[i] = dst[i].wrapping_add(src[i]);
+        }
+    }
+
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    {
+        for i in 0..n {
+            dst[i] = dst[i].wrapping_add(src[i]);
+        }
     }
 }