Skip to content

Commit b5bca4e

Browse files
committed
feat(simd_int_ops): MX-T1a — lift add_i8 / sub_i8 / add_i16 to polyfilled lanes
Phase 1 of the per-CPU integration plan: the integer-elementwise slice ops in simd_int_ops were uniformly scalar on every CPU despite the polyfilled I8x64 / I16x32 lanes existing and being SIMD-backed on every backend. This routes the three ops through the polyfill. Per-backend dispatch follows the existing min_i8 / max_i8 template: x86_64 → I8x64 / I16x32 (AVX-512BW _mm512_add_epi8 zmm / AVX2 polyfill of I8x64 as 2×__m256i on v3 builds) aarch64 → I8x16 / I16x8 (NEON vaddq_s8 / vaddq_s16) other → scalar wrapping loop (unchanged) Wrapping arithmetic is preserved on every path: _mm512_add_epi8 and vaddq_s8 are bit-for-bit equivalent to i8::wrapping_add, so the existing tests (add_i8_matches_scalar_for_tail_lengths covering lengths 0/1/32/63/64/65/127/128/129/256) verify correctness across the cfg chain. No new tests needed — the parity-against-scalar sweep already exercised every boundary. Verification: * default v3 build (uses AVX2 polyfill of I8x64): 15 simd_int_ops tests pass; 2087 lib tests pass; clippy -D warnings clean. * cascadelake config (native _mm512_add_epi8 / _mm512_add_epi16): 15 simd_int_ops tests pass. * sapphirerapids config: NOT verified — the dev-runtime CPU on this host advertises only avx512_vnni in /proc/cpuinfo (no AMX / BF16 / FP16), so SPR-targeted binaries SIGILL on UNRELATED pre-existing tests like min_max_i8_boundary_values. The SPR config's correctness needs verification on real SPR silicon. Companion matrix entries flipped: C. simd_int_ops → row `add_i8` : ⚠️ scalar 🚨 → ✅ I8x64/I8x16 row `sub_i8` : ⚠️ scalar 🚨 → ✅ I8x64/I8x16 row `add_i16` : ⚠️ scalar 🚨 → ✅ I16x32/I16x8 Remaining Phase 1 work in simd_int_ops: MX-T1b — `dot_i8` / `dot_i16` require a widening-multiply-add polyfill primitive (i8×i8 → i32 via VPMADDUBSW + horizontal add on x86, vmlal_s16 + vaddv_s32 on NEON). The widening-multiply primitive doesn't yet exist on the polyfilled types; promoting these without it would force per-arch intrinsics into simd_int_ops, violating the agnostic-surface principle. Defer to the polyfill-primitive PR. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
1 parent 058ef61 commit b5bca4e

1 file changed

Lines changed: 139 additions & 7 deletions

File tree

src/simd_int_ops.rs

Lines changed: 139 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,162 @@
1919

2020
/// Element-wise `dst[i] += src[i]` (wrapping i8 add).
2121
///
22-
/// Panics if `dst.len() != src.len()`.
22+
/// Dispatches to the widest available SIMD lane:
23+
///
24+
/// | Backend | Lane | Per-iteration intrinsic |
25+
/// |------------|---------|-------------------------|
26+
/// | x86_64 | `I8x64` | `_mm512_add_epi8` zmm (AVX-512-BW) / 2× `_mm256_add_epi8` ymm (AVX2 polyfill of `I8x64`) |
27+
/// | aarch64 | `I8x16` | `vaddq_s8` × N |
28+
/// | other | scalar | `i8::wrapping_add` lane-by-lane |
29+
///
30+
/// Wrapping arithmetic. Panics if `dst.len() != src.len()`.
2331
#[inline]
2432
pub fn add_i8(dst: &mut [i8], src: &[i8]) {
2533
assert_eq!(dst.len(), src.len(), "add_i8: length mismatch");
26-
for i in 0..dst.len() {
27-
dst[i] = dst[i].wrapping_add(src[i]);
34+
let n = dst.len();
35+
36+
#[cfg(target_arch = "x86_64")]
37+
{
38+
use crate::simd::I8x64;
39+
const L: usize = 64;
40+
let chunks = n / L;
41+
for c in 0..chunks {
42+
let off = c * L;
43+
let d = I8x64::from_slice(&dst[off..]);
44+
let s = I8x64::from_slice(&src[off..]);
45+
let arr = (d + s).to_array();
46+
dst[off..off + L].copy_from_slice(&arr);
47+
}
48+
for i in (chunks * L)..n {
49+
dst[i] = dst[i].wrapping_add(src[i]);
50+
}
51+
}
52+
53+
#[cfg(target_arch = "aarch64")]
54+
{
55+
use crate::simd_neon::I8x16;
56+
const L: usize = 16;
57+
let chunks = n / L;
58+
for c in 0..chunks {
59+
let off = c * L;
60+
let d = I8x16::from_slice(&dst[off..]);
61+
let s = I8x16::from_slice(&src[off..]);
62+
let arr = d.add(s).to_array();
63+
dst[off..off + L].copy_from_slice(&arr);
64+
}
65+
for i in (chunks * L)..n {
66+
dst[i] = dst[i].wrapping_add(src[i]);
67+
}
68+
}
69+
70+
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
71+
{
72+
for i in 0..n {
73+
dst[i] = dst[i].wrapping_add(src[i]);
74+
}
2875
}
2976
}
3077

3178
/// Element-wise `dst[i] -= src[i]` (wrapping i8 sub).
79+
///
80+
/// Dispatches the same way as [`add_i8`] (zmm AVX-512-BW / ymm AVX2 /
81+
/// 128-bit NEON / scalar) using the polyfilled lane's `Sub`
82+
/// implementation.
3283
#[inline]
3384
pub fn sub_i8(dst: &mut [i8], src: &[i8]) {
3485
assert_eq!(dst.len(), src.len(), "sub_i8: length mismatch");
35-
for i in 0..dst.len() {
36-
dst[i] = dst[i].wrapping_sub(src[i]);
86+
let n = dst.len();
87+
88+
#[cfg(target_arch = "x86_64")]
89+
{
90+
use crate::simd::I8x64;
91+
const L: usize = 64;
92+
let chunks = n / L;
93+
for c in 0..chunks {
94+
let off = c * L;
95+
let d = I8x64::from_slice(&dst[off..]);
96+
let s = I8x64::from_slice(&src[off..]);
97+
let arr = (d - s).to_array();
98+
dst[off..off + L].copy_from_slice(&arr);
99+
}
100+
for i in (chunks * L)..n {
101+
dst[i] = dst[i].wrapping_sub(src[i]);
102+
}
103+
}
104+
105+
#[cfg(target_arch = "aarch64")]
106+
{
107+
use crate::simd_neon::I8x16;
108+
const L: usize = 16;
109+
let chunks = n / L;
110+
for c in 0..chunks {
111+
let off = c * L;
112+
let d = I8x16::from_slice(&dst[off..]);
113+
let s = I8x16::from_slice(&src[off..]);
114+
let arr = d.sub(s).to_array();
115+
dst[off..off + L].copy_from_slice(&arr);
116+
}
117+
for i in (chunks * L)..n {
118+
dst[i] = dst[i].wrapping_sub(src[i]);
119+
}
120+
}
121+
122+
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
123+
{
124+
for i in 0..n {
125+
dst[i] = dst[i].wrapping_sub(src[i]);
126+
}
37127
}
38128
}
39129

40130
/// Element-wise `dst[i] += src[i]` (wrapping i16 add).
131+
///
132+
/// Dispatches to `I16x32` (AVX-512-BW `_mm512_add_epi16`) on x86_64,
133+
/// `I16x8` (`vaddq_s16`) on aarch64, scalar otherwise.
41134
#[inline]
42135
pub fn add_i16(dst: &mut [i16], src: &[i16]) {
43136
assert_eq!(dst.len(), src.len(), "add_i16: length mismatch");
44-
for i in 0..dst.len() {
45-
dst[i] = dst[i].wrapping_add(src[i]);
137+
let n = dst.len();
138+
139+
#[cfg(target_arch = "x86_64")]
140+
{
141+
use crate::simd::I16x32;
142+
const L: usize = 32;
143+
let chunks = n / L;
144+
for c in 0..chunks {
145+
let off = c * L;
146+
let d = I16x32::from_slice(&dst[off..]);
147+
let s = I16x32::from_slice(&src[off..]);
148+
let arr = (d + s).to_array();
149+
dst[off..off + L].copy_from_slice(&arr);
150+
}
151+
for i in (chunks * L)..n {
152+
dst[i] = dst[i].wrapping_add(src[i]);
153+
}
154+
}
155+
156+
#[cfg(target_arch = "aarch64")]
157+
{
158+
use crate::simd_neon::I16x8;
159+
const L: usize = 8;
160+
let chunks = n / L;
161+
for c in 0..chunks {
162+
let off = c * L;
163+
let d = I16x8::from_slice(&dst[off..]);
164+
let s = I16x8::from_slice(&src[off..]);
165+
let arr = d.add(s).to_array();
166+
dst[off..off + L].copy_from_slice(&arr);
167+
}
168+
for i in (chunks * L)..n {
169+
dst[i] = dst[i].wrapping_add(src[i]);
170+
}
171+
}
172+
173+
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
174+
{
175+
for i in 0..n {
176+
dst[i] = dst[i].wrapping_add(src[i]);
177+
}
46178
}
47179
}
48180

0 commit comments

Comments
 (0)