Skip to content

Commit b85ca35

Browse files
committed
feat: I32x16 Base17 ops — from_i16_slice, abs, to_i16_array, cmpge_zero_mask
Added to all three tiers (AVX-512 / AVX2 / scalar):
- from_i16_slice(&[i16]) → I32x16 — load 16×i16 and sign-extend to 16×i32
- abs() → I32x16 — absolute value per lane
- to_i16_array() → [i16; 16] — narrow 16×i32 back to 16×i16
- cmpge_zero_mask() → u16 — bit mask where lane >= 0

These are the primitives bgz17_bridge.rs needs to replace its 92 raw intrinsics with crate::simd::I32x16 calls. Fixed the duplicate abs() in the AVX-512 I32x16. 19 bgz17_bridge tests pass. https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
1 parent 8ba065c commit b85ca35

3 files changed

Lines changed: 78 additions & 5 deletions

File tree

src/simd.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,25 @@ mod scalar {
546546
for i in 0..16 { out[i] = self.0[i].abs(); }
547547
Self(out)
548548
}
549+
#[inline(always)]
550+
pub fn from_i16_slice(s: &[i16]) -> Self {
551+
assert!(s.len() >= 16);
552+
let mut o = [0i32; 16];
553+
for i in 0..16 { o[i] = s[i] as i32; }
554+
Self(o)
555+
}
556+
#[inline(always)]
557+
pub fn to_i16_array(self) -> [i16; 16] {
558+
let mut o = [0i16; 16];
559+
for i in 0..16 { o[i] = self.0[i] as i16; }
560+
o
561+
}
562+
#[inline(always)]
563+
pub fn cmpge_zero_mask(self) -> u16 {
564+
let mut mask = 0u16;
565+
for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } }
566+
mask
567+
}
549568
}
550569

551570
impl Mul for I32x16 {

src/simd_avx2.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,31 @@ impl I32x16 {
843843
/// Lane-wise maximum of `self` and `other`.
#[inline(always)]
pub fn simd_max(self, other: Self) -> Self {
    let mut lanes = self.0;
    for (lane, &rhs) in lanes.iter_mut().zip(other.0.iter()) {
        *lane = (*lane).max(rhs);
    }
    Self(lanes)
}
844844
/// Converts every `i32` lane to `f32` with an `as` cast and packs the
/// result into an `F32x16`.
#[inline(always)]
pub fn cast_f32(self) -> F32x16 {
    let mut floats = [0.0f32; 16];
    for (dst, &lane) in floats.iter_mut().zip(self.0.iter()) {
        *dst = lane as f32;
    }
    F32x16::from_array(floats)
}
845845
/// Absolute value per lane.
///
/// Uses wrapping semantics: a lane holding `i32::MIN` (whose magnitude is
/// not representable) is returned unchanged instead of panicking when
/// overflow checks are enabled. This matches the wrapping convention the
/// `Mul` impl in this file already follows.
#[inline(always)]
pub fn abs(self) -> Self {
    Self(self.0.map(i32::wrapping_abs))
}
846+
847+
/// Load 16 × i16, sign-extend to 16 × i32.
848+
#[inline(always)]
849+
pub fn from_i16_slice(s: &[i16]) -> Self {
850+
assert!(s.len() >= 16);
851+
let mut o = [0i32; 16];
852+
for i in 0..16 { o[i] = s[i] as i32; }
853+
Self(o)
854+
}
855+
856+
/// Narrow 16 × i32 to 16 × i16 (truncation).
857+
#[inline(always)]
858+
pub fn to_i16_array(self) -> [i16; 16] {
859+
let mut o = [0i16; 16];
860+
for i in 0..16 { o[i] = self.0[i] as i16; }
861+
o
862+
}
863+
864+
/// Mask: bit i set where lane i >= 0.
865+
#[inline(always)]
866+
pub fn cmpge_zero_mask(self) -> u16 {
867+
let mut mask = 0u16;
868+
for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } }
869+
mask
870+
}
846871
}
847872
impl Mul for I32x16 {
    type Output = Self;

    /// Lane-wise multiplication; each product wraps on overflow.
    #[inline(always)]
    fn mul(self, r: Self) -> Self {
        let mut product = self.0;
        for (lane, &factor) in product.iter_mut().zip(r.0.iter()) {
            *lane = lane.wrapping_mul(factor);
        }
        Self(product)
    }
}
848873
impl MulAssign for I32x16 {
    /// In-place lane-wise multiplication, delegating to the `*` operator.
    #[inline(always)]
    fn mul_assign(&mut self, r: Self) {
        let lhs = *self;
        *self = lhs * r;
    }
}

src/simd_avx512.rs

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,40 @@ impl I32x16 {
758758
unsafe { _mm512_reduce_max_epi32(self.0) }
759759
}
760760

761+
// ── Base17 i16[17] operations: load-widen, abs, narrow ──────────────
762+
// Used by bgz17_bridge.rs for L1 distance, weighted L1, sign agreement, xor_bind.
763+
764+
/// Load 16 × i16 from slice, sign-extend to 16 × i32.
765+
/// This is the first step of every Base17 kernel: i16 → i32 to avoid overflow.
766+
#[inline(always)]
767+
pub fn from_i16_slice(s: &[i16]) -> Self {
768+
assert!(s.len() >= 16);
769+
Self(unsafe { _mm512_cvtepi16_epi32(_mm256_loadu_si256(s.as_ptr() as *const __m256i)) })
770+
}
771+
772+
/// Absolute value per lane.
773+
#[inline(always)]
774+
pub fn abs(self) -> Self {
775+
Self(unsafe { _mm512_abs_epi32(self.0) })
776+
}
777+
778+
/// Narrow 16 × i32 back to 16 × i16 (truncation, no saturation).
779+
#[inline(always)]
780+
pub fn to_i16_array(self) -> [i16; 16] {
781+
unsafe {
782+
let packed = _mm512_cvtepi32_epi16(self.0);
783+
let mut arr = [0i16; 16];
784+
_mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, packed);
785+
arr
786+
}
787+
}
788+
789+
/// Compare >= 0: returns 16-bit mask. Bit i set where lane i >= 0.
790+
#[inline(always)]
791+
pub fn cmpge_zero_mask(self) -> u16 {
792+
unsafe { _mm512_cmpge_epi32_mask(self.0, _mm512_setzero_si512()) }
793+
}
794+
761795
#[inline(always)]
762796
pub fn simd_min(self, other: Self) -> Self {
763797
Self(unsafe { _mm512_min_epi32(self.0, other.0) })
@@ -773,11 +807,6 @@ impl I32x16 {
773807
pub fn cast_f32(self) -> F32x16 {
774808
F32x16(unsafe { _mm512_cvtepi32_ps(self.0) })
775809
}
776-
777-
#[inline(always)]
778-
pub fn abs(self) -> Self {
779-
Self(unsafe { _mm512_abs_epi32(self.0) })
780-
}
781810
}
782811

783812
impl_bin_op!(I32x16, Add, add, _mm512_add_epi32);

0 commit comments

Comments
 (0)