Skip to content

Commit f373c75

Browse files
authored
Merge pull request #204 from AdaWorldAPI/claude/splat3d-cpu-simd-renderer-MAOO0
perf(simd): vectorize I8x16::saturating_abs (VPABSB) + binding W1a tests
2 parents 9ef918c + 73b6dd2 commit f373c75

1 file changed

Lines changed: 74 additions & 4 deletions

File tree

src/simd_avx512.rs

Lines changed: 74 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2406,11 +2406,22 @@ impl I8x16 {
24062406
/// ```
24072407
#[inline(always)]
24082408
pub fn saturating_abs(self) -> Self {
2409-
let mut o = [0i8; 16];
2410-
for i in 0..16 {
2411-
o[i] = self.0[i].saturating_abs();
2409+
// SAFETY: `_mm_abs_epi8` (SSSE3) and `_mm_min_epu8` (SSE2) are available
2410+
// on every x86_64 build this file compiles for — the workspace pins
2411+
// `x86-64-v3`, which includes SSSE3. The unaligned load/store match the
2412+
// `[i8; 16]` storage. VPABSB returns 0x80 for `i8::MIN` (the bit pattern
2413+
// of +128, which does not fit in i8); VPMINUB then clamps 0x80 (= 128
2414+
// unsigned) down to 0x7f (= 127 = `i8::MAX`), producing the saturating
2415+
// result bare VPABSB cannot — per the consumer contract's VPABSB
2416+
// correction. All 16 lanes are saturated branchlessly.
2417+
use core::arch::x86_64::*;
2418+
unsafe {
2419+
let v = _mm_loadu_si128(self.0.as_ptr() as *const __m128i);
2420+
let clamped = _mm_min_epu8(_mm_abs_epi8(v), _mm_set1_epi8(0x7f_u8 as i8));
2421+
let mut o = [0i8; 16];
2422+
_mm_storeu_si128(o.as_mut_ptr() as *mut __m128i, clamped);
2423+
Self(o)
24122424
}
2413-
Self(o)
24142425
}
24152426
}
24162427

@@ -4494,4 +4505,63 @@ mod int_simd_tests {
44944505
assert_eq!(I16x32::LANES, 32);
44954506
assert_eq!(I16x16::LANES, 16);
44964507
}
4508+
4509+
// ── W1a primitive tests (binding per the consumer contract) ──────────────
4510+
4511+
/// Binding: `saturating_abs(i8::MIN) == i8::MAX` for every lane (the VPABSB
4512+
/// correction — bare VPABSB would return i8::MIN).
4513+
#[test]
4514+
fn w1a_saturating_abs_i8x16_min_saturates_to_max() {
4515+
let r = I8x16::splat(i8::MIN).saturating_abs().to_array();
4516+
assert!(r.iter().all(|&x| x == i8::MAX), "got {r:?}");
4517+
}
4518+
4519+
#[test]
4520+
fn w1a_saturating_abs_i8x16_matches_scalar_reference() {
4521+
let corpus: [i8; 16] = [i8::MIN, -128, -127, -1, 0, 1, 7, 8, 64, 126, i8::MAX, -64, -2, 2, 100, -100];
4522+
let got = I8x16::from_array(corpus).saturating_abs().to_array();
4523+
let mut want = [0i8; 16];
4524+
for i in 0..16 {
4525+
want[i] = corpus[i].saturating_abs();
4526+
}
4527+
assert_eq!(got, want);
4528+
}
4529+
4530+
#[test]
4531+
fn w1a_saturating_abs_i8x32_min_saturates_to_max() {
4532+
let r = I8x32::splat(i8::MIN).saturating_abs().to_array();
4533+
assert!(r.iter().all(|&x| x == i8::MAX), "got {r:?}");
4534+
}
4535+
4536+
#[test]
4537+
fn w1a_from_i4_packed_u64_sign_extends() {
4538+
// 0x0 → 0, 0xf → -1, 0x8 → -8, 0x7 → 7
4539+
assert_eq!(I8x16::from_i4_packed_u64(0).lane_i8::<0>(), 0);
4540+
assert_eq!(I8x16::from_i4_packed_u64(u64::MAX).lane_i8::<0>(), -1);
4541+
assert_eq!(I8x16::from_i4_packed_u64(0x8888_8888_8888_8888).lane_i8::<3>(), -8);
4542+
assert_eq!(I8x16::from_i4_packed_u64(0x7777_7777_7777_7777).lane_i8::<5>(), 7);
4543+
// Mixed: low nibble 0x3 → 3, next nibble 0xC → -4.
4544+
let mixed = I8x16::from_i4_packed_u64(0xC3);
4545+
assert_eq!(mixed.lane_i8::<0>(), 3);
4546+
assert_eq!(mixed.lane_i8::<1>(), -4);
4547+
}
4548+
4549+
#[test]
4550+
fn w1a_u64x8_popcnt_and_xor_popcount() {
4551+
let ones = U64x8::splat(u64::MAX);
4552+
assert!(ones.popcnt().to_array().iter().all(|&x| x == 64));
4553+
assert!(U64x8::splat(0).popcnt().to_array().iter().all(|&x| x == 0));
4554+
// Hamming: all-bits-different → 64 × 8 = 512; same → 0.
4555+
assert_eq!(U64x8::splat(u64::MAX).xor_popcount(U64x8::splat(0)), 512);
4556+
let v = U64x8::splat(0xdead_beef_cafe_babe);
4557+
assert_eq!(v.xor_popcount(v), 0);
4558+
}
4559+
4560+
#[test]
4561+
fn w1a_gather_u16_in_bounds() {
4562+
let table = [10u16, 20, 30, 40, 50, 60, 70, 80];
4563+
let idx = U16x8::from_array([0, 2, 4, 6, 1, 3, 5, 7]);
4564+
let got = U16x8::gather_u16(idx, &table).to_array();
4565+
assert_eq!(got, [10, 30, 50, 70, 20, 40, 60, 80]);
4566+
}
44974567
}

0 commit comments

Comments
 (0)