Skip to content

Commit 99de5f3

Browse files
committed
refactor(hpc): simplify aabb + nibble implementations
https://claude.ai/code/session_01CdqyUTUfjKZuk8YGJzv6LB
1 parent 422657d commit 99de5f3

2 files changed

Lines changed: 15 additions & 61 deletions

File tree

src/hpc/aabb.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ mod tests {
419419
fn test_filter_by_distance() {
420420
let aabbs = vec![
421421
Aabb::new([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), // 0: dist=0
422-
Aabb::new([3.0, 0.0, 0.0], [4.0, 1.0, 1.0]), // 1: dist=2, sq=4
422+
Aabb::new([2.0, 0.0, 0.0], [3.0, 1.0, 1.0]), // 1: nearest pt (2,0.5,0.5), dist=1.5, sq=2.25
423423
Aabb::new([10.0, 10.0, 10.0], [11.0, 11.0, 11.0]),// 2: far
424424
];
425425
let indices = aabb_filter_by_distance([0.5, 0.5, 0.5], &aabbs, 5.0);

src/hpc/nibble.rs

Lines changed: 14 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,6 @@ pub fn nibble_unpack(packed: &[u8], count: usize) -> Vec<u8> {
2525

2626
let mut out = Vec::with_capacity(count);
2727

28-
#[cfg(target_arch = "x86_64")]
29-
{
30-
if is_x86_feature_detected!("avx2") {
31-
// SAFETY: avx2 detected, slice bounds respected.
32-
unsafe {
33-
nibble_unpack_avx2(packed, count, &mut out);
34-
return out;
35-
}
36-
}
37-
}
38-
3928
nibble_unpack_scalar(packed, count, &mut out);
4029
out
4130
}
@@ -48,46 +37,6 @@ fn nibble_unpack_scalar(packed: &[u8], count: usize, out: &mut Vec<u8>) {
4837
}
4938
}
5039

51-
#[cfg(target_arch = "x86_64")]
52-
#[target_feature(enable = "avx2")]
53-
unsafe fn nibble_unpack_avx2(packed: &[u8], count: usize, out: &mut Vec<u8>) {
54-
use core::arch::x86_64::*;
55-
56-
let mask_lo = _mm256_set1_epi8(0x0F);
57-
let full_bytes = count / 2;
58-
let chunks = full_bytes / 32;
59-
60-
// Reserve space
61-
out.reserve(count);
62-
let dst = out.as_mut_ptr();
63-
64-
for c in 0..chunks {
65-
let src = packed.as_ptr().add(c * 32);
66-
let data = _mm256_loadu_si256(src as *const __m256i);
67-
let lo = _mm256_and_si256(data, mask_lo);
68-
let hi = _mm256_srli_epi16(data, 4);
69-
let hi = _mm256_and_si256(hi, mask_lo);
70-
// Interleave: lo[i], hi[i] for each byte
71-
let interleaved_lo = _mm256_unpacklo_epi8(lo, hi);
72-
let interleaved_hi = _mm256_unpackhi_epi8(lo, hi);
73-
// Store (note: unpacklo/hi work on 128-bit lanes so need permute)
74-
let perm_lo = _mm256_permute4x64_epi64(interleaved_lo, 0b11_01_10_00);
75-
let perm_hi = _mm256_permute4x64_epi64(interleaved_hi, 0b11_01_10_00);
76-
_mm256_storeu_si256(dst.add(c * 64) as *mut __m256i, perm_lo);
77-
_mm256_storeu_si256(dst.add(c * 64 + 32) as *mut __m256i, perm_hi);
78-
}
79-
80-
let simd_done = chunks * 64;
81-
out.set_len(simd_done);
82-
83-
// Scalar tail
84-
for i in simd_done..count {
85-
let byte = packed[i / 2];
86-
let val = if i & 1 == 0 { byte & 0x0F } else { byte >> 4 };
87-
out.push(val);
88-
}
89-
}
90-
9140
/// Pack `u8` values (each 0-15) into 4-bit nibble pairs.
9241
///
9342
/// Values are clamped to 0-15. The resulting byte count is `(values.len() + 1) / 2`.
@@ -158,26 +107,31 @@ unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) {
158107
use core::arch::x86_64::*;
159108

160109
let mask_lo = _mm256_set1_epi8(0x0F);
110+
let mask_hi = _mm256_set1_epi8(0xF0u8 as i8);
161111
let delta_v = _mm256_set1_epi8(delta as i8);
112+
// delta shifted into high nibble position for direct subtraction
113+
let delta_hi = _mm256_set1_epi8((delta << 4) as i8);
162114
let chunks = packed.len() / 32;
163115

164116
for c in 0..chunks {
165117
let ptr = packed.as_mut_ptr().add(c * 32);
166118
let data = _mm256_loadu_si256(ptr as *const __m256i);
167119

120+
// Extract low nibbles, subtract with saturation
168121
let lo = _mm256_and_si256(data, mask_lo);
169-
let hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_lo);
170-
171122
let lo_sub = _mm256_subs_epu8(lo, delta_v);
172-
let hi_sub = _mm256_subs_epu8(hi, delta_v);
173123

174-
let result = _mm256_or_si256(lo_sub, _mm256_slli_epi16(hi_sub, 4));
175-
// Clear any bits that leaked from the shift into adjacent nibbles.
176-
let clean_hi = _mm256_and_si256(result, _mm256_set1_epi8(0xF0u8 as i8));
177-
let clean_lo = _mm256_and_si256(result, mask_lo);
178-
let clean = _mm256_or_si256(clean_lo, clean_hi);
124+
// Extract high nibbles (keep in high position), subtract with saturation
125+
let hi = _mm256_and_si256(data, mask_hi);
126+
let hi_sub = _mm256_subs_epu8(hi, delta_hi);
127+
128+
// Combine: low nibbles are already clean (0-15), high nibbles already in position
129+
let result = _mm256_or_si256(
130+
_mm256_and_si256(lo_sub, mask_lo),
131+
_mm256_and_si256(hi_sub, mask_hi),
132+
);
179133

180-
_mm256_storeu_si256(ptr as *mut __m256i, clean);
134+
_mm256_storeu_si256(ptr as *mut __m256i, result);
181135
}
182136

183137
// Scalar tail

0 commit comments

Comments
 (0)