@@ -25,17 +25,6 @@ pub fn nibble_unpack(packed: &[u8], count: usize) -> Vec<u8> {
2525
2626 let mut out = Vec :: with_capacity ( count) ;
2727
28- #[ cfg( target_arch = "x86_64" ) ]
29- {
30- if is_x86_feature_detected ! ( "avx2" ) {
31- // SAFETY: avx2 detected, slice bounds respected.
32- unsafe {
33- nibble_unpack_avx2 ( packed, count, & mut out) ;
34- return out;
35- }
36- }
37- }
38-
3928 nibble_unpack_scalar ( packed, count, & mut out) ;
4029 out
4130}
@@ -48,46 +37,6 @@ fn nibble_unpack_scalar(packed: &[u8], count: usize, out: &mut Vec<u8>) {
4837 }
4938}
5039
/// Unpacks `count` 4-bit values from `packed` and appends them to `out`,
/// processing 32 packed bytes (64 output values) per AVX2 iteration.
///
/// Output value `i` is the low nibble of `packed[i / 2]` when `i` is even and
/// the high nibble when `i` is odd. Values that do not fill a whole 32-byte
/// chunk are handled by a scalar tail loop.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2 (for example via
/// `is_x86_feature_detected!("avx2")`).
///
/// # Panics
///
/// Panics in the scalar tail if `packed` is too short to supply `count`
/// nibbles.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn nibble_unpack_avx2(packed: &[u8], count: usize, out: &mut Vec<u8>) {
    use std::arch::x86_64::*;

    let mask_lo = _mm256_set1_epi8(0x0F);

    // Only whole 32-byte chunks that are both requested by `count` and
    // actually present in `packed` may be loaded with SIMD; bounding by
    // `packed.len()` keeps the unaligned loads inside the slice.
    let chunks = (count / 2).min(packed.len()) / 32;
    let simd_done = chunks * 64;

    // `reserve(count)` guarantees room for `count` bytes past the current
    // length, so writing `simd_done <= count` bytes after `base` is in bounds.
    let base = out.len();
    out.reserve(count);
    let dst = out.as_mut_ptr().add(base);

    for c in 0..chunks {
        let src = packed.as_ptr().add(c * 32);
        let data = _mm256_loadu_si256(src as *const __m256i);

        let lo = _mm256_and_si256(data, mask_lo);
        // The 16-bit shift drags bits across byte boundaries; the mask
        // removes them again, leaving each byte's high nibble in 0..=15.
        let hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_lo);

        // unpacklo/unpackhi interleave within each 128-bit lane:
        //   interleaved_lo = [bytes 0..8   | bytes 16..24] (as lo/hi pairs)
        //   interleaved_hi = [bytes 8..16  | bytes 24..32]
        let interleaved_lo = _mm256_unpacklo_epi8(lo, hi);
        let interleaved_hi = _mm256_unpackhi_epi8(lo, hi);

        // Stitch the lanes back into sequential order: the first store needs
        // both low lanes (bytes 0..16), the second both high lanes (16..32).
        let first = _mm256_permute2x128_si256(interleaved_lo, interleaved_hi, 0x20);
        let second = _mm256_permute2x128_si256(interleaved_lo, interleaved_hi, 0x31);

        _mm256_storeu_si256(dst.add(c * 64) as *mut __m256i, first);
        _mm256_storeu_si256(dst.add(c * 64 + 32) as *mut __m256i, second);
    }

    // Every byte in `base..base + simd_done` was initialised by the loop above.
    out.set_len(base + simd_done);

    // Scalar tail: remaining values that do not fill a whole SIMD chunk.
    for i in simd_done..count {
        let byte = packed[i / 2];
        let val = if i & 1 == 0 { byte & 0x0F } else { byte >> 4 };
        out.push(val);
    }
}
90-
9140/// Pack `u8` values (each 0-15) into 4-bit nibble pairs.
9241///
9342/// Values are clamped to 0-15. The resulting byte count is `(values.len() + 1) / 2`.
@@ -158,26 +107,31 @@ unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) {
158107 use core:: arch:: x86_64:: * ;
159108
160109 let mask_lo = _mm256_set1_epi8 ( 0x0F ) ;
110+ let mask_hi = _mm256_set1_epi8 ( 0xF0u8 as i8 ) ;
161111 let delta_v = _mm256_set1_epi8 ( delta as i8 ) ;
112+ // delta shifted into high nibble position for direct subtraction
113+ let delta_hi = _mm256_set1_epi8 ( ( delta << 4 ) as i8 ) ;
162114 let chunks = packed. len ( ) / 32 ;
163115
164116 for c in 0 ..chunks {
165117 let ptr = packed. as_mut_ptr ( ) . add ( c * 32 ) ;
166118 let data = _mm256_loadu_si256 ( ptr as * const __m256i ) ;
167119
120+ // Extract low nibbles, subtract with saturation
168121 let lo = _mm256_and_si256 ( data, mask_lo) ;
169- let hi = _mm256_and_si256 ( _mm256_srli_epi16 ( data, 4 ) , mask_lo) ;
170-
171122 let lo_sub = _mm256_subs_epu8 ( lo, delta_v) ;
172- let hi_sub = _mm256_subs_epu8 ( hi, delta_v) ;
173123
174- let result = _mm256_or_si256 ( lo_sub, _mm256_slli_epi16 ( hi_sub, 4 ) ) ;
175- // Clear any bits that leaked from the shift into adjacent nibbles.
176- let clean_hi = _mm256_and_si256 ( result, _mm256_set1_epi8 ( 0xF0u8 as i8 ) ) ;
177- let clean_lo = _mm256_and_si256 ( result, mask_lo) ;
178- let clean = _mm256_or_si256 ( clean_lo, clean_hi) ;
124+ // Extract high nibbles (keep in high position), subtract with saturation
125+ let hi = _mm256_and_si256 ( data, mask_hi) ;
126+ let hi_sub = _mm256_subs_epu8 ( hi, delta_hi) ;
127+
128+ // Combine: low nibbles are already clean (0-15), high nibbles already in position
129+ let result = _mm256_or_si256 (
130+ _mm256_and_si256 ( lo_sub, mask_lo) ,
131+ _mm256_and_si256 ( hi_sub, mask_hi) ,
132+ ) ;
179133
180- _mm256_storeu_si256 ( ptr as * mut __m256i , clean ) ;
134+ _mm256_storeu_si256 ( ptr as * mut __m256i , result ) ;
181135 }
182136
183137 // Scalar tail
0 commit comments