@@ -118,7 +118,7 @@ pub use crate::simd_avx512::{
118118 // 256-bit (AVX2 baseline, __m256/__m256d)
119119 F32x8 , F64x4 , f32x8, f64x4,
120120 // 512-bit (native AVX-512, __m512/__m512d/__m512i)
121- F32x16 , F64x8 , U8x64 , I32x16 , I64x8 , U32x16 , U64x8 ,
121+ F32x16 , F64x8 , U8x64 , I32x16 , I64x8 , U16x32 , U32x16 , U64x8 ,
122122 F32Mask16 , F64Mask8 ,
123123 f32x16, f64x8, u8x64, i32x16, i64x8, u32x16, u64x8,
124124} ;
@@ -152,7 +152,7 @@ pub use crate::simd_avx512::{F32x8, F64x4, f32x8, f64x4};
152152
153153#[ cfg( all( target_arch = "x86_64" , not( target_feature = "avx512f" ) ) ) ]
154154pub use crate :: simd_avx2:: {
155- F32x16 , F64x8 , U8x64 , I32x16 , I64x8 , U32x16 , U64x8 ,
155+ F32x16 , F64x8 , U8x64 , I32x16 , I64x8 , U16x32 , U32x16 , U64x8 ,
156156 F32Mask16 , F64Mask8 ,
157157 f32x16, f64x8, u8x64, i32x16, i64x8, u32x16, u64x8,
158158} ;
@@ -551,9 +551,41 @@ mod scalar {
551551 impl_int_type ! ( U8x64 , u8 , 64 , 0u8 ) ;
552552 impl_int_type ! ( I32x16 , i32 , 16 , 0i32 ) ;
553553 impl_int_type ! ( I64x8 , i64 , 8 , 0i64 ) ;
554+ impl_int_type ! ( U16x32 , u16 , 32 , 0u16 ) ;
554555 impl_int_type ! ( U32x16 , u32 , 16 , 0u32 ) ;
555556 impl_int_type ! ( U64x8 , u64 , 8 , 0u64 ) ;
556557
558+ // Extra methods for U16x32 (widen/narrow, shift, multiply)
559+ impl U16x32 {
560+ #[ inline( always) ]
561+ pub fn from_u8x64_lo ( v : U8x64 ) -> Self {
562+ let mut out = [ 0u16 ; 32 ] ; for i in 0 ..32 { out[ i] = v. 0 [ i] as u16 ; } Self ( out)
563+ }
564+ #[ inline( always) ]
565+ pub fn from_u8x64_hi ( v : U8x64 ) -> Self {
566+ let mut out = [ 0u16 ; 32 ] ; for i in 0 ..32 { out[ i] = v. 0 [ 32 + i] as u16 ; } Self ( out)
567+ }
568+ #[ inline( always) ]
569+ pub fn pack_saturate_u8 ( self , other : Self ) -> U8x64 {
570+ let mut out = [ 0u8 ; 64 ] ;
571+ for i in 0 ..32 { out[ i] = self . 0 [ i] . min ( 255 ) as u8 ; }
572+ for i in 0 ..32 { out[ 32 + i] = other. 0 [ i] . min ( 255 ) as u8 ; }
573+ U8x64 ( out)
574+ }
575+ #[ inline( always) ]
576+ pub fn shr ( self , imm : u32 ) -> Self {
577+ let mut out = [ 0u16 ; 32 ] ; for i in 0 ..32 { out[ i] = if imm < 16 { self . 0 [ i] >> imm } else { 0 } ; } Self ( out)
578+ }
579+ #[ inline( always) ]
580+ pub fn shl ( self , imm : u32 ) -> Self {
581+ let mut out = [ 0u16 ; 32 ] ; for i in 0 ..32 { out[ i] = if imm < 16 { self . 0 [ i] << imm } else { 0 } ; } Self ( out)
582+ }
583+ #[ inline( always) ]
584+ pub fn mullo ( self , other : Self ) -> Self {
585+ let mut out = [ 0u16 ; 32 ] ; for i in 0 ..32 { out[ i] = self . 0 [ i] . wrapping_mul ( other. 0 [ i] ) ; } Self ( out)
586+ }
587+ }
588+
557589 // Extra methods for I32x16 that float types have via the macro
558590 impl I32x16 {
559591 #[ inline( always) ]
@@ -842,6 +874,10 @@ mod scalar {
842874 let mut out = [ 0u8 ; 64 ] ; for i in 0 ..64 { out[ i] = self . 0 [ ( idx. 0 [ i] & 63 ) as usize ] ; } Self ( out)
843875 }
844876 #[ inline( always) ]
877+ pub fn movemask ( self ) -> u64 {
878+ let mut m: u64 = 0 ; for i in 0 ..64 { if self . 0 [ i] & 0x80 != 0 { m |= 1 << i; } } m
879+ }
880+ #[ inline( always) ]
845881 pub fn unpack_lo_epi8 ( self , other : Self ) -> Self {
846882 let mut out = [ 0u8 ; 64 ] ;
847883 for lane in 0 ..4 { let b = lane * 16 ; for i in 0 ..8 { out[ b+i* 2 ] = self . 0 [ b+i] ; out[ b+i* 2 +1 ] = other. 0 [ b+i] ; } }
@@ -905,7 +941,7 @@ mod scalar {
905941
906942#[ cfg( not( target_arch = "x86_64" ) ) ]
907943pub use scalar:: {
908- F32x16 , F64x8 , U8x64 , I32x16 , I64x8 , U32x16 , U64x8 ,
944+ F32x16 , F64x8 , U8x64 , I32x16 , I64x8 , U16x32 , U32x16 , U64x8 ,
909945 F32x8 , F64x4 ,
910946 F32Mask16 , F64Mask8 ,
911947 f32x16, f64x8, u8x64, i32x16, i64x8, u32x16, u64x8,
0 commit comments