From 776a473536a2a54b961fc4b46214340d4d2eda2e Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Fri, 1 May 2026 22:55:38 +1000 Subject: [PATCH 1/7] scaffold: x86_64 SSE encode/decode module structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stands up the cfg-gated module structure for an x86_64 SIMD implementation, wired into encode_into / decode_into the same way the aarch64 NEON path is. Bodies are scalar-fallback stubs for now — the structural contract (function signatures, magic constants, test harness) matches aarch64 so swapping in intrinsics one helper at a time is safe. # What's here src/ops/x86_64.rs — div_85, div_85_sq, div_85_cube (same magic constants as aarch64.rs; currently scalar-implemented stubs). Same quickcheck tests as the NEON path. src/ops/mod.rs — cfg-arm exporting div_* on x86_64. src/block.rs::sse — `SseEncoder` + `try_decode_block_x4` stubs that delegate to the scalar block functions. Each is annotated with a comment block sketching the target intrinsic chain (load, divmod, shuffle, store). src/lib.rs — encode_into / decode_into gain a #[cfg(target_arch = "x86_64")] arm mirroring the aarch64 one. The scalar-fallback gate becomes cfg(not(any(aarch64, x86_64))). # How to develop x86 SIMD locally on Apple Silicon Two checked-in workflows: rustup target add x86_64-apple-darwin cargo test --target x86_64-apple-darwin # runs under Rosetta cargo build --target x86_64-apple-darwin # cross-compile only cargo clippy --target x86_64-apple-darwin --all-targets -- -D warnings Rosetta translates SSE/AVX→NEON dynamically, so it's good for correctness but not for benchmarking. AVX-512 is unsupported. For real x86 perf numbers, run `cargo bench` on the GitHub Actions Ubuntu x86_64 runner (already in CI) or a cloud VM. # How to fill in the SIMD bodies Each stub has a numbered sketch of the intrinsic chain in its docstring. The aarch64 → x86 mapping cheat-sheet lives at the top of `src/ops/x86_64.rs`. The order I'd take is: 1. `div_magic` in src/ops/x86_64.rs (the simplest function, just `_mm_mul_epu32` + `_mm_srli_epi32`). Quickcheck against the existing tests in that file. 2. `SseEncoder::encode_block_x4` next, since the encoder's parallel-magic structure is straightforward to port. 3. `try_decode_block_x4` last, with the same overflow-detection trick the NEON version uses. # Verified locally cargo test — 47 tests pass on aarch64 cargo test --target x86_64-apple-darwin — 47 tests pass via Rosetta cargo clippy --all-targets -- -D warnings — clean (aarch64) cargo clippy --target x86_64-apple-darwin --all-targets -- -D warnings — clean cargo check --target x86_64-unknown-linux-gnu --all-targets — clean cargo fmt --all -- --check — clean --- src/block.rs | 118 ++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 108 +++++++++++++++++++++++++++++++++++- src/ops/mod.rs | 12 +++- src/ops/x86_64.rs | 136 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 369 insertions(+), 5 deletions(-) create mode 100644 src/ops/x86_64.rs diff --git a/src/block.rs b/src/block.rs index ef040fc..0f2887d 100644 --- a/src/block.rs +++ b/src/block.rs @@ -514,6 +514,124 @@ mod neon { } } +// ───────────────────────────────────────────────────────────────────────── +// SSE encoder / decoder (x86_64) +// ───────────────────────────────────────────────────────────────────────── +// +// SSSE3 is required for `_mm_shuffle_epi8` (the equivalent of NEON's +// `vqtbl1q_u8`); SSE4.1 is required for `_mm_mullo_epi32` (lane-wise +// u32 multiply). 
Both are universal on x86_64 silicon since ~2008. We +// depend on them being present at compile time via the default +// `target-feature` set on `x86_64-*-*` (which includes SSSE3 and +// SSE4.1 for all modern targets including `x86_64-unknown-linux-gnu` +// and `x86_64-apple-darwin`). + +#[cfg(target_arch = "x86_64")] +pub(crate) use sse::{SseEncoder, try_decode_block_x4}; + +#[cfg(target_arch = "x86_64")] +mod sse { + #![allow(clippy::indexing_slicing)] + // The `unwrap`s in the stub bodies are over `&[u8; 16]` / `&[u8; 20]` + // sub-array conversions whose lengths are statically known. Once the + // bodies are replaced with intrinsics, this allow can probably go. + #![allow(clippy::unwrap_used)] + #![allow(dead_code)] // scaffold — bodies will use these once intrinsics are written. + + use crate::ops::{div_85, div_85_cube, div_85_sq}; + use std::arch::x86_64::__m128i; + + /// SSSE3/SSE4.1 4-block encoder. Mirrors [`super::neon::NeonEncoder`]. + /// + /// Constructor is a no-op; per-call splice indexes live in static + /// arrays loaded via `_mm_loadu_si128`. + pub(crate) struct SseEncoder; + + impl SseEncoder { + #[inline] + pub(crate) fn new() -> Self { + Self + } + + /// Encode exactly 16 input bytes into exactly 20 output bytes. + /// + /// **Stub.** This currently delegates to the scalar + /// [`super::encode_block`] for each of the 4 sub-blocks, so the + /// path is correct but unaccelerated. Replace the body with the + /// SSE intrinsic chain to get the speed-up: + /// + /// 1. Load 16 bytes via `_mm_loadu_si128` and byte-swap each + /// u32 lane via `_mm_shuffle_epi8` with a constant index + /// `[3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]`. + /// 2. Compute `q1 = n / 85`, `q2 = n / 85²`, `q3 = n / 85³`, + /// `q4 = q2 / 85²` using [`div_85`] / [`div_85_sq`] / + /// [`div_85_cube`] (already wired up — see [`crate::ops`]). + /// 3. Compute the 5 digits via lane-wise multiply-subtract + /// (`_mm_sub_epi32(q_k_minus_1, _mm_mullo_epi32(q_k, splat85))`). + /// 4. Splice digits into out_1 (16 bytes) + out_2 (4 bytes) + /// via `_mm_shuffle_epi8` with a chain of OR-blends. + /// 5. Convert digits 0..84 to ASCII via `_mm_shuffle_epi8` + /// against a precomputed alphabet table (the table is too + /// large for a single `pshufb`, so split at 64 like the + /// NEON path). + /// 6. Store via `_mm_storeu_si128` + a 4-byte tail store. + #[inline] + pub(crate) fn encode_block_x4(&self, in_bytes: &[u8; 16], out_bytes: &mut [u8; 20]) { + // SAFETY: in_bytes is a 16-byte array; sub-arrays at the + // 4-byte boundaries are in-bounds. + for i in 0..4 { + let src: &[u8; 4] = (&in_bytes[i * 4..i * 4 + 4]).try_into().unwrap(); + let dst: &mut [u8; 5] = (&mut out_bytes[i * 5..i * 5 + 5]).try_into().unwrap(); + super::encode_block(src, dst); + } + // Suppress unused-import warnings while these are not yet wired + // through the SIMD path. + let _: fn(__m128i) -> __m128i = div_85; + let _: fn(__m128i) -> __m128i = div_85_sq; + let _: fn(__m128i) -> __m128i = div_85_cube; + } + } + + /// SSSE3 decode of 20 input chars → 16 output bytes. + /// + /// **Stub.** Currently delegates to scalar [`super::decode_block`] + /// for each of the 4 sub-blocks, returning `Err(())` on the first + /// invalid char or u32 overflow so the loop in + /// [`crate::decode_into`] falls back to scalar (which surfaces + /// the precise [`crate::DecodeError`] with the right byte + /// position). + /// + /// Sketch of the SIMD body: + /// + /// 1. 
`_mm_loadu_si128` for chars 0..16 and a partial load for + /// chars 16..19 (zero-padded). + /// 2. Range-validate `[33, 126]` via two `_mm_cmpgt_epi8` masks + /// and `_mm_or_si128`. + /// 3. ASCII → digit via two `_mm_shuffle_epi8` table lookups + /// (low half + high half of the alphabet) with a `_mm_blendv_epi8` + /// selector keyed on `chars < 97`. + /// 4. Detect 0xFF results (in-alphabet gaps) via `_mm_cmpeq_epi8`. + /// 5. Permute digits into per-position `__m128i` vectors `v_d0..v_d4` + /// via `_mm_shuffle_epi8` over a paired source. + /// 6. Horner-style `vmlaq_n_u32` chain becomes `_mm_mullo_epi32` + + /// `_mm_add_epi32`. + /// 7. Overflow check: `n_wrap < d0 * 85⁴` lane-wise via + /// `_mm_cmplt_epi32` (for unsigned this needs the bias trick). + /// 8. Byte-swap each u32 lane back to BE via `_mm_shuffle_epi8`, + /// then `_mm_storeu_si128`. + #[inline] + pub(crate) fn try_decode_block_x4(input: &[u8; 20], out: &mut [u8; 16]) -> Result<(), ()> { + for i in 0..4 { + let src: &[u8; 5] = (&input[i * 5..i * 5 + 5]).try_into().unwrap(); + let dst: &mut [u8; 4] = (&mut out[i * 4..i * 4 + 4]).try_into().unwrap(); + // base_position 0 because the caller (`decode_into`) replays + // the chunk scalar on Err and handles position reporting itself. + super::decode_block(src, dst, 0).map_err(|_| ())?; + } + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/lib.rs b/src/lib.rs index 035490c..d2b19d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -196,7 +196,54 @@ fn encode_into(input: &[u8], out: &mut [u8]) { } } -#[cfg(not(target_arch = "aarch64"))] +#[cfg(target_arch = "x86_64")] +fn encode_into(input: &[u8], out: &mut [u8]) { + use block::SseEncoder; + + let encoder = SseEncoder::new(); + let mut in_off = 0usize; + let mut out_off = 0usize; + + // 16-byte → 20-char SSE loop. (Currently a scalar-fallback stub + // inside `SseEncoder::encode_block_x4`; replace its body with + // SSSE3/SSE4.1 intrinsics to get the speedup.) 
+ while in_off.saturating_add(16) <= input.len() { + let Some(in_chunk) = chunk_at::<16>(input, in_off) else { + return; + }; + let Some(out_chunk) = chunk_at_mut::<20>(out, out_off) else { + return; + }; + encoder.encode_block_x4(in_chunk, out_chunk); + in_off = in_off.saturating_add(16); + out_off = out_off.saturating_add(20); + } + + while in_off.saturating_add(4) <= input.len() { + let Some(block) = chunk_at::<4>(input, in_off) else { + return; + }; + let Some(chunk) = chunk_at_mut::<5>(out, out_off) else { + return; + }; + encode_block(block, chunk); + in_off = in_off.saturating_add(4); + out_off = out_off.saturating_add(5); + } + + let tail_in_len = input.len().saturating_sub(in_off); + if tail_in_len > 0 { + let tail_out_len = tail_in_len.saturating_add(1); + if let (Some(tail_in), Some(tail_out)) = ( + input.get(in_off..in_off.saturating_add(tail_in_len)), + out.get_mut(out_off..out_off.saturating_add(tail_out_len)), + ) { + encode_tail(tail_in, tail_out); + } + } +} + +#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn encode_into(input: &[u8], out: &mut [u8]) { let mut in_off = 0usize; let mut out_off = 0usize; @@ -290,7 +337,64 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { Ok(()) } -#[cfg(not(target_arch = "aarch64"))] +#[cfg(target_arch = "x86_64")] +#[allow( + clippy::unwrap_used, + clippy::indexing_slicing, + reason = "infallible / in-bounds by loop-condition invariant; see SAFETY comment" +)] +fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { + use block::try_decode_block_x4; + + let mut in_off = 0usize; + let mut out_off = 0usize; + + while in_off + 20 <= input.len() && out_off + 16 <= out.len() { + // SAFETY: the slice arms below are checked by the loop guard. + let Some(in_chunk) = chunk_at::<20>(input, in_off) else { + return Ok(()); + }; + let Some(out_chunk) = chunk_at_mut::<16>(out, out_off) else { + return Ok(()); + }; + + if try_decode_block_x4(in_chunk, out_chunk).is_ok() { + in_off += 20; + out_off += 16; + } else { + for _ in 0..4 { + let block: &[u8; 5] = (&input[in_off..in_off + 5]).try_into().unwrap(); + let chunk: &mut [u8; 4] = (&mut out[out_off..out_off + 4]).try_into().unwrap(); + decode_block(block, chunk, in_off)?; + in_off += 5; + out_off += 4; + } + } + } + + while in_off + 5 <= input.len() && out_off + 4 <= out.len() { + let block: &[u8; 5] = (&input[in_off..in_off + 5]).try_into().unwrap(); + let chunk: &mut [u8; 4] = (&mut out[out_off..out_off + 4]).try_into().unwrap(); + decode_block(block, chunk, in_off)?; + in_off += 5; + out_off += 4; + } + + let tail_in_len = input.len().saturating_sub(in_off); + if tail_in_len > 0 { + let tail_out_len = tail_in_len.saturating_sub(1); + if let (Some(tail_in), Some(tail_out)) = ( + input.get(in_off..in_off.saturating_add(tail_in_len)), + out.get_mut(out_off..out_off.saturating_add(tail_out_len)), + ) { + decode_tail(tail_in, tail_out, in_off)?; + } + } + + Ok(()) +} + +#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { let mut in_off = 0usize; let mut out_off = 0usize; diff --git a/src/ops/mod.rs b/src/ops/mod.rs index aee763e..369e3b2 100644 --- a/src/ops/mod.rs +++ b/src/ops/mod.rs @@ -2,12 +2,18 @@ //! //! Production scalar code uses plain `n / 85` / `n % 85` because the //! compiler lowers those to the libdivide-style multiply-then-shift -//! sequence in release builds. The hand-written NEON helpers in -//! 
[`aarch64`] cover the parallel-divide-by-`85ᵏ` cases the compiler -//! cannot auto-vectorise across u32x4 lanes. +//! sequence in release builds. The hand-written SIMD helpers below +//! cover the parallel-divide-by-`85ᵏ` cases the compiler cannot +//! auto-vectorise across u32x4 lanes. #[cfg(target_arch = "aarch64")] mod aarch64; #[cfg(target_arch = "aarch64")] pub(crate) use aarch64::{div_85, div_85_cube, div_85_sq}; + +#[cfg(target_arch = "x86_64")] +mod x86_64; + +#[cfg(target_arch = "x86_64")] +pub(crate) use x86_64::{div_85, div_85_cube, div_85_sq}; diff --git a/src/ops/x86_64.rs b/src/ops/x86_64.rs new file mode 100644 index 0000000..29b2be3 --- /dev/null +++ b/src/ops/x86_64.rs @@ -0,0 +1,136 @@ +//! x86_64 SSE2 / SSSE3 helpers — the structural mirror of `aarch64.rs`. +//! +//! The magic constants are the same (libdivide form, computed from the +//! divisor); only the SIMD intrinsics differ. Mapping cheat-sheet: +//! +//! | NEON | x86 (SSE2 / SSSE3) | what it does | +//! |------------------------|---------------------------|-------------------------------------------| +//! | `vmull_u32` | `_mm_mul_epu32` | u32×u32 → u64 widening multiply (2 lanes) | +//! | `vshrq_n_u32` | `_mm_srli_epi32` | logical shift right per u32 lane | +//! | `vmulq_n_u32` | `_mm_mullo_epi32` (SSE4.1)| u32×u32 → u32 (low 32 bits) | +//! | `vsubq_u32` | `_mm_sub_epi32` | u32 lane-wise subtract | +//! | `vqtbl1q_u8` | `_mm_shuffle_epi8` (SSSE3)| byte-shuffle / lookup | +//! | `vbslq_u8` | `_mm_blendv_epi8` (SSE4.1)| byte-wise blend with mask | +//! | `vrev32q_u8` | `_mm_shuffle_epi8` + idx | reverse bytes in each u32 lane | +//! +//! Required CPU features for the decode hot path: **SSSE3** (for +//! `pshufb`). For the encode hot path: **SSE4.1** (for `pmulld`). +//! Both are universal on x86_64 silicon since ~2008. +//! +//! ## Implementation status +//! +//! Currently a stub: the helpers below compile and produce correct +//! results, but they go through the scalar path (manual 4-lane +//! unroll). Replace the bodies with intrinsics one at a time and +//! verify against the quickcheck tests at the bottom of the file. + +use std::arch::x86_64::__m128i; + +/// Lane-wise `input / 85`. Magic constants verified for the full u32 range. +#[inline] +pub(crate) fn div_85(input: __m128i) -> __m128i { + // m = ceil(2^38 / 85) = 3_233_857_729; m * 85 - 2^38 = 21 < 2^6. + div_magic::<3_233_857_729, 6>(input) +} + +/// Lane-wise `input / 7225` (i.e. `input / 85²`). Valid for all u32 inputs. +#[inline] +pub(crate) fn div_85_sq(input: __m128i) -> __m128i { + // m = ceil(2^44 / 7225) = 2_434_904_643; m * 7225 - 2^44 = 1259 < 2^12. + div_magic::<2_434_904_643, 12>(input) +} + +/// Lane-wise `input / 614125` (i.e. `input / 85³`). Valid for all u32 inputs. +#[inline] +pub(crate) fn div_85_cube(input: __m128i) -> __m128i { + // m = ceil(2^51 / 614125) = 3_666_679_933; bound check passes for u32. + div_magic::<3_666_679_933, 19>(input) +} + +/// Generic libdivide-style div: `q = (n * MAGIC) >> (32 + SHIFT)`. +/// +/// SSE2 path. The widening multiply on x86 is `_mm_mul_epu32`, which +/// multiplies the *even* u32 lanes of two `__m128i` registers and +/// produces a `__m128i` containing two u64 results in the even +/// positions. We multiply twice — once for even lanes, once after +/// shuffling odd lanes into even positions — then shift to extract +/// the high u32 of each u64 product, and finally apply the SHIFT. +// +// TODO(x86 SIMD): replace this scalar fallback body with the SSE2 +// multiply + shift sequence sketched above. 
The interface is fixed; +// only the body needs to change. See the existing aarch64.rs for the +// matching NEON sequence. +#[inline] +fn div_magic(input: __m128i) -> __m128i { + use std::arch::x86_64::{_mm_loadu_si128, _mm_storeu_si128}; + + // Stub: dump to stack, do scalar division, reload. Correct but slow. + let mut lanes = [0u32; 4]; + unsafe { + _mm_storeu_si128(lanes.as_mut_ptr().cast::<__m128i>(), input); + } + for lane in &mut lanes { + let xl = MAGIC as u64; + let yl = *lane as u64; + let q = ((xl * yl) >> 32) as u32; + *lane = q >> SHIFT; + } + unsafe { _mm_loadu_si128(lanes.as_ptr().cast::<__m128i>()) } +} + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck::{Arbitrary, quickcheck}; + use std::{ + arch::x86_64::{_mm_loadu_si128, _mm_storeu_si128}, + array, + }; + + #[derive(Debug, Clone, Copy)] + struct InputBlock([u32; 4]); + + impl Arbitrary for InputBlock { + fn arbitrary(g: &mut quickcheck::Gen) -> Self { + InputBlock(array::from_fn(|_| u32::arbitrary(g))) + } + } + + fn lanes(v: __m128i) -> [u32; 4] { + let mut out = [0u32; 4]; + unsafe { _mm_storeu_si128(out.as_mut_ptr().cast::<__m128i>(), v) }; + out + } + + fn load(input: &[u32; 4]) -> __m128i { + unsafe { _mm_loadu_si128(input.as_ptr().cast::<__m128i>()) } + } + + quickcheck! { + fn div_85_matches_scalar(block: InputBlock) -> bool { + let InputBlock(input) = block; + let q_out = lanes(div_85(load(&input))); + (0..4).all(|i| q_out[i] == input[i] / 85) + } + + fn div_85_sq_matches_scalar(block: InputBlock) -> bool { + let InputBlock(input) = block; + let q_out = lanes(div_85_sq(load(&input))); + (0..4).all(|i| q_out[i] == input[i] / (85 * 85)) + } + + fn div_85_cube_matches_scalar(block: InputBlock) -> bool { + let InputBlock(input) = block; + let q_out = lanes(div_85_cube(load(&input))); + (0..4).all(|i| q_out[i] == input[i] / (85 * 85 * 85)) + } + + fn div_85_to_the_4_via_composition(block: InputBlock) -> bool { + let InputBlock(input) = block; + let q2 = div_85_sq(load(&input)); + let q4 = div_85_sq(q2); + let q4_out = lanes(q4); + (0..4).all(|i| q4_out[i] == input[i] / (85u32.pow(4))) + } + } +} From 6746d4a63157e217408aba1c79679092fd6ce2ba Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Fri, 1 May 2026 23:59:50 +1000 Subject: [PATCH 2/7] feat(x86): port encode + decode to SSSE3 / SSE4.1 intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the scalar-fallback stubs in `src/block.rs::sse` and `src/ops/x86_64.rs` with real SIMD bodies. Algorithm shape is unchanged from the NEON path; only the intrinsics differ. # div_magic (src/ops/x86_64.rs) Lane-wise libdivide: m = _mm_set1_epi32(MAGIC) even = _mm_mul_epu32(input, m) // touches lanes 0, 2 odd = _mm_mul_epu32(_mm_srli_epi64::<32>(input), m) // lanes 1, 3 pick the high u32 of each u64 product via _mm_shuffle_epi32::<0xF5> blend the two halves with _mm_blend_epi16::<0xCC> finally _mm_srli_epi32:: # encode_block_x4 (src/block.rs::sse) Same parallel-magic structure as NEON: q1, q2, q3 in parallel from n, then q4 = q2 / 85². Five digits via lane-wise multiply-subtract (_mm_mullo_epi32 + _mm_sub_epi32). Splice digits into out_1 (16 bytes) + out_2 (4 bytes) via PSHUFB+OR chains using the same byte-position indexes as NEON. Finally byte_to_char85_x86 (digit → ASCII). # byte_to_char85_x86 — corrected approach Hit a real x86 gotcha here. PSHUFB only zeros its output when bit 7 of the index byte is set; indices in 16..127 produce `table[index & 0x0F]`. 
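A scalar model of that per-byte rule, for reference (illustrative sketch only — this helper is not in the crate):

    fn pshufb_byte(table: &[u8; 16], idx: u8) -> u8 {
        if idx & 0x80 != 0 {
            0 // only a set bit 7 zeroes the output byte
        } else {
            table[(idx & 0x0F) as usize] // otherwise the low nibble indexes the table
        }
    }
    // With chunk 0 of the alphabet (b"0123456789ABCDEF"), pshufb_byte(&chunk0, 84)
    // yields chunk0[4] = '4' rather than the 0 the OR-chain needs — the digit-84
    // case described next.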
The naive "subtract chunk-base, PSHUFB, OR" pattern that works on NEON's TBX (which has merge semantics) silently produces wrong bytes for most inputs (e.g. digit 84 returns t0[4]='4' from chunk 0 instead of 0). Fix: split each digit into a 4-bit high-nibble (chunk index 0..5) and a 4-bit low-nibble (entry within chunk). Run all 6 chunk PSHUFBs unconditionally with the low nibble; mask each result with `high_nib == N` (cmpeq) and OR. This costs ~22 ops vs NEON's single-instruction vqtbl4q_u8, which is intrinsic to x86 PSHUFB having only one 16-byte source. # try_decode_block_x4 (src/block.rs::sse) Mirrors NEON: load 16+4 chars, range-validate [33, 126] via cmpgt, ASCII→digit via the same chunk_idx-selection trick (96-byte CHAR_TO_85_PADDED table), permute to per-position vectors via two PSHUFBs (main + tail) ORed together, Horner via mullo+add, overflow detection via the bias-and-signed-cmpgt unsigned-cmplt trick (SSE has no native unsigned cmplt), big-endian store via PSHUFB + storeu. # Target-feature gating Every SIMD entry point carries `#[target_feature(enable = "sse4.1,ssse3")]` so the file compiles on any `x86_64-*-*` target regardless of default target-features (`x86_64-unknown-linux-gnu` defaults to just SSE2). The callers in `encode_into` / `decode_into` do a one-shot `std::is_x86_feature_detected!` check at the entry point and route to a scalar fallback (`encode_into_scalar` / `decode_into_scalar`) for hosts without SSE4.1+SSSE3. The branch predicts perfectly so per-call cost is negligible. # Verified cargo test — 47 tests pass (aarch64 native) cargo test --target x86_64-apple-darwin — 47 tests pass (Rosetta) cargo clippy --all-targets -- -D warnings — clean (aarch64) cargo clippy --target x86_64-apple-darwin --all-targets -- -D warnings — clean (x86 mac) cargo check --target x86_64-unknown-linux-gnu --all-targets — clean cargo fmt --all -- --check — clean Rosetta dynamically retargets SSE→NEON on Apple Silicon, so it validates correctness only — the perf numbers it produces are not representative of native x86. CI's Ubuntu x86_64 runner will give the first real perf data point. --- src/block.rs | 473 ++++++++++++++++++++++++++++++++++++++-------- src/lib.rs | 92 ++++++++- src/ops/x86_64.rs | 141 ++++++++------ 3 files changed, 568 insertions(+), 138 deletions(-) diff --git a/src/block.rs b/src/block.rs index 0f2887d..5c78186 100644 --- a/src/block.rs +++ b/src/block.rs @@ -531,20 +531,79 @@ pub(crate) use sse::{SseEncoder, try_decode_block_x4}; #[cfg(target_arch = "x86_64")] mod sse { - #![allow(clippy::indexing_slicing)] - // The `unwrap`s in the stub bodies are over `&[u8; 16]` / `&[u8; 20]` - // sub-array conversions whose lengths are statically known. Once the - // bodies are replaced with intrinsics, this allow can probably go. - #![allow(clippy::unwrap_used)] - #![allow(dead_code)] // scaffold — bodies will use these once intrinsics are written. + //! SSSE3 + SSE4.1 encode/decode. + //! + //! Every entry point is `unsafe fn` with + //! `#[target_feature(enable = "sse4.1,ssse3")]` so the file + //! compiles on any `x86_64-*-*` target regardless of the default + //! target-feature set. The caller (`crate::decode_into` / + //! `crate::encode_into`) does a one-shot + //! `is_x86_feature_detected!` check before invoking and falls + //! back to scalar if the host CPU lacks the features. 
use crate::ops::{div_85, div_85_cube, div_85_sq}; - use std::arch::x86_64::__m128i; + use std::arch::x86_64::{ + __m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_cmpgt_epi8, _mm_cmpgt_epi32, + _mm_extract_epi32, _mm_loadu_si128, _mm_movemask_epi8, _mm_mullo_epi32, _mm_or_si128, + _mm_set1_epi8, _mm_set1_epi32, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_storeu_si128, _mm_sub_epi32, _mm_xor_si128, + }; - /// SSSE3/SSE4.1 4-block encoder. Mirrors [`super::neon::NeonEncoder`]. - /// - /// Constructor is a no-op; per-call splice indexes live in static - /// arrays loaded via `_mm_loadu_si128`. + // Index vectors for splicing the 5 digit values into out_1 (16 bytes) + // and out_2 (4 bytes). Same byte-position layout as the NEON path — + // each digit value lives in byte 0/4/8/12 of its u32 lane, and the + // 0xFF entries say "leave this output byte at zero" (PSHUFB sets + // bytes whose index has the high bit set to zero). + static IDX_OUT_1: [[u8; 16]; 5] = [ + [ + 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, 12, + ], + [ + 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, + ], + [ + 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, + ], + [ + 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, + ], + [ + 0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, + ], + ]; + static IDX_OUT_2: [[u8; 16]; 4] = [ + [ + 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, + ], + [ + 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, + ], + [ + 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, + ], + [ + 0xFF, 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, + ], + ]; + + /// PSHUFB index that reverses the bytes within each u32 lane. Used + /// to load 16 bytes as 4 big-endian u32 values (the encoder's input + /// is interpreted in network byte order). + static REV32_IDX: [u8; 16] = [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]; + + /// Alphabet padded to 96 bytes so we can do six 16-byte PSHUFB + /// lookups without reading past the end. Bytes 85..95 are unused + /// padding (any value is fine — they're never indexed because + /// digit values are in [0, 84]). + #[rustfmt::skip] + static ALPHABET_PADDED: &[u8; 96] = + b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~\0\0\0\0\0\0\0\0\0\0\0"; + + /// SSSE3 + SSE4.1 4-block encoder. Mirrors [`super::neon::NeonEncoder`]. pub(crate) struct SseEncoder; impl SseEncoder { @@ -555,80 +614,344 @@ mod sse { /// Encode exactly 16 input bytes into exactly 20 output bytes. /// - /// **Stub.** This currently delegates to the scalar - /// [`super::encode_block`] for each of the 4 sub-blocks, so the - /// path is correct but unaccelerated. Replace the body with the - /// SSE intrinsic chain to get the speed-up: - /// - /// 1. Load 16 bytes via `_mm_loadu_si128` and byte-swap each - /// u32 lane via `_mm_shuffle_epi8` with a constant index - /// `[3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]`. - /// 2. Compute `q1 = n / 85`, `q2 = n / 85²`, `q3 = n / 85³`, - /// `q4 = q2 / 85²` using [`div_85`] / [`div_85_sq`] / - /// [`div_85_cube`] (already wired up — see [`crate::ops`]). - /// 3. 
Compute the 5 digits via lane-wise multiply-subtract - /// (`_mm_sub_epi32(q_k_minus_1, _mm_mullo_epi32(q_k, splat85))`). - /// 4. Splice digits into out_1 (16 bytes) + out_2 (4 bytes) - /// via `_mm_shuffle_epi8` with a chain of OR-blends. - /// 5. Convert digits 0..84 to ASCII via `_mm_shuffle_epi8` - /// against a precomputed alphabet table (the table is too - /// large for a single `pshufb`, so split at 64 like the - /// NEON path). - /// 6. Store via `_mm_storeu_si128` + a 4-byte tail store. + /// # Safety + /// Requires SSE4.1 + SSSE3 to be available on the host CPU. #[inline] - pub(crate) fn encode_block_x4(&self, in_bytes: &[u8; 16], out_bytes: &mut [u8; 20]) { - // SAFETY: in_bytes is a 16-byte array; sub-arrays at the - // 4-byte boundaries are in-bounds. - for i in 0..4 { - let src: &[u8; 4] = (&in_bytes[i * 4..i * 4 + 4]).try_into().unwrap(); - let dst: &mut [u8; 5] = (&mut out_bytes[i * 5..i * 5 + 5]).try_into().unwrap(); - super::encode_block(src, dst); + #[target_feature(enable = "sse4.1,ssse3")] + pub(crate) unsafe fn encode_block_x4(&self, in_bytes: &[u8; 16], out_bytes: &mut [u8; 20]) { + // SAFETY: caller has verified SSE4.1+SSSE3; calls below to the + // user-defined `unsafe fn` helpers (div_85*, byte_to_char85_x86) + // therefore satisfy their `target_feature` preconditions, and + // the unaligned pointer write at the end is bounded by the + // `[u8; 20]` array size. + unsafe { + // Load 16 bytes and reverse each 4-byte group to get 4 BE u32s. + let raw = _mm_loadu_si128(in_bytes.as_ptr().cast::<__m128i>()); + let n = + _mm_shuffle_epi8(raw, _mm_loadu_si128(REV32_IDX.as_ptr().cast::<__m128i>())); + + // Parallel-magic divides: q1, q2, q3 from n; q4 from q2. + let q1 = div_85(n); + let q2 = div_85_sq(n); + let q3 = div_85_cube(n); + let q4 = div_85_sq(q2); + + // Five digits via lane-wise multiply-subtract: + // d0 = q4 + // d1 = q3 - q4·85 + // d2 = q2 - q3·85 + // d3 = q1 - q2·85 + // d4 = n - q1·85 + let m85 = _mm_set1_epi32(85); + let d0 = q4; + let d1 = _mm_sub_epi32(q3, _mm_mullo_epi32(q4, m85)); + let d2 = _mm_sub_epi32(q2, _mm_mullo_epi32(q3, m85)); + let d3 = _mm_sub_epi32(q1, _mm_mullo_epi32(q2, m85)); + let d4 = _mm_sub_epi32(n, _mm_mullo_epi32(q1, m85)); + + // Splice 5 digits into out_1 via PSHUFB+OR chain. + let idx0 = _mm_loadu_si128(IDX_OUT_1[0].as_ptr().cast::<__m128i>()); + let idx1 = _mm_loadu_si128(IDX_OUT_1[1].as_ptr().cast::<__m128i>()); + let idx2 = _mm_loadu_si128(IDX_OUT_1[2].as_ptr().cast::<__m128i>()); + let idx3 = _mm_loadu_si128(IDX_OUT_1[3].as_ptr().cast::<__m128i>()); + let idx4 = _mm_loadu_si128(IDX_OUT_1[4].as_ptr().cast::<__m128i>()); + let p0 = _mm_shuffle_epi8(d0, idx0); + let p1 = _mm_shuffle_epi8(d1, idx1); + let p2 = _mm_shuffle_epi8(d2, idx2); + let p3 = _mm_shuffle_epi8(d3, idx3); + let p4 = _mm_shuffle_epi8(d4, idx4); + let out_1 = + _mm_or_si128(_mm_or_si128(p0, p1), _mm_or_si128(p2, _mm_or_si128(p3, p4))); + + // Same for out_2 (4 chars from block 3, digits 1..4). + let i20 = _mm_loadu_si128(IDX_OUT_2[0].as_ptr().cast::<__m128i>()); + let i21 = _mm_loadu_si128(IDX_OUT_2[1].as_ptr().cast::<__m128i>()); + let i22 = _mm_loadu_si128(IDX_OUT_2[2].as_ptr().cast::<__m128i>()); + let i23 = _mm_loadu_si128(IDX_OUT_2[3].as_ptr().cast::<__m128i>()); + let q21 = _mm_shuffle_epi8(d1, i20); + let q22 = _mm_shuffle_epi8(d2, i21); + let q23 = _mm_shuffle_epi8(d3, i22); + let q24 = _mm_shuffle_epi8(d4, i23); + let out_2 = _mm_or_si128(_mm_or_si128(q21, q22), _mm_or_si128(q23, q24)); + + // Digit value (0..84) → ASCII char via PSHUFB chain. 
+ let out_1 = byte_to_char85_x86(out_1); + let out_2 = byte_to_char85_x86(out_2); + + // Store: 16 bytes from out_1, then the low 4 bytes from out_2. + _mm_storeu_si128(out_bytes.as_mut_ptr().cast::<__m128i>(), out_1); + let lo = _mm_extract_epi32::<0>(out_2) as u32; + std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(16).cast::(), lo); } - // Suppress unused-import warnings while these are not yet wired - // through the SIMD path. - let _: fn(__m128i) -> __m128i = div_85; - let _: fn(__m128i) -> __m128i = div_85_sq; - let _: fn(__m128i) -> __m128i = div_85_cube; } } - /// SSSE3 decode of 20 input chars → 16 output bytes. + /// Maps 16 lane-wise digit values (0..84) to ASCII characters. /// - /// **Stub.** Currently delegates to scalar [`super::decode_block`] - /// for each of the 4 sub-blocks, returning `Err(())` on the first - /// invalid char or u32 overflow so the loop in - /// [`crate::decode_into`] falls back to scalar (which surfaces - /// the precise [`crate::DecodeError`] with the right byte - /// position). + /// PSHUFB only zeros its output when **bit 7** of the index is set; + /// indices like 16..127 produce `table[index & 0x0F]` instead of 0, + /// so a naive "subtract chunk-base, then PSHUFB and OR all chunks" + /// pattern doesn't work directly. /// - /// Sketch of the SIMD body: + /// Instead we split each digit into a high nibble (chunk index, 0..5) + /// and a low nibble (entry within chunk, 0..15). All 6 chunk PSHUFBs + /// run unconditionally with the low nibble; the chunk-equality mask + /// (`high_nibble == N`) keeps only the correct one. This costs more + /// instructions than NEON's `vqtbl4q_u8` (which does a 64-entry + /// lookup in a single op) but avoids any PSHUFB high-bit shenanigans. + #[inline] + #[target_feature(enable = "sse4.1,ssse3")] + unsafe fn byte_to_char85_x86(x85: __m128i) -> __m128i { + // SAFETY: ALPHABET_PADDED is 96 bytes; offsets 0/16/32/48/64/80 + // each leave 16 readable bytes. Caller satisfies SSE4.1+SSSE3. + unsafe { + // _mm_srli_epi16 shifts each u16 lane → bits cross byte + // boundaries within a u16. The trailing AND 0x0F masks the + // cross-contamination so we get a clean per-byte high nibble. + let high_nib = _mm_and_si128(_mm_srli_epi16::<4>(x85), _mm_set1_epi8(0x0F)); + let low_nib = _mm_and_si128(x85, _mm_set1_epi8(0x0F)); + + let t0 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().cast::<__m128i>()); + let t1 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(16).cast::<__m128i>()); + let t2 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(32).cast::<__m128i>()); + let t3 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(48).cast::<__m128i>()); + let t4 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(64).cast::<__m128i>()); + let t5 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(80).cast::<__m128i>()); + + // 6 PSHUFBs — each looks up `low_nib` in its own chunk. + let r0 = _mm_shuffle_epi8(t0, low_nib); + let r1 = _mm_shuffle_epi8(t1, low_nib); + let r2 = _mm_shuffle_epi8(t2, low_nib); + let r3 = _mm_shuffle_epi8(t3, low_nib); + let r4 = _mm_shuffle_epi8(t4, low_nib); + let r5 = _mm_shuffle_epi8(t5, low_nib); + + // Mask each result by `high_nib == N` and OR them together. 
+ let m0 = _mm_cmpeq_epi8(high_nib, _mm_setzero_si128()); + let m1 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(1)); + let m2 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(2)); + let m3 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(3)); + let m4 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(4)); + let m5 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(5)); + + _mm_or_si128( + _mm_or_si128( + _mm_or_si128(_mm_and_si128(r0, m0), _mm_and_si128(r1, m1)), + _mm_or_si128(_mm_and_si128(r2, m2), _mm_and_si128(r3, m3)), + ), + _mm_or_si128(_mm_and_si128(r4, m4), _mm_and_si128(r5, m5)), + ) + } + } + + /// 96-byte char→digit lookup, indexed by `char - 33`. Same data as + /// the NEON `CHAR_TO_85` + `CHAR_TO_85_X2` tables concatenated: + /// chars 33..96 then chars 97..128. Invalid in-range chars (alphabet + /// gaps) and pad slots are 0xFF so the caller can detect them via + /// `_mm_cmpeq_epi8` against 0xFF. + #[rustfmt::skip] + static CHAR_TO_85_PADDED: [u8; 96] = [ + // chars 33..96 (64 entries) + 62, 0xFF, 63, 64, 65, 66, 0xFF, 67, 68, 69, 70, 0xFF, 71, 0xFF, 0xFF, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xFF, 72, 73, 74, 75, 76, 77, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xFF, 0xFF, 0xFF, 78, 79, 80, + // chars 97..128 (32 entries) + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 81, 82, 83, 84, 0xFF, 0xFF, + ]; + + /// Maps 16 ASCII chars to base85 digit values (0..84) lane-wise. + /// Chars not in the alphabet (or out of [33, 126]) come back as 0xFF. + #[inline] + #[target_feature(enable = "sse4.1,ssse3")] + unsafe fn char85_to_byte_x86(chars: __m128i) -> __m128i { + // SAFETY: caller upholds SSE4.1+SSSE3. + unsafe { + // Normalise so valid chars 33..128 map to indices 0..95. + // Out-of-range chars wrap; they'll be filtered by the caller's + // explicit range check. + let normalised = _mm_sub_epi32(chars, _mm_set1_epi8(33)); + let high_nib = _mm_and_si128(_mm_srli_epi16::<4>(normalised), _mm_set1_epi8(0x0F)); + let low_nib = _mm_and_si128(normalised, _mm_set1_epi8(0x0F)); + + let t0 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().cast::<__m128i>()); + let t1 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(16).cast::<__m128i>()); + let t2 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(32).cast::<__m128i>()); + let t3 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(48).cast::<__m128i>()); + let t4 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(64).cast::<__m128i>()); + let t5 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(80).cast::<__m128i>()); + + let r0 = _mm_shuffle_epi8(t0, low_nib); + let r1 = _mm_shuffle_epi8(t1, low_nib); + let r2 = _mm_shuffle_epi8(t2, low_nib); + let r3 = _mm_shuffle_epi8(t3, low_nib); + let r4 = _mm_shuffle_epi8(t4, low_nib); + let r5 = _mm_shuffle_epi8(t5, low_nib); + + let m0 = _mm_cmpeq_epi8(high_nib, _mm_setzero_si128()); + let m1 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(1)); + let m2 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(2)); + let m3 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(3)); + let m4 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(4)); + let m5 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(5)); + + // For high_nib >= 6 (i.e. char > 128 + offset), no chunk mask + // matches and the OR-result is zero. We rely on the caller's + // range check to flag those — zero would otherwise alias with + // legitimate digit value 0 ('0'). 
+ _mm_or_si128( + _mm_or_si128( + _mm_or_si128(_mm_and_si128(r0, m0), _mm_and_si128(r1, m1)), + _mm_or_si128(_mm_and_si128(r2, m2), _mm_and_si128(r3, m3)), + ), + _mm_or_si128(_mm_and_si128(r4, m4), _mm_and_si128(r5, m5)), + ) + } + } + + /// Per-position permutation indexes for the decoder. Each `IDX_DK_*` + /// extracts byte k of each 5-byte block into a u32x4 where lane i + /// holds digit k of block i (in the low byte of the lane). + /// `_MAIN` indexes pull from `digits_main` (chars 0..15) and `_TAIL` + /// from `digits_tail` (chars 16..19, padded). Both run via PSHUFB, + /// and the two results are ORed. + #[rustfmt::skip] + static IDX_DEC: [([u8; 16], [u8; 16]); 5] = [ + // d0: chars 0, 5, 10, 15 → all from main + ( + [0, 0xFF, 0xFF, 0xFF, 5, 0xFF, 0xFF, 0xFF, 10, 0xFF, 0xFF, 0xFF, 15, 0xFF, 0xFF, 0xFF], + [0xFF; 16], + ), + // d1: chars 1, 6, 11 main + 16 (tail[0]) + ( + [1, 0xFF, 0xFF, 0xFF, 6, 0xFF, 0xFF, 0xFF, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF], + ), + // d2: chars 2, 7, 12 main + 17 (tail[1]) + ( + [2, 0xFF, 0xFF, 0xFF, 7, 0xFF, 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1, 0xFF, 0xFF, 0xFF], + ), + // d3: chars 3, 8, 13 main + 18 (tail[2]) + ( + [3, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 0xFF, 0xFF, 0xFF], + ), + // d4: chars 4, 9, 14 main + 19 (tail[3]) + ( + [4, 0xFF, 0xFF, 0xFF, 9, 0xFF, 0xFF, 0xFF, 14, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 3, 0xFF, 0xFF, 0xFF], + ), + ]; + + /// Mask: 0xFF for the first 4 bytes (the real input chars 16..19), + /// 0x00 for the rest (zero-padding in `chars_tail`). + static TAIL_VALID_MASK: [u8; 16] = [0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + + /// PSHUFB index that byte-swaps each u32 lane (BE store after the + /// Horner chain produces native-endian u32s). + static REV32_DEC_IDX: [u8; 16] = REV32_IDX; + + /// SSSE3 + SSE4.1 4-block decoder. Mirrors [`super::neon::try_decode_block_x4`]. + /// + /// Returns `Err(())` (so the caller falls back to the scalar path, + /// which will surface a precise [`crate::DecodeError`]) when: + /// - any input byte is outside the base85 alphabet, or + /// - any of the four 5-char blocks decodes to a value > `u32::MAX`. /// - /// 1. `_mm_loadu_si128` for chars 0..16 and a partial load for - /// chars 16..19 (zero-padded). - /// 2. Range-validate `[33, 126]` via two `_mm_cmpgt_epi8` masks - /// and `_mm_or_si128`. - /// 3. ASCII → digit via two `_mm_shuffle_epi8` table lookups - /// (low half + high half of the alphabet) with a `_mm_blendv_epi8` - /// selector keyed on `chars < 97`. - /// 4. Detect 0xFF results (in-alphabet gaps) via `_mm_cmpeq_epi8`. - /// 5. Permute digits into per-position `__m128i` vectors `v_d0..v_d4` - /// via `_mm_shuffle_epi8` over a paired source. - /// 6. Horner-style `vmlaq_n_u32` chain becomes `_mm_mullo_epi32` + - /// `_mm_add_epi32`. - /// 7. Overflow check: `n_wrap < d0 * 85⁴` lane-wise via - /// `_mm_cmplt_epi32` (for unsigned this needs the bias trick). - /// 8. Byte-swap each u32 lane back to BE via `_mm_shuffle_epi8`, - /// then `_mm_storeu_si128`. + /// # Safety + /// Requires SSE4.1 + SSSE3 on the host CPU. 
#[inline] - pub(crate) fn try_decode_block_x4(input: &[u8; 20], out: &mut [u8; 16]) -> Result<(), ()> { - for i in 0..4 { - let src: &[u8; 5] = (&input[i * 5..i * 5 + 5]).try_into().unwrap(); - let dst: &mut [u8; 4] = (&mut out[i * 4..i * 4 + 4]).try_into().unwrap(); - // base_position 0 because the caller (`decode_into`) replays - // the chunk scalar on Err and handles position reporting itself. - super::decode_block(src, dst, 0).map_err(|_| ())?; + #[target_feature(enable = "sse4.1,ssse3")] + pub(crate) unsafe fn try_decode_block_x4( + input: &[u8; 20], + out: &mut [u8; 16], + ) -> Result<(), ()> { + // SAFETY: caller upholds SSE4.1+SSSE3. + unsafe { + // Load 16 chars + 4 tail chars (zero-padded to 16). + let chars_main = _mm_loadu_si128(input.as_ptr().cast::<__m128i>()); + let mut tail_buf = [0u8; 16]; + tail_buf[..4].copy_from_slice(&input[16..20]); + let chars_tail = _mm_loadu_si128(tail_buf.as_ptr().cast::<__m128i>()); + let tail_valid_mask = _mm_loadu_si128(TAIL_VALID_MASK.as_ptr().cast::<__m128i>()); + + // Range validation: char must be in [33, 126]. + // _mm_cmpgt_epi8 is signed but values 33..126 fit in i8 [33, 126]. + let too_low_main = _mm_cmpgt_epi8(_mm_set1_epi8(33), chars_main); + let too_high_main = _mm_cmpgt_epi8(chars_main, _mm_set1_epi8(126)); + let invalid_range_main = _mm_or_si128(too_low_main, too_high_main); + + let too_low_tail = _mm_cmpgt_epi8(_mm_set1_epi8(33), chars_tail); + let too_high_tail = _mm_cmpgt_epi8(chars_tail, _mm_set1_epi8(126)); + let invalid_range_tail = + _mm_and_si128(_mm_or_si128(too_low_tail, too_high_tail), tail_valid_mask); + + // ASCII → digit lookup (returns 0xFF for in-range invalid chars). + let digits_main = char85_to_byte_x86(chars_main); + let digits_tail = char85_to_byte_x86(chars_tail); + + let invalid_dig_main = _mm_cmpeq_epi8(digits_main, _mm_set1_epi8(0xFF_u8 as i8)); + let invalid_dig_tail = _mm_and_si128( + _mm_cmpeq_epi8(digits_tail, _mm_set1_epi8(0xFF_u8 as i8)), + tail_valid_mask, + ); + + let any_invalid = _mm_or_si128( + _mm_or_si128(invalid_range_main, invalid_range_tail), + _mm_or_si128(invalid_dig_main, invalid_dig_tail), + ); + // _mm_movemask_epi8 returns the high bit of each byte as a 16-bit mask. + // Non-zero ⇒ at least one invalid byte. + if _mm_movemask_epi8(any_invalid) != 0 { + return Err(()); + } + + // Permute digits into per-position u32x4 vectors. PSHUFB only + // takes one source vector, so each position needs two PSHUFBs + // (main + tail) ORed together. + let v_dk = |idx_main: &[u8; 16], idx_tail: &[u8; 16]| -> __m128i { + let im = _mm_loadu_si128(idx_main.as_ptr().cast::<__m128i>()); + let it = _mm_loadu_si128(idx_tail.as_ptr().cast::<__m128i>()); + _mm_or_si128( + _mm_shuffle_epi8(digits_main, im), + _mm_shuffle_epi8(digits_tail, it), + ) + }; + let v_d0 = v_dk(&IDX_DEC[0].0, &IDX_DEC[0].1); + let v_d1 = v_dk(&IDX_DEC[1].0, &IDX_DEC[1].1); + let v_d2 = v_dk(&IDX_DEC[2].0, &IDX_DEC[2].1); + let v_d3 = v_dk(&IDX_DEC[3].0, &IDX_DEC[3].1); + let v_d4 = v_dk(&IDX_DEC[4].0, &IDX_DEC[4].1); + + // Horner: n = ((((d0·85 + d1)·85 + d2)·85 + d3)·85 + d4) + let m85 = _mm_set1_epi32(85); + let n = _mm_add_epi32(v_d1, _mm_mullo_epi32(v_d0, m85)); + let n = _mm_add_epi32(v_d2, _mm_mullo_epi32(n, m85)); + let n = _mm_add_epi32(v_d3, _mm_mullo_epi32(n, m85)); + let n = _mm_add_epi32(v_d4, _mm_mullo_epi32(n, m85)); + + // Overflow detection: n_wrap < d0 · 85⁴ ⇒ overflow occurred. + // SSE doesn't have an unsigned cmplt; bias-and-signed-cmpgt + // is the standard trick. 
+ let d0_85_4 = _mm_mullo_epi32(v_d0, _mm_set1_epi32(52_200_625)); + let bias = _mm_set1_epi32(0x80000000_u32 as i32); + let n_b = _mm_xor_si128(n, bias); + let d0_b = _mm_xor_si128(d0_85_4, bias); + // overflow_mask lane = 0xFFFFFFFF if d0_85_4 > n (unsigned). + let overflow_mask = _mm_cmpgt_epi32(d0_b, n_b); + if _mm_movemask_epi8(overflow_mask) != 0 { + return Err(()); + } + + // Byte-swap each u32 lane to BE and store. + let bytes_be = + _mm_shuffle_epi8(n, _mm_loadu_si128(REV32_DEC_IDX.as_ptr().cast::<__m128i>())); + _mm_storeu_si128(out.as_mut_ptr().cast::<__m128i>(), bytes_be); + + Ok(()) } - Ok(()) } } diff --git a/src/lib.rs b/src/lib.rs index d2b19d9..9c2318f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,13 +200,20 @@ fn encode_into(input: &[u8], out: &mut [u8]) { fn encode_into(input: &[u8], out: &mut [u8]) { use block::SseEncoder; + // Runtime feature gate: the SSE intrinsics carry + // `#[target_feature(enable = "sse4.1,ssse3")]`, so calling them on + // a CPU without those features is UB. Detect once at the entry + // point and route to the scalar fallback otherwise. This branch + // predicts perfectly so the cost is negligible. + if !(std::is_x86_feature_detected!("sse4.1") && std::is_x86_feature_detected!("ssse3")) { + encode_into_scalar(input, out); + return; + } + let encoder = SseEncoder::new(); let mut in_off = 0usize; let mut out_off = 0usize; - // 16-byte → 20-char SSE loop. (Currently a scalar-fallback stub - // inside `SseEncoder::encode_block_x4`; replace its body with - // SSSE3/SSE4.1 intrinsics to get the speedup.) while in_off.saturating_add(16) <= input.len() { let Some(in_chunk) = chunk_at::<16>(input, in_off) else { return; @@ -214,7 +221,8 @@ fn encode_into(input: &[u8], out: &mut [u8]) { let Some(out_chunk) = chunk_at_mut::<20>(out, out_off) else { return; }; - encoder.encode_block_x4(in_chunk, out_chunk); + // SAFETY: we verified SSE4.1 + SSSE3 above. + unsafe { encoder.encode_block_x4(in_chunk, out_chunk) }; in_off = in_off.saturating_add(16); out_off = out_off.saturating_add(20); } @@ -243,6 +251,37 @@ fn encode_into(input: &[u8], out: &mut [u8]) { } } +/// Pure-scalar fallback used by the x86 path on CPUs lacking +/// SSE4.1 + SSSE3 (e.g. very old Pentium 4-class hardware). 
+#[cfg(target_arch = "x86_64")] +fn encode_into_scalar(input: &[u8], out: &mut [u8]) { + let mut in_off = 0usize; + let mut out_off = 0usize; + + while in_off.saturating_add(4) <= input.len() { + let Some(block) = chunk_at::<4>(input, in_off) else { + return; + }; + let Some(chunk) = chunk_at_mut::<5>(out, out_off) else { + return; + }; + encode_block(block, chunk); + in_off = in_off.saturating_add(4); + out_off = out_off.saturating_add(5); + } + + let tail_in_len = input.len().saturating_sub(in_off); + if tail_in_len > 0 { + let tail_out_len = tail_in_len.saturating_add(1); + if let (Some(tail_in), Some(tail_out)) = ( + input.get(in_off..in_off.saturating_add(tail_in_len)), + out.get_mut(out_off..out_off.saturating_add(tail_out_len)), + ) { + encode_tail(tail_in, tail_out); + } + } +} + #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn encode_into(input: &[u8], out: &mut [u8]) { let mut in_off = 0usize; @@ -346,11 +385,17 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { use block::try_decode_block_x4; + // Runtime feature gate: the SSE intrinsics are gated on + // `target_feature(enable = "sse4.1,ssse3")`; calling them on a CPU + // lacking those features is UB. Detect once at the entry point. + if !(std::is_x86_feature_detected!("sse4.1") && std::is_x86_feature_detected!("ssse3")) { + return decode_into_scalar(input, out); + } + let mut in_off = 0usize; let mut out_off = 0usize; while in_off + 20 <= input.len() && out_off + 16 <= out.len() { - // SAFETY: the slice arms below are checked by the loop guard. let Some(in_chunk) = chunk_at::<20>(input, in_off) else { return Ok(()); }; @@ -358,7 +403,9 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { return Ok(()); }; - if try_decode_block_x4(in_chunk, out_chunk).is_ok() { + // SAFETY: SSE4.1 + SSSE3 verified above. + let sse_ok = unsafe { try_decode_block_x4(in_chunk, out_chunk) }.is_ok(); + if sse_ok { in_off += 20; out_off += 16; } else { @@ -394,6 +441,39 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { Ok(()) } +/// Pure-scalar fallback used by the x86 decode path on CPUs lacking +/// SSE4.1 + SSSE3. +#[cfg(target_arch = "x86_64")] +fn decode_into_scalar(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { + let mut in_off = 0usize; + let mut out_off = 0usize; + + while in_off.saturating_add(5) <= input.len() { + let Some(block) = chunk_at::<5>(input, in_off) else { + return Ok(()); + }; + let Some(chunk) = chunk_at_mut::<4>(out, out_off) else { + return Ok(()); + }; + decode_block(block, chunk, in_off)?; + in_off = in_off.saturating_add(5); + out_off = out_off.saturating_add(4); + } + + let tail_in_len = input.len().saturating_sub(in_off); + if tail_in_len > 0 { + let tail_out_len = tail_in_len.saturating_sub(1); + if let (Some(tail_in), Some(tail_out)) = ( + input.get(in_off..in_off.saturating_add(tail_in_len)), + out.get_mut(out_off..out_off.saturating_add(tail_out_len)), + ) { + decode_tail(tail_in, tail_out, in_off)?; + } + } + + Ok(()) +} + #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { let mut in_off = 0usize; diff --git a/src/ops/x86_64.rs b/src/ops/x86_64.rs index 29b2be3..d5f6269 100644 --- a/src/ops/x86_64.rs +++ b/src/ops/x86_64.rs @@ -1,81 +1,108 @@ -//! x86_64 SSE2 / SSSE3 helpers — the structural mirror of `aarch64.rs`. +//! 
x86_64 SSSE3 / SSE4.1 helpers — the structural mirror of `aarch64.rs`. //! //! The magic constants are the same (libdivide form, computed from the //! divisor); only the SIMD intrinsics differ. Mapping cheat-sheet: //! -//! | NEON | x86 (SSE2 / SSSE3) | what it does | -//! |------------------------|---------------------------|-------------------------------------------| -//! | `vmull_u32` | `_mm_mul_epu32` | u32×u32 → u64 widening multiply (2 lanes) | -//! | `vshrq_n_u32` | `_mm_srli_epi32` | logical shift right per u32 lane | -//! | `vmulq_n_u32` | `_mm_mullo_epi32` (SSE4.1)| u32×u32 → u32 (low 32 bits) | -//! | `vsubq_u32` | `_mm_sub_epi32` | u32 lane-wise subtract | -//! | `vqtbl1q_u8` | `_mm_shuffle_epi8` (SSSE3)| byte-shuffle / lookup | -//! | `vbslq_u8` | `_mm_blendv_epi8` (SSE4.1)| byte-wise blend with mask | -//! | `vrev32q_u8` | `_mm_shuffle_epi8` + idx | reverse bytes in each u32 lane | +//! | NEON | x86 (SSE2 / SSSE3 / SSE4.1) | what it does | +//! |------------------------|------------------------------|-------------------------------------------| +//! | `vmull_u32` | `_mm_mul_epu32` | u32×u32 → u64 widening (2 lanes per call) | +//! | `vshrq_n_u32` | `_mm_srli_epi32` | logical shift right per u32 lane | +//! | `vmulq_n_u32` | `_mm_mullo_epi32` (SSE4.1) | u32×u32 → u32 (low 32 bits) | +//! | `vsubq_u32` | `_mm_sub_epi32` | u32 lane-wise subtract | +//! | `vqtbl1q_u8` | `_mm_shuffle_epi8` (SSSE3) | byte-shuffle / lookup | +//! | `vbslq_u8` | `_mm_blendv_epi8` (SSE4.1) | byte-wise blend with mask | +//! | `vrev32q_u8` | `_mm_shuffle_epi8` + idx | reverse bytes in each u32 lane | //! -//! Required CPU features for the decode hot path: **SSSE3** (for -//! `pshufb`). For the encode hot path: **SSE4.1** (for `pmulld`). -//! Both are universal on x86_64 silicon since ~2008. -//! -//! ## Implementation status -//! -//! Currently a stub: the helpers below compile and produce correct -//! results, but they go through the scalar path (manual 4-lane -//! unroll). Replace the bodies with intrinsics one at a time and -//! verify against the quickcheck tests at the bottom of the file. +//! All SIMD entry points carry `#[target_feature(enable = "sse4.1,ssse3")]` +//! so the file compiles on any `x86_64-*-*` target regardless of the +//! default target-feature set (`x86_64-unknown-linux-gnu` defaults to +//! just SSE2). Callers must perform a runtime feature detection via +//! `std::is_x86_feature_detected!` before invoking. -use std::arch::x86_64::__m128i; +use std::arch::x86_64::{ + __m128i, _mm_blend_epi16, _mm_mul_epu32, _mm_set1_epi32, _mm_shuffle_epi32, _mm_srli_epi32, + _mm_srli_epi64, +}; /// Lane-wise `input / 85`. Magic constants verified for the full u32 range. +/// +/// # Safety +/// Requires SSE4.1 + SSSE3 to be available on the host CPU. #[inline] -pub(crate) fn div_85(input: __m128i) -> __m128i { +#[target_feature(enable = "sse4.1,ssse3")] +pub(crate) unsafe fn div_85(input: __m128i) -> __m128i { // m = ceil(2^38 / 85) = 3_233_857_729; m * 85 - 2^38 = 21 < 2^6. - div_magic::<3_233_857_729, 6>(input) + // SAFETY: caller upholds SSE4.1+SSSE3 (function-level target_feature). + unsafe { div_magic::<3_233_857_729, 6>(input) } } /// Lane-wise `input / 7225` (i.e. `input / 85²`). Valid for all u32 inputs. +/// +/// # Safety +/// Requires SSE4.1 + SSSE3 to be available on the host CPU. 
#[inline] -pub(crate) fn div_85_sq(input: __m128i) -> __m128i { +#[target_feature(enable = "sse4.1,ssse3")] +pub(crate) unsafe fn div_85_sq(input: __m128i) -> __m128i { // m = ceil(2^44 / 7225) = 2_434_904_643; m * 7225 - 2^44 = 1259 < 2^12. - div_magic::<2_434_904_643, 12>(input) + // SAFETY: caller upholds SSE4.1+SSSE3. + unsafe { div_magic::<2_434_904_643, 12>(input) } } /// Lane-wise `input / 614125` (i.e. `input / 85³`). Valid for all u32 inputs. +/// +/// # Safety +/// Requires SSE4.1 + SSSE3 to be available on the host CPU. #[inline] -pub(crate) fn div_85_cube(input: __m128i) -> __m128i { +#[target_feature(enable = "sse4.1,ssse3")] +pub(crate) unsafe fn div_85_cube(input: __m128i) -> __m128i { // m = ceil(2^51 / 614125) = 3_666_679_933; bound check passes for u32. - div_magic::<3_666_679_933, 19>(input) + // SAFETY: caller upholds SSE4.1+SSSE3. + unsafe { div_magic::<3_666_679_933, 19>(input) } } /// Generic libdivide-style div: `q = (n * MAGIC) >> (32 + SHIFT)`. /// -/// SSE2 path. The widening multiply on x86 is `_mm_mul_epu32`, which -/// multiplies the *even* u32 lanes of two `__m128i` registers and -/// produces a `__m128i` containing two u64 results in the even -/// positions. We multiply twice — once for even lanes, once after -/// shuffling odd lanes into even positions — then shift to extract -/// the high u32 of each u64 product, and finally apply the SHIFT. -// -// TODO(x86 SIMD): replace this scalar fallback body with the SSE2 -// multiply + shift sequence sketched above. The interface is fixed; -// only the body needs to change. See the existing aarch64.rs for the -// matching NEON sequence. +/// `_mm_mul_epu32` multiplies the **low u32 of each u64 lane**, so it +/// only touches input lanes 0 and 2 per call. We do two multiplies — +/// one as-is (for even lanes 0, 2), one after `_mm_srli_epi64::<32>` +/// (which moves odd lanes 1, 3 into the low u32 of each u64 lane). +/// +/// After the two multiplies we have: +/// even = [(in[0]·m)lo, (in[0]·m)hi, (in[2]·m)lo, (in[2]·m)hi] +/// odd = [(in[1]·m)lo, (in[1]·m)hi, (in[3]·m)lo, (in[3]·m)hi] +/// +/// We want u32x4 = [(in[k]·m)>>32 for k=0..3] = the four `hi` lanes. +/// One `_mm_shuffle_epi32::<0xF5>` per side broadcasts the `hi` u32 +/// across each u64 half (so even_hi = [hi0, hi0, hi2, hi2] etc.), and +/// `_mm_blend_epi16::<0xCC>` interleaves them into [hi0, hi1, hi2, hi3]. +/// Finally `_mm_srli_epi32::` applies the libdivide shift. +/// +/// # Safety +/// Requires SSE4.1 + SSSE3 (the function-level `target_feature` +/// satisfies this for any caller that has done the runtime check). #[inline] -fn div_magic(input: __m128i) -> __m128i { - use std::arch::x86_64::{_mm_loadu_si128, _mm_storeu_si128}; +#[target_feature(enable = "sse4.1,ssse3")] +unsafe fn div_magic(input: __m128i) -> __m128i { + let magic = _mm_set1_epi32(MAGIC as i32); - // Stub: dump to stack, do scalar division, reload. Correct but slow. - let mut lanes = [0u32; 4]; - unsafe { - _mm_storeu_si128(lanes.as_mut_ptr().cast::<__m128i>(), input); - } - for lane in &mut lanes { - let xl = MAGIC as u64; - let yl = *lane as u64; - let q = ((xl * yl) >> 32) as u32; - *lane = q >> SHIFT; - } - unsafe { _mm_loadu_si128(lanes.as_ptr().cast::<__m128i>()) } + // Even-lane multiply (touches input lanes 0 and 2). + let even = _mm_mul_epu32(input, magic); + // Shift right by 32 in u64 view to bring lanes 1 and 3 into + // the low u32 of each u64 lane, then multiply. 
+ let input_odd = _mm_srli_epi64::<32>(input); + let odd = _mm_mul_epu32(input_odd, magic); + + // 0b11_11_01_01 picks lane 1 / lane 1 / lane 3 / lane 3 — i.e. + // broadcasts the high u32 of each u64 product across the pair. + let even_hi = _mm_shuffle_epi32::<0b11_11_01_01>(even); + let odd_hi = _mm_shuffle_epi32::<0b11_11_01_01>(odd); + + // Blend at u16 granularity: 0xCC = 0b1100_1100 picks the upper + // u16 pair (i.e. odd_hi) for lanes 1 and 3 of the u32 view, and + // the lower (even_hi) for lanes 0 and 2. + let combined = _mm_blend_epi16::<0xCC>(even_hi, odd_hi); + + _mm_srli_epi32::(combined) } #[cfg(test)] @@ -109,26 +136,26 @@ mod tests { quickcheck! { fn div_85_matches_scalar(block: InputBlock) -> bool { let InputBlock(input) = block; - let q_out = lanes(div_85(load(&input))); + let q_out = lanes(unsafe { div_85(load(&input)) }); (0..4).all(|i| q_out[i] == input[i] / 85) } fn div_85_sq_matches_scalar(block: InputBlock) -> bool { let InputBlock(input) = block; - let q_out = lanes(div_85_sq(load(&input))); + let q_out = lanes(unsafe { div_85_sq(load(&input)) }); (0..4).all(|i| q_out[i] == input[i] / (85 * 85)) } fn div_85_cube_matches_scalar(block: InputBlock) -> bool { let InputBlock(input) = block; - let q_out = lanes(div_85_cube(load(&input))); + let q_out = lanes(unsafe { div_85_cube(load(&input)) }); (0..4).all(|i| q_out[i] == input[i] / (85 * 85 * 85)) } fn div_85_to_the_4_via_composition(block: InputBlock) -> bool { let InputBlock(input) = block; - let q2 = div_85_sq(load(&input)); - let q4 = div_85_sq(q2); + let q2 = unsafe { div_85_sq(load(&input)) }; + let q4 = unsafe { div_85_sq(q2) }; let q4_out = lanes(q4); (0..4).all(|i| q4_out[i] == input[i] / (85u32.pow(4))) } From 199ad3d13d634b4feafadb181bf48e51b78d751b Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Sat, 2 May 2026 00:03:58 +1000 Subject: [PATCH 3/7] fix(x86): re-add inner unsafe blocks for Rust 1.85 MSRV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rust 1.85 still applies `unsafe_op_in_unsafe_fn` to target-feature- gated intrinsics inside an `unsafe fn`, so calls to `_mm_set1_epi32` etc. need explicit `unsafe { }` blocks even though the surrounding function is itself `unsafe fn` with matching `target_feature`. A later toolchain change relaxes that requirement and instead flags the same blocks as `unused_unsafe`. The fix: - Re-add the inner `unsafe { ... }` blocks around each SSE function body so 1.85 accepts them. - Add `#![allow(unused_unsafe)]` at the module level for both `src/ops/x86_64.rs` and the `src/block.rs::sse` submodule so newer toolchains don't break CI on the spurious warning. Verified locally: cargo +1.85 test --target x86_64-apple-darwin — 47 tests pass cargo +1.85 clippy --target x86_64-apple-darwin --all-targets -- -D warnings — clean cargo clippy --target x86_64-apple-darwin --all-targets -- -D warnings — clean (stable) cargo test — 47 tests pass (native) --- src/block.rs | 8 ++++++++ src/ops/x86_64.rs | 52 +++++++++++++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/block.rs b/src/block.rs index 5c78186..f1e4552 100644 --- a/src/block.rs +++ b/src/block.rs @@ -540,6 +540,14 @@ mod sse { //! `crate::encode_into`) does a one-shot //! `is_x86_feature_detected!` check before invoking and falls //! back to scalar if the host CPU lacks the features. + //! + //! Inner `unsafe { ... }` blocks are required on Rust 1.85 (MSRV) + //! for calls to target-feature-gated intrinsics inside an + //! 
`unsafe fn`. Newer toolchains relax this and may flag those + //! blocks as redundant — hence the module-level `unused_unsafe` + //! allow. + + #![allow(unused_unsafe)] use crate::ops::{div_85, div_85_cube, div_85_sq}; use std::arch::x86_64::{ diff --git a/src/ops/x86_64.rs b/src/ops/x86_64.rs index d5f6269..f4cbfc3 100644 --- a/src/ops/x86_64.rs +++ b/src/ops/x86_64.rs @@ -18,6 +18,14 @@ //! default target-feature set (`x86_64-unknown-linux-gnu` defaults to //! just SSE2). Callers must perform a runtime feature detection via //! `std::is_x86_feature_detected!` before invoking. +//! +//! The inner `unsafe { ... }` blocks below are required on Rust 1.85 +//! (the declared MSRV) because `unsafe_op_in_unsafe_fn` still applies +//! to target-feature-gated intrinsics there. Newer toolchains relax +//! this and may report the blocks as redundant — hence the +//! module-level `unused_unsafe` allow. + +#![allow(unused_unsafe)] use std::arch::x86_64::{ __m128i, _mm_blend_epi16, _mm_mul_epu32, _mm_set1_epi32, _mm_shuffle_epi32, _mm_srli_epi32, @@ -83,26 +91,30 @@ pub(crate) unsafe fn div_85_cube(input: __m128i) -> __m128i { #[inline] #[target_feature(enable = "sse4.1,ssse3")] unsafe fn div_magic(input: __m128i) -> __m128i { - let magic = _mm_set1_epi32(MAGIC as i32); - - // Even-lane multiply (touches input lanes 0 and 2). - let even = _mm_mul_epu32(input, magic); - // Shift right by 32 in u64 view to bring lanes 1 and 3 into - // the low u32 of each u64 lane, then multiply. - let input_odd = _mm_srli_epi64::<32>(input); - let odd = _mm_mul_epu32(input_odd, magic); - - // 0b11_11_01_01 picks lane 1 / lane 1 / lane 3 / lane 3 — i.e. - // broadcasts the high u32 of each u64 product across the pair. - let even_hi = _mm_shuffle_epi32::<0b11_11_01_01>(even); - let odd_hi = _mm_shuffle_epi32::<0b11_11_01_01>(odd); - - // Blend at u16 granularity: 0xCC = 0b1100_1100 picks the upper - // u16 pair (i.e. odd_hi) for lanes 1 and 3 of the u32 view, and - // the lower (even_hi) for lanes 0 and 2. - let combined = _mm_blend_epi16::<0xCC>(even_hi, odd_hi); - - _mm_srli_epi32::(combined) + // SAFETY (1.85 MSRV): caller has SSE4.1+SSSE3; all intrinsics below + // are gated on those features. + unsafe { + let magic = _mm_set1_epi32(MAGIC as i32); + + // Even-lane multiply (touches input lanes 0 and 2). + let even = _mm_mul_epu32(input, magic); + // Shift right by 32 in u64 view to bring lanes 1 and 3 into + // the low u32 of each u64 lane, then multiply. + let input_odd = _mm_srli_epi64::<32>(input); + let odd = _mm_mul_epu32(input_odd, magic); + + // 0b11_11_01_01 picks lane 1 / lane 1 / lane 3 / lane 3 — i.e. + // broadcasts the high u32 of each u64 product across the pair. + let even_hi = _mm_shuffle_epi32::<0b11_11_01_01>(even); + let odd_hi = _mm_shuffle_epi32::<0b11_11_01_01>(odd); + + // Blend at u16 granularity: 0xCC = 0b1100_1100 picks the upper + // u16 pair (i.e. odd_hi) for lanes 1 and 3 of the u32 view, and + // the lower (even_hi) for lanes 0 and 2. + let combined = _mm_blend_epi16::<0xCC>(even_hi, odd_hi); + + _mm_srli_epi32::(combined) + } } #[cfg(test)] From 34f9519004157a1115e3e0ddd19debb289edb4c4 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Sat, 2 May 2026 14:16:18 +1000 Subject: [PATCH 4/7] feat(x86): replace SSE4.1 path with AVX2 (8 blocks per call) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The x86 SIMD path now uses AVX2 throughout, processing 8 blocks per call (vs the previous SSE4.1 path's 4). 
Both 128-bit lanes of each __m256i register independently run the algorithm we developed for SSE4.1, doubling per-call throughput. AVX2's lane-restricted byte/word ops (PSHUFB, BLEND, SHUFFLE_EPI32) work fine because every step of the algorithm is per-lane anyway. # Why drop SSE4.1 entirely User direction: "if a target doesn't have avx2 it can fall back to the soft impl". AVX2 has been universal on x86 server hardware since Haswell (2013) and the bench workflow now guards on its presence (PR #8). Maintaining a third tier (AVX2 → SSE4.1 → scalar) wasn't worth the duplicated code. # What changed src/ops/x86_64.rs: - div_85, div_85_sq, div_85_cube, div_magic now take/return __m256i. - target_feature attribute changed from "sse4.1,ssse3" to "avx2". src/block.rs::sse → src/block.rs::avx2: - SseEncoder → Avx2Encoder; encode_block_x4 → encode_block_x8 (16-byte → 32-byte input, 20-byte → 40-byte output). - try_decode_block_x4 → try_decode_block_x8 (20-char → 40-char in, 16-byte → 32-byte out). - Static index tables stay 16 bytes; loaded via _mm_loadu_si128 + _mm256_broadcastsi128_si256 to populate both 128-bit lanes. - Encoder output is non-contiguous between lanes (lane 0 → out[0..20], lane 1 → out[20..40]) so we extract each 128-bit lane and the two 4-byte tails separately. Decoder output is 32 contiguous bytes so a single _mm256_storeu_si256 works. - Decoder input is non-contiguous (chars 0..20 + 20..40 with a gap at the tail boundary of each side); two _mm_loadu_si128s are combined with _mm256_set_m128i. src/lib.rs: - encode_into / decode_into runtime gate switches from is_x86_feature_detected!("sse4.1") && ...("ssse3") to just ...("avx2"). Falls back to the existing scalar paths otherwise. - Loop strides updated for the new chunk sizes (32→40 encode, 40→32 decode). # Verified locally cargo test --target x86_64-apple-darwin — 47 tests pass (Rosetta; hits the scalar fallback because Rosetta on this Mac doesn't expose AVX2) cargo test — 47 tests (aarch64 native) cargo +1.85 test --target x86_64-apple-darwin — 47 tests cargo clippy --target x86_64-apple-darwin --all-targets -- -D warnings — clean cargo +1.85 clippy --target x86_64-apple-darwin --all-targets -- -D warnings — clean cargo check --target x86_64-unknown-linux-gnu --all-targets — clean cargo fmt --all -- --check — clean CI on Ubuntu x86_64 (AMD EPYC 7763, has AVX2) will be the first run that actually exercises the new AVX2 code paths end-to-end. --- src/block.rs | 621 +++++++++++++++++++++++----------------------- src/lib.rs | 70 +++--- src/ops/x86_64.rs | 179 ++++++------- 3 files changed, 435 insertions(+), 435 deletions(-) diff --git a/src/block.rs b/src/block.rs index f1e4552..2c2040a 100644 --- a/src/block.rs +++ b/src/block.rs @@ -515,31 +515,37 @@ mod neon { } // ───────────────────────────────────────────────────────────────────────── -// SSE encoder / decoder (x86_64) +// AVX2 encoder / decoder (x86_64) // ───────────────────────────────────────────────────────────────────────── // -// SSSE3 is required for `_mm_shuffle_epi8` (the equivalent of NEON's -// `vqtbl1q_u8`); SSE4.1 is required for `_mm_mullo_epi32` (lane-wise -// u32 multiply). Both are universal on x86_64 silicon since ~2008. We -// depend on them being present at compile time via the default -// `target-feature` set on `x86_64-*-*` (which includes SSSE3 and -// SSE4.1 for all modern targets including `x86_64-unknown-linux-gnu` -// and `x86_64-apple-darwin`). 
+// Processes 8 blocks per call (32 input bytes, 40 output chars) by +// running the same SSE4.1 algorithm across both 128-bit lanes of an +// `__m256i`. AVX2's lane-restricted byte/word ops (PSHUFB, PSHUFD, +// BLEND) work fine here because the algorithm is fully per-lane — +// each lane handles 4 independent blocks. +// +// On hosts without AVX2 the crate's runtime feature gate routes to +// the scalar fallback (see `crate::encode_into_scalar` / +// `crate::decode_into_scalar`). #[cfg(target_arch = "x86_64")] -pub(crate) use sse::{SseEncoder, try_decode_block_x4}; +pub(crate) use avx2::{Avx2Encoder, try_decode_block_x8}; #[cfg(target_arch = "x86_64")] -mod sse { - //! SSSE3 + SSE4.1 encode/decode. +mod avx2 { + //! AVX2 encode/decode — 8 blocks per call. + //! + //! Both 128-bit lanes of each `__m256i` independently run the + //! algorithm we developed for SSE4.1 (4 blocks per lane, 8 total). + //! AVX2 byte/word ops are lane-restricted (PSHUFB / BLEND / + //! SHUFFLE_EPI32 / etc. operate per 128-bit half), but every + //! step of the algorithm is per-lane anyway, so that's fine. //! //! Every entry point is `unsafe fn` with - //! `#[target_feature(enable = "sse4.1,ssse3")]` so the file - //! compiles on any `x86_64-*-*` target regardless of the default - //! target-feature set. The caller (`crate::decode_into` / - //! `crate::encode_into`) does a one-shot - //! `is_x86_feature_detected!` check before invoking and falls - //! back to scalar if the host CPU lacks the features. + //! `#[target_feature(enable = "avx2")]`. The caller + //! (`crate::encode_into` / `crate::decode_into`) does a one-shot + //! `is_x86_feature_detected!("avx2")` check; on hosts lacking + //! AVX2 the crate routes to the scalar fallback. //! //! Inner `unsafe { ... }` blocks are required on Rust 1.85 (MSRV) //! for calls to target-feature-gated intrinsics inside an @@ -548,20 +554,23 @@ mod sse { //! allow. #![allow(unused_unsafe)] + #![allow(clippy::indexing_slicing)] use crate::ops::{div_85, div_85_cube, div_85_sq}; use std::arch::x86_64::{ - __m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_cmpgt_epi8, _mm_cmpgt_epi32, - _mm_extract_epi32, _mm_loadu_si128, _mm_movemask_epi8, _mm_mullo_epi32, _mm_or_si128, - _mm_set1_epi8, _mm_set1_epi32, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, - _mm_storeu_si128, _mm_sub_epi32, _mm_xor_si128, + __m128i, __m256i, _mm_loadu_si128, _mm_storeu_si128, _mm256_add_epi32, _mm256_and_si256, + _mm256_broadcastsi128_si256, _mm256_cmpeq_epi8, _mm256_cmpgt_epi8, _mm256_cmpgt_epi32, + _mm256_extract_epi32, _mm256_extracti128_si256, _mm256_loadu_si256, _mm256_movemask_epi8, + _mm256_mullo_epi32, _mm256_or_si256, _mm256_set_m128i, _mm256_set1_epi8, _mm256_set1_epi32, + _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, _mm256_storeu_si256, + _mm256_sub_epi32, _mm256_xor_si256, }; - // Index vectors for splicing the 5 digit values into out_1 (16 bytes) - // and out_2 (4 bytes). Same byte-position layout as the NEON path — - // each digit value lives in byte 0/4/8/12 of its u32 lane, and the - // 0xFF entries say "leave this output byte at zero" (PSHUFB sets - // bytes whose index has the high bit set to zero). + // Per-128-bit-lane index vectors. Loaded as 16-byte arrays from + // .rodata and broadcast to both lanes via VBROADCASTI128 — both + // halves of the 256-bit register share the same index pattern + // because each half is doing an independent 4-block computation. 
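+    // For example, IDX_OUT_1[0] sends source bytes 0/4/8/12 (digit 0 of
+    // blocks 0..3, each sitting in the low byte of its u32 lane) to output
+    // bytes 0/5/10/15; in general digit k of block i lands at output
+    // position 5·i + k.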
+ static IDX_OUT_1: [[u8; 16]; 5] = [ [ 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, 12, @@ -598,333 +607,319 @@ mod sse { ], ]; - /// PSHUFB index that reverses the bytes within each u32 lane. Used - /// to load 16 bytes as 4 big-endian u32 values (the encoder's input - /// is interpreted in network byte order). static REV32_IDX: [u8; 16] = [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]; - /// Alphabet padded to 96 bytes so we can do six 16-byte PSHUFB - /// lookups without reading past the end. Bytes 85..95 are unused - /// padding (any value is fine — they're never indexed because - /// digit values are in [0, 84]). #[rustfmt::skip] static ALPHABET_PADDED: &[u8; 96] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~\0\0\0\0\0\0\0\0\0\0\0"; - /// SSSE3 + SSE4.1 4-block encoder. Mirrors [`super::neon::NeonEncoder`]. - pub(crate) struct SseEncoder; - - impl SseEncoder { - #[inline] - pub(crate) fn new() -> Self { - Self - } - - /// Encode exactly 16 input bytes into exactly 20 output bytes. - /// - /// # Safety - /// Requires SSE4.1 + SSSE3 to be available on the host CPU. - #[inline] - #[target_feature(enable = "sse4.1,ssse3")] - pub(crate) unsafe fn encode_block_x4(&self, in_bytes: &[u8; 16], out_bytes: &mut [u8; 20]) { - // SAFETY: caller has verified SSE4.1+SSSE3; calls below to the - // user-defined `unsafe fn` helpers (div_85*, byte_to_char85_x86) - // therefore satisfy their `target_feature` preconditions, and - // the unaligned pointer write at the end is bounded by the - // `[u8; 20]` array size. - unsafe { - // Load 16 bytes and reverse each 4-byte group to get 4 BE u32s. - let raw = _mm_loadu_si128(in_bytes.as_ptr().cast::<__m128i>()); - let n = - _mm_shuffle_epi8(raw, _mm_loadu_si128(REV32_IDX.as_ptr().cast::<__m128i>())); - - // Parallel-magic divides: q1, q2, q3 from n; q4 from q2. - let q1 = div_85(n); - let q2 = div_85_sq(n); - let q3 = div_85_cube(n); - let q4 = div_85_sq(q2); - - // Five digits via lane-wise multiply-subtract: - // d0 = q4 - // d1 = q3 - q4·85 - // d2 = q2 - q3·85 - // d3 = q1 - q2·85 - // d4 = n - q1·85 - let m85 = _mm_set1_epi32(85); - let d0 = q4; - let d1 = _mm_sub_epi32(q3, _mm_mullo_epi32(q4, m85)); - let d2 = _mm_sub_epi32(q2, _mm_mullo_epi32(q3, m85)); - let d3 = _mm_sub_epi32(q1, _mm_mullo_epi32(q2, m85)); - let d4 = _mm_sub_epi32(n, _mm_mullo_epi32(q1, m85)); - - // Splice 5 digits into out_1 via PSHUFB+OR chain. - let idx0 = _mm_loadu_si128(IDX_OUT_1[0].as_ptr().cast::<__m128i>()); - let idx1 = _mm_loadu_si128(IDX_OUT_1[1].as_ptr().cast::<__m128i>()); - let idx2 = _mm_loadu_si128(IDX_OUT_1[2].as_ptr().cast::<__m128i>()); - let idx3 = _mm_loadu_si128(IDX_OUT_1[3].as_ptr().cast::<__m128i>()); - let idx4 = _mm_loadu_si128(IDX_OUT_1[4].as_ptr().cast::<__m128i>()); - let p0 = _mm_shuffle_epi8(d0, idx0); - let p1 = _mm_shuffle_epi8(d1, idx1); - let p2 = _mm_shuffle_epi8(d2, idx2); - let p3 = _mm_shuffle_epi8(d3, idx3); - let p4 = _mm_shuffle_epi8(d4, idx4); - let out_1 = - _mm_or_si128(_mm_or_si128(p0, p1), _mm_or_si128(p2, _mm_or_si128(p3, p4))); - - // Same for out_2 (4 chars from block 3, digits 1..4). 
- let i20 = _mm_loadu_si128(IDX_OUT_2[0].as_ptr().cast::<__m128i>()); - let i21 = _mm_loadu_si128(IDX_OUT_2[1].as_ptr().cast::<__m128i>()); - let i22 = _mm_loadu_si128(IDX_OUT_2[2].as_ptr().cast::<__m128i>()); - let i23 = _mm_loadu_si128(IDX_OUT_2[3].as_ptr().cast::<__m128i>()); - let q21 = _mm_shuffle_epi8(d1, i20); - let q22 = _mm_shuffle_epi8(d2, i21); - let q23 = _mm_shuffle_epi8(d3, i22); - let q24 = _mm_shuffle_epi8(d4, i23); - let out_2 = _mm_or_si128(_mm_or_si128(q21, q22), _mm_or_si128(q23, q24)); - - // Digit value (0..84) → ASCII char via PSHUFB chain. - let out_1 = byte_to_char85_x86(out_1); - let out_2 = byte_to_char85_x86(out_2); - - // Store: 16 bytes from out_1, then the low 4 bytes from out_2. - _mm_storeu_si128(out_bytes.as_mut_ptr().cast::<__m128i>(), out_1); - let lo = _mm_extract_epi32::<0>(out_2) as u32; - std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(16).cast::(), lo); - } - } - } - - /// Maps 16 lane-wise digit values (0..84) to ASCII characters. - /// - /// PSHUFB only zeros its output when **bit 7** of the index is set; - /// indices like 16..127 produce `table[index & 0x0F]` instead of 0, - /// so a naive "subtract chunk-base, then PSHUFB and OR all chunks" - /// pattern doesn't work directly. - /// - /// Instead we split each digit into a high nibble (chunk index, 0..5) - /// and a low nibble (entry within chunk, 0..15). All 6 chunk PSHUFBs - /// run unconditionally with the low nibble; the chunk-equality mask - /// (`high_nibble == N`) keeps only the correct one. This costs more - /// instructions than NEON's `vqtbl4q_u8` (which does a 64-entry - /// lookup in a single op) but avoids any PSHUFB high-bit shenanigans. - #[inline] - #[target_feature(enable = "sse4.1,ssse3")] - unsafe fn byte_to_char85_x86(x85: __m128i) -> __m128i { - // SAFETY: ALPHABET_PADDED is 96 bytes; offsets 0/16/32/48/64/80 - // each leave 16 readable bytes. Caller satisfies SSE4.1+SSSE3. - unsafe { - // _mm_srli_epi16 shifts each u16 lane → bits cross byte - // boundaries within a u16. The trailing AND 0x0F masks the - // cross-contamination so we get a clean per-byte high nibble. - let high_nib = _mm_and_si128(_mm_srli_epi16::<4>(x85), _mm_set1_epi8(0x0F)); - let low_nib = _mm_and_si128(x85, _mm_set1_epi8(0x0F)); - - let t0 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().cast::<__m128i>()); - let t1 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(16).cast::<__m128i>()); - let t2 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(32).cast::<__m128i>()); - let t3 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(48).cast::<__m128i>()); - let t4 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(64).cast::<__m128i>()); - let t5 = _mm_loadu_si128(ALPHABET_PADDED.as_ptr().add(80).cast::<__m128i>()); - - // 6 PSHUFBs — each looks up `low_nib` in its own chunk. - let r0 = _mm_shuffle_epi8(t0, low_nib); - let r1 = _mm_shuffle_epi8(t1, low_nib); - let r2 = _mm_shuffle_epi8(t2, low_nib); - let r3 = _mm_shuffle_epi8(t3, low_nib); - let r4 = _mm_shuffle_epi8(t4, low_nib); - let r5 = _mm_shuffle_epi8(t5, low_nib); - - // Mask each result by `high_nib == N` and OR them together. 
- let m0 = _mm_cmpeq_epi8(high_nib, _mm_setzero_si128()); - let m1 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(1)); - let m2 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(2)); - let m3 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(3)); - let m4 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(4)); - let m5 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(5)); - - _mm_or_si128( - _mm_or_si128( - _mm_or_si128(_mm_and_si128(r0, m0), _mm_and_si128(r1, m1)), - _mm_or_si128(_mm_and_si128(r2, m2), _mm_and_si128(r3, m3)), - ), - _mm_or_si128(_mm_and_si128(r4, m4), _mm_and_si128(r5, m5)), - ) - } - } - - /// 96-byte char→digit lookup, indexed by `char - 33`. Same data as - /// the NEON `CHAR_TO_85` + `CHAR_TO_85_X2` tables concatenated: - /// chars 33..96 then chars 97..128. Invalid in-range chars (alphabet - /// gaps) and pad slots are 0xFF so the caller can detect them via - /// `_mm_cmpeq_epi8` against 0xFF. #[rustfmt::skip] static CHAR_TO_85_PADDED: [u8; 96] = [ - // chars 33..96 (64 entries) 62, 0xFF, 63, 64, 65, 66, 0xFF, 67, 68, 69, 70, 0xFF, 71, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xFF, 72, 73, 74, 75, 76, 77, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xFF, 0xFF, 0xFF, 78, 79, 80, - // chars 97..128 (32 entries) 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 81, 82, 83, 84, 0xFF, 0xFF, ]; - /// Maps 16 ASCII chars to base85 digit values (0..84) lane-wise. - /// Chars not in the alphabet (or out of [33, 126]) come back as 0xFF. - #[inline] - #[target_feature(enable = "sse4.1,ssse3")] - unsafe fn char85_to_byte_x86(chars: __m128i) -> __m128i { - // SAFETY: caller upholds SSE4.1+SSSE3. - unsafe { - // Normalise so valid chars 33..128 map to indices 0..95. - // Out-of-range chars wrap; they'll be filtered by the caller's - // explicit range check. - let normalised = _mm_sub_epi32(chars, _mm_set1_epi8(33)); - let high_nib = _mm_and_si128(_mm_srli_epi16::<4>(normalised), _mm_set1_epi8(0x0F)); - let low_nib = _mm_and_si128(normalised, _mm_set1_epi8(0x0F)); - - let t0 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().cast::<__m128i>()); - let t1 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(16).cast::<__m128i>()); - let t2 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(32).cast::<__m128i>()); - let t3 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(48).cast::<__m128i>()); - let t4 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(64).cast::<__m128i>()); - let t5 = _mm_loadu_si128(CHAR_TO_85_PADDED.as_ptr().add(80).cast::<__m128i>()); - - let r0 = _mm_shuffle_epi8(t0, low_nib); - let r1 = _mm_shuffle_epi8(t1, low_nib); - let r2 = _mm_shuffle_epi8(t2, low_nib); - let r3 = _mm_shuffle_epi8(t3, low_nib); - let r4 = _mm_shuffle_epi8(t4, low_nib); - let r5 = _mm_shuffle_epi8(t5, low_nib); - - let m0 = _mm_cmpeq_epi8(high_nib, _mm_setzero_si128()); - let m1 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(1)); - let m2 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(2)); - let m3 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(3)); - let m4 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(4)); - let m5 = _mm_cmpeq_epi8(high_nib, _mm_set1_epi8(5)); - - // For high_nib >= 6 (i.e. char > 128 + offset), no chunk mask - // matches and the OR-result is zero. We rely on the caller's - // range check to flag those — zero would otherwise alias with - // legitimate digit value 0 ('0'). 
- _mm_or_si128( - _mm_or_si128( - _mm_or_si128(_mm_and_si128(r0, m0), _mm_and_si128(r1, m1)), - _mm_or_si128(_mm_and_si128(r2, m2), _mm_and_si128(r3, m3)), - ), - _mm_or_si128(_mm_and_si128(r4, m4), _mm_and_si128(r5, m5)), - ) - } - } - - /// Per-position permutation indexes for the decoder. Each `IDX_DK_*` - /// extracts byte k of each 5-byte block into a u32x4 where lane i - /// holds digit k of block i (in the low byte of the lane). - /// `_MAIN` indexes pull from `digits_main` (chars 0..15) and `_TAIL` - /// from `digits_tail` (chars 16..19, padded). Both run via PSHUFB, - /// and the two results are ORed. #[rustfmt::skip] static IDX_DEC: [([u8; 16], [u8; 16]); 5] = [ - // d0: chars 0, 5, 10, 15 → all from main ( [0, 0xFF, 0xFF, 0xFF, 5, 0xFF, 0xFF, 0xFF, 10, 0xFF, 0xFF, 0xFF, 15, 0xFF, 0xFF, 0xFF], [0xFF; 16], ), - // d1: chars 1, 6, 11 main + 16 (tail[0]) ( [1, 0xFF, 0xFF, 0xFF, 6, 0xFF, 0xFF, 0xFF, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF], ), - // d2: chars 2, 7, 12 main + 17 (tail[1]) ( [2, 0xFF, 0xFF, 0xFF, 7, 0xFF, 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1, 0xFF, 0xFF, 0xFF], ), - // d3: chars 3, 8, 13 main + 18 (tail[2]) ( [3, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 0xFF, 0xFF, 0xFF], ), - // d4: chars 4, 9, 14 main + 19 (tail[3]) ( [4, 0xFF, 0xFF, 0xFF, 9, 0xFF, 0xFF, 0xFF, 14, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 3, 0xFF, 0xFF, 0xFF], ), ]; - /// Mask: 0xFF for the first 4 bytes (the real input chars 16..19), - /// 0x00 for the rest (zero-padding in `chars_tail`). static TAIL_VALID_MASK: [u8; 16] = [0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - /// PSHUFB index that byte-swaps each u32 lane (BE store after the - /// Horner chain produces native-endian u32s). - static REV32_DEC_IDX: [u8; 16] = REV32_IDX; + /// Loads a 16-byte index from .rodata and broadcasts it to both + /// 128-bit lanes of an `__m256i`. + /// + /// # Safety + /// `ptr` must be valid for a 16-byte read. Caller has AVX2. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn broadcast16(ptr: *const u8) -> __m256i { + // SAFETY: caller upholds the read-validity precondition. + unsafe { _mm256_broadcastsi128_si256(_mm_loadu_si128(ptr.cast::<__m128i>())) } + } + + /// AVX2 8-block encoder. Mirrors the structure of NEON's + /// `NeonEncoder` but processes 8 input blocks (32 bytes → 40 chars) + /// per call. + pub(crate) struct Avx2Encoder; + + impl Avx2Encoder { + #[inline] + pub(crate) fn new() -> Self { + Self + } + + /// Encode exactly 32 input bytes into exactly 40 output bytes. + /// + /// # Safety + /// Requires AVX2 to be available on the host CPU. + #[inline] + #[target_feature(enable = "avx2")] + pub(crate) unsafe fn encode_block_x8(&self, in_bytes: &[u8; 32], out_bytes: &mut [u8; 40]) { + // SAFETY: AVX2 verified by caller; user-defined unsafe fns + // (div_85*, byte_to_char85_avx2) inherit the target_feature. + // Pointer writes at the end land within the [u8; 40] array. + unsafe { + // Load 32 bytes (8 × 4-byte blocks) and reverse each + // 4-byte group to get 8 BE u32s. 
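+                // (On little-endian x86 a 4-byte group [b0, b1, b2, b3]
+                // loads as the u32 b3b2b1b0; the per-lane byte reverse
+                // restores the network-order value b0b1b2b3 that the
+                // scalar encoder uses.)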
+ let raw = _mm256_loadu_si256(in_bytes.as_ptr().cast::<__m256i>()); + let n = _mm256_shuffle_epi8(raw, broadcast16(REV32_IDX.as_ptr())); + + // Parallel-magic divides — 8 lanes wide. + let q1 = div_85(n); + let q2 = div_85_sq(n); + let q3 = div_85_cube(n); + let q4 = div_85_sq(q2); + + // Five digits per block via lane-wise multiply-subtract. + let m85 = _mm256_set1_epi32(85); + let d0 = q4; + let d1 = _mm256_sub_epi32(q3, _mm256_mullo_epi32(q4, m85)); + let d2 = _mm256_sub_epi32(q2, _mm256_mullo_epi32(q3, m85)); + let d3 = _mm256_sub_epi32(q1, _mm256_mullo_epi32(q2, m85)); + let d4 = _mm256_sub_epi32(n, _mm256_mullo_epi32(q1, m85)); + + // Splice 5 digits into out_1 (32 bytes; 16 per lane — + // each lane holds the first 16 chars of its 4-block group). + // PSHUFB is per-lane; the same index broadcast to both + // lanes does the right thing. + let idx0 = broadcast16(IDX_OUT_1[0].as_ptr()); + let idx1 = broadcast16(IDX_OUT_1[1].as_ptr()); + let idx2 = broadcast16(IDX_OUT_1[2].as_ptr()); + let idx3 = broadcast16(IDX_OUT_1[3].as_ptr()); + let idx4 = broadcast16(IDX_OUT_1[4].as_ptr()); + let p0 = _mm256_shuffle_epi8(d0, idx0); + let p1 = _mm256_shuffle_epi8(d1, idx1); + let p2 = _mm256_shuffle_epi8(d2, idx2); + let p3 = _mm256_shuffle_epi8(d3, idx3); + let p4 = _mm256_shuffle_epi8(d4, idx4); + let out_1 = _mm256_or_si256( + _mm256_or_si256(p0, p1), + _mm256_or_si256(p2, _mm256_or_si256(p3, p4)), + ); + + // Same for out_2 (8 bytes total; 4 per lane — the + // trailing 4 chars of each 4-block group). + let i20 = broadcast16(IDX_OUT_2[0].as_ptr()); + let i21 = broadcast16(IDX_OUT_2[1].as_ptr()); + let i22 = broadcast16(IDX_OUT_2[2].as_ptr()); + let i23 = broadcast16(IDX_OUT_2[3].as_ptr()); + let q21 = _mm256_shuffle_epi8(d1, i20); + let q22 = _mm256_shuffle_epi8(d2, i21); + let q23 = _mm256_shuffle_epi8(d3, i22); + let q24 = _mm256_shuffle_epi8(d4, i23); + let out_2 = _mm256_or_si256(_mm256_or_si256(q21, q22), _mm256_or_si256(q23, q24)); + + // Digit value (0..84) → ASCII char. + let out_1 = byte_to_char85_avx2(out_1); + let out_2 = byte_to_char85_avx2(out_2); + + // Store. Output layout is non-contiguous between lanes: + // bytes 0..16 — lane 0's 16-char main + // bytes 16..20 — lane 0's 4-char tail + // bytes 20..36 — lane 1's 16-char main + // bytes 36..40 — lane 1's 4-char tail + let lo_main = _mm256_extracti128_si256::<0>(out_1); + let hi_main = _mm256_extracti128_si256::<1>(out_1); + let lo_tail = _mm256_extract_epi32::<0>(out_2) as u32; + let hi_tail = _mm256_extract_epi32::<4>(out_2) as u32; + + _mm_storeu_si128(out_bytes.as_mut_ptr().cast::<__m128i>(), lo_main); + std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(16).cast::(), lo_tail); + _mm_storeu_si128(out_bytes.as_mut_ptr().add(20).cast::<__m128i>(), hi_main); + std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(36).cast::(), hi_tail); + } + } + } + + /// Maps 32 lane-wise digit values (0..84) to ASCII characters. + /// + /// Same chunk-index-selection trick as the SSE port (PSHUFB only + /// zeros on bit-7-set indexes), just doubled. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn byte_to_char85_avx2(x85: __m256i) -> __m256i { + // SAFETY: ALPHABET_PADDED is 96 bytes; the six 16-byte windows + // we read are all in-bounds. Caller has AVX2. 
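+        // Worked example: digit 84 = 0x54 has high nibble 5 and low
+        // nibble 4; chunk t5 covers ALPHABET_PADDED[80..96], and entry 4
+        // of that chunk is table byte 84 = b'~', the last character of
+        // the alphabet.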
+ unsafe { + let high_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(x85), _mm256_set1_epi8(0x0F)); + let low_nib = _mm256_and_si256(x85, _mm256_set1_epi8(0x0F)); + + let t0 = broadcast16(ALPHABET_PADDED.as_ptr()); + let t1 = broadcast16(ALPHABET_PADDED.as_ptr().add(16)); + let t2 = broadcast16(ALPHABET_PADDED.as_ptr().add(32)); + let t3 = broadcast16(ALPHABET_PADDED.as_ptr().add(48)); + let t4 = broadcast16(ALPHABET_PADDED.as_ptr().add(64)); + let t5 = broadcast16(ALPHABET_PADDED.as_ptr().add(80)); + + let r0 = _mm256_shuffle_epi8(t0, low_nib); + let r1 = _mm256_shuffle_epi8(t1, low_nib); + let r2 = _mm256_shuffle_epi8(t2, low_nib); + let r3 = _mm256_shuffle_epi8(t3, low_nib); + let r4 = _mm256_shuffle_epi8(t4, low_nib); + let r5 = _mm256_shuffle_epi8(t5, low_nib); + + let m0 = _mm256_cmpeq_epi8(high_nib, _mm256_setzero_si256()); + let m1 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(1)); + let m2 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(2)); + let m3 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(3)); + let m4 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(4)); + let m5 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(5)); + + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256(_mm256_and_si256(r0, m0), _mm256_and_si256(r1, m1)), + _mm256_or_si256(_mm256_and_si256(r2, m2), _mm256_and_si256(r3, m3)), + ), + _mm256_or_si256(_mm256_and_si256(r4, m4), _mm256_and_si256(r5, m5)), + ) + } + } - /// SSSE3 + SSE4.1 4-block decoder. Mirrors [`super::neon::try_decode_block_x4`]. + /// Maps 32 ASCII chars to base85 digit values (0..84) lane-wise. + /// Chars not in the alphabet (or out of [33, 126]) come back as 0xFF. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn char85_to_byte_avx2(chars: __m256i) -> __m256i { + // SAFETY: caller has AVX2; CHAR_TO_85_PADDED is 96 bytes with + // six 16-byte readable windows. + unsafe { + let normalised = _mm256_sub_epi32(chars, _mm256_set1_epi8(33)); + let high_nib = + _mm256_and_si256(_mm256_srli_epi16::<4>(normalised), _mm256_set1_epi8(0x0F)); + let low_nib = _mm256_and_si256(normalised, _mm256_set1_epi8(0x0F)); + + let t0 = broadcast16(CHAR_TO_85_PADDED.as_ptr()); + let t1 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(16)); + let t2 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(32)); + let t3 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(48)); + let t4 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(64)); + let t5 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(80)); + + let r0 = _mm256_shuffle_epi8(t0, low_nib); + let r1 = _mm256_shuffle_epi8(t1, low_nib); + let r2 = _mm256_shuffle_epi8(t2, low_nib); + let r3 = _mm256_shuffle_epi8(t3, low_nib); + let r4 = _mm256_shuffle_epi8(t4, low_nib); + let r5 = _mm256_shuffle_epi8(t5, low_nib); + + let m0 = _mm256_cmpeq_epi8(high_nib, _mm256_setzero_si256()); + let m1 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(1)); + let m2 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(2)); + let m3 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(3)); + let m4 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(4)); + let m5 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(5)); + + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256(_mm256_and_si256(r0, m0), _mm256_and_si256(r1, m1)), + _mm256_or_si256(_mm256_and_si256(r2, m2), _mm256_and_si256(r3, m3)), + ), + _mm256_or_si256(_mm256_and_si256(r4, m4), _mm256_and_si256(r5, m5)), + ) + } + } + + /// AVX2 8-block decoder. 40 chars → 32 bytes. 
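+    /// Each 128-bit lane decodes 4 blocks: lane 0 takes chars 0..20,
+    /// lane 1 takes chars 20..40.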
/// /// Returns `Err(())` (so the caller falls back to the scalar path, /// which will surface a precise [`crate::DecodeError`]) when: /// - any input byte is outside the base85 alphabet, or - /// - any of the four 5-char blocks decodes to a value > `u32::MAX`. + /// - any of the eight 5-char blocks decodes to a value > `u32::MAX`. /// /// # Safety - /// Requires SSE4.1 + SSSE3 on the host CPU. + /// Requires AVX2 on the host CPU. #[inline] - #[target_feature(enable = "sse4.1,ssse3")] - pub(crate) unsafe fn try_decode_block_x4( - input: &[u8; 20], - out: &mut [u8; 16], + #[target_feature(enable = "avx2")] + pub(crate) unsafe fn try_decode_block_x8( + input: &[u8; 40], + out: &mut [u8; 32], ) -> Result<(), ()> { - // SAFETY: caller upholds SSE4.1+SSSE3. + // SAFETY: caller has AVX2. unsafe { - // Load 16 chars + 4 tail chars (zero-padded to 16). - let chars_main = _mm_loadu_si128(input.as_ptr().cast::<__m128i>()); - let mut tail_buf = [0u8; 16]; - tail_buf[..4].copy_from_slice(&input[16..20]); - let chars_tail = _mm_loadu_si128(tail_buf.as_ptr().cast::<__m128i>()); - let tail_valid_mask = _mm_loadu_si128(TAIL_VALID_MASK.as_ptr().cast::<__m128i>()); - - // Range validation: char must be in [33, 126]. - // _mm_cmpgt_epi8 is signed but values 33..126 fit in i8 [33, 126]. - let too_low_main = _mm_cmpgt_epi8(_mm_set1_epi8(33), chars_main); - let too_high_main = _mm_cmpgt_epi8(chars_main, _mm_set1_epi8(126)); - let invalid_range_main = _mm_or_si128(too_low_main, too_high_main); - - let too_low_tail = _mm_cmpgt_epi8(_mm_set1_epi8(33), chars_tail); - let too_high_tail = _mm_cmpgt_epi8(chars_tail, _mm_set1_epi8(126)); - let invalid_range_tail = - _mm_and_si128(_mm_or_si128(too_low_tail, too_high_tail), tail_valid_mask); - - // ASCII → digit lookup (returns 0xFF for in-range invalid chars). - let digits_main = char85_to_byte_x86(chars_main); - let digits_tail = char85_to_byte_x86(chars_tail); - - let invalid_dig_main = _mm_cmpeq_epi8(digits_main, _mm_set1_epi8(0xFF_u8 as i8)); - let invalid_dig_tail = _mm_and_si128( - _mm_cmpeq_epi8(digits_tail, _mm_set1_epi8(0xFF_u8 as i8)), + // Layout the 40 chars into the 256-bit register so each + // 128-bit lane handles 4 blocks. Lane 0 ← chars 0..15 + // (first 16 chars of blocks 0..3); lane 1 ← chars 20..35 + // (first 16 chars of blocks 4..7). The 4-char tails (chars + // 16..19 and 36..39) go into a separate "chars_tail" with + // 12 zero-padding bytes per lane. + let chars_main_lo = _mm_loadu_si128(input.as_ptr().cast::<__m128i>()); + let chars_main_hi = _mm_loadu_si128(input.as_ptr().add(20).cast::<__m128i>()); + let chars_main = _mm256_set_m128i(chars_main_hi, chars_main_lo); + + let mut tail_buf_lo = [0u8; 16]; + let mut tail_buf_hi = [0u8; 16]; + tail_buf_lo[..4].copy_from_slice(&input[16..20]); + tail_buf_hi[..4].copy_from_slice(&input[36..40]); + let chars_tail = _mm256_set_m128i( + _mm_loadu_si128(tail_buf_hi.as_ptr().cast::<__m128i>()), + _mm_loadu_si128(tail_buf_lo.as_ptr().cast::<__m128i>()), + ); + let tail_valid_mask = broadcast16(TAIL_VALID_MASK.as_ptr()); + + // Range validation: chars must be in [33, 126]. 
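+            // _mm256_cmpgt_epi8 is a signed compare, but 33..126 fits in
+            // i8, and any byte >= 0x80 reads as negative and is therefore
+            // still caught by the too-low check.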
+ let too_low_main = _mm256_cmpgt_epi8(_mm256_set1_epi8(33), chars_main); + let too_high_main = _mm256_cmpgt_epi8(chars_main, _mm256_set1_epi8(126)); + let invalid_range_main = _mm256_or_si256(too_low_main, too_high_main); + + let too_low_tail = _mm256_cmpgt_epi8(_mm256_set1_epi8(33), chars_tail); + let too_high_tail = _mm256_cmpgt_epi8(chars_tail, _mm256_set1_epi8(126)); + let invalid_range_tail = _mm256_and_si256( + _mm256_or_si256(too_low_tail, too_high_tail), + tail_valid_mask, + ); + + // ASCII → digit lookup (0xFF for in-range invalid chars). + let digits_main = char85_to_byte_avx2(chars_main); + let digits_tail = char85_to_byte_avx2(chars_tail); + + let invalid_dig_main = _mm256_cmpeq_epi8(digits_main, _mm256_set1_epi8(0xFF_u8 as i8)); + let invalid_dig_tail = _mm256_and_si256( + _mm256_cmpeq_epi8(digits_tail, _mm256_set1_epi8(0xFF_u8 as i8)), tail_valid_mask, ); - let any_invalid = _mm_or_si128( - _mm_or_si128(invalid_range_main, invalid_range_tail), - _mm_or_si128(invalid_dig_main, invalid_dig_tail), + let any_invalid = _mm256_or_si256( + _mm256_or_si256(invalid_range_main, invalid_range_tail), + _mm256_or_si256(invalid_dig_main, invalid_dig_tail), ); - // _mm_movemask_epi8 returns the high bit of each byte as a 16-bit mask. - // Non-zero ⇒ at least one invalid byte. - if _mm_movemask_epi8(any_invalid) != 0 { + // _mm256_movemask_epi8 returns a 32-bit mask (one bit per + // byte of the 256-bit register). + if _mm256_movemask_epi8(any_invalid) != 0 { return Err(()); } - // Permute digits into per-position u32x4 vectors. PSHUFB only - // takes one source vector, so each position needs two PSHUFBs - // (main + tail) ORed together. - let v_dk = |idx_main: &[u8; 16], idx_tail: &[u8; 16]| -> __m128i { - let im = _mm_loadu_si128(idx_main.as_ptr().cast::<__m128i>()); - let it = _mm_loadu_si128(idx_tail.as_ptr().cast::<__m128i>()); - _mm_or_si128( - _mm_shuffle_epi8(digits_main, im), - _mm_shuffle_epi8(digits_tail, it), + // Permute digits into per-position vectors. Same indexes + // as the SSE path, broadcast to both lanes. + let v_dk = |idx_main: &[u8; 16], idx_tail: &[u8; 16]| -> __m256i { + let im = broadcast16(idx_main.as_ptr()); + let it = broadcast16(idx_tail.as_ptr()); + _mm256_or_si256( + _mm256_shuffle_epi8(digits_main, im), + _mm256_shuffle_epi8(digits_tail, it), ) }; let v_d0 = v_dk(&IDX_DEC[0].0, &IDX_DEC[0].1); @@ -934,29 +929,27 @@ mod sse { let v_d4 = v_dk(&IDX_DEC[4].0, &IDX_DEC[4].1); // Horner: n = ((((d0·85 + d1)·85 + d2)·85 + d3)·85 + d4) - let m85 = _mm_set1_epi32(85); - let n = _mm_add_epi32(v_d1, _mm_mullo_epi32(v_d0, m85)); - let n = _mm_add_epi32(v_d2, _mm_mullo_epi32(n, m85)); - let n = _mm_add_epi32(v_d3, _mm_mullo_epi32(n, m85)); - let n = _mm_add_epi32(v_d4, _mm_mullo_epi32(n, m85)); - - // Overflow detection: n_wrap < d0 · 85⁴ ⇒ overflow occurred. - // SSE doesn't have an unsigned cmplt; bias-and-signed-cmpgt - // is the standard trick. - let d0_85_4 = _mm_mullo_epi32(v_d0, _mm_set1_epi32(52_200_625)); - let bias = _mm_set1_epi32(0x80000000_u32 as i32); - let n_b = _mm_xor_si128(n, bias); - let d0_b = _mm_xor_si128(d0_85_4, bias); - // overflow_mask lane = 0xFFFFFFFF if d0_85_4 > n (unsigned). 
- let overflow_mask = _mm_cmpgt_epi32(d0_b, n_b); - if _mm_movemask_epi8(overflow_mask) != 0 { + let m85 = _mm256_set1_epi32(85); + let n = _mm256_add_epi32(v_d1, _mm256_mullo_epi32(v_d0, m85)); + let n = _mm256_add_epi32(v_d2, _mm256_mullo_epi32(n, m85)); + let n = _mm256_add_epi32(v_d3, _mm256_mullo_epi32(n, m85)); + let n = _mm256_add_epi32(v_d4, _mm256_mullo_epi32(n, m85)); + + // Overflow detection (bias-and-signed-cmpgt). + let d0_85_4 = _mm256_mullo_epi32(v_d0, _mm256_set1_epi32(52_200_625)); + let bias = _mm256_set1_epi32(0x80000000_u32 as i32); + let n_b = _mm256_xor_si256(n, bias); + let d0_b = _mm256_xor_si256(d0_85_4, bias); + let overflow_mask = _mm256_cmpgt_epi32(d0_b, n_b); + if _mm256_movemask_epi8(overflow_mask) != 0 { return Err(()); } - // Byte-swap each u32 lane to BE and store. - let bytes_be = - _mm_shuffle_epi8(n, _mm_loadu_si128(REV32_DEC_IDX.as_ptr().cast::<__m128i>())); - _mm_storeu_si128(out.as_mut_ptr().cast::<__m128i>(), bytes_be); + // Byte-swap each u32 lane to BE and store. Output is 32 + // contiguous bytes (16 from each lane), so a single + // _mm256_storeu_si256 works. + let bytes_be = _mm256_shuffle_epi8(n, broadcast16(REV32_IDX.as_ptr())); + _mm256_storeu_si256(out.as_mut_ptr().cast::<__m256i>(), bytes_be); Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index 9c2318f..e297ce5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -198,33 +198,34 @@ fn encode_into(input: &[u8], out: &mut [u8]) { #[cfg(target_arch = "x86_64")] fn encode_into(input: &[u8], out: &mut [u8]) { - use block::SseEncoder; - - // Runtime feature gate: the SSE intrinsics carry - // `#[target_feature(enable = "sse4.1,ssse3")]`, so calling them on - // a CPU without those features is UB. Detect once at the entry - // point and route to the scalar fallback otherwise. This branch - // predicts perfectly so the cost is negligible. - if !(std::is_x86_feature_detected!("sse4.1") && std::is_x86_feature_detected!("ssse3")) { + use block::Avx2Encoder; + + // Runtime feature gate: the AVX2 intrinsics are gated on + // `#[target_feature(enable = "avx2")]`; calling them on a CPU + // lacking AVX2 is UB. Detect once at the entry point and fall + // back to the scalar path otherwise. The branch predicts + // perfectly across calls so the cost is negligible. + if !std::is_x86_feature_detected!("avx2") { encode_into_scalar(input, out); return; } - let encoder = SseEncoder::new(); + let encoder = Avx2Encoder::new(); let mut in_off = 0usize; let mut out_off = 0usize; - while in_off.saturating_add(16) <= input.len() { - let Some(in_chunk) = chunk_at::<16>(input, in_off) else { + // 32 → 40 byte AVX2 chunks (8 blocks per iteration). + while in_off.saturating_add(32) <= input.len() { + let Some(in_chunk) = chunk_at::<32>(input, in_off) else { return; }; - let Some(out_chunk) = chunk_at_mut::<20>(out, out_off) else { + let Some(out_chunk) = chunk_at_mut::<40>(out, out_off) else { return; }; - // SAFETY: we verified SSE4.1 + SSSE3 above. - unsafe { encoder.encode_block_x4(in_chunk, out_chunk) }; - in_off = in_off.saturating_add(16); - out_off = out_off.saturating_add(20); + // SAFETY: AVX2 verified above. + unsafe { encoder.encode_block_x8(in_chunk, out_chunk) }; + in_off = in_off.saturating_add(32); + out_off = out_off.saturating_add(40); } while in_off.saturating_add(4) <= input.len() { @@ -251,8 +252,8 @@ fn encode_into(input: &[u8], out: &mut [u8]) { } } -/// Pure-scalar fallback used by the x86 path on CPUs lacking -/// SSE4.1 + SSSE3 (e.g. very old Pentium 4-class hardware). 
+/// Pure-scalar fallback used by the x86 path on CPUs lacking AVX2 +/// (rare on x86 server hardware after ~2013, but possible). #[cfg(target_arch = "x86_64")] fn encode_into_scalar(input: &[u8], out: &mut [u8]) { let mut in_off = 0usize; @@ -383,33 +384,35 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { reason = "infallible / in-bounds by loop-condition invariant; see SAFETY comment" )] fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { - use block::try_decode_block_x4; + use block::try_decode_block_x8; - // Runtime feature gate: the SSE intrinsics are gated on - // `target_feature(enable = "sse4.1,ssse3")`; calling them on a CPU - // lacking those features is UB. Detect once at the entry point. - if !(std::is_x86_feature_detected!("sse4.1") && std::is_x86_feature_detected!("ssse3")) { + // Runtime feature gate: AVX2 needed for the fast path; otherwise + // route to scalar fallback. + if !std::is_x86_feature_detected!("avx2") { return decode_into_scalar(input, out); } let mut in_off = 0usize; let mut out_off = 0usize; - while in_off + 20 <= input.len() && out_off + 16 <= out.len() { - let Some(in_chunk) = chunk_at::<20>(input, in_off) else { + // 40-char (8-block) AVX2 chunks. On per-chunk failure (invalid + // char or u32 overflow), replay the same chunk scalar so the + // resulting DecodeError carries the precise failing byte position. + while in_off + 40 <= input.len() && out_off + 32 <= out.len() { + let Some(in_chunk) = chunk_at::<40>(input, in_off) else { return Ok(()); }; - let Some(out_chunk) = chunk_at_mut::<16>(out, out_off) else { + let Some(out_chunk) = chunk_at_mut::<32>(out, out_off) else { return Ok(()); }; - // SAFETY: SSE4.1 + SSSE3 verified above. - let sse_ok = unsafe { try_decode_block_x4(in_chunk, out_chunk) }.is_ok(); - if sse_ok { - in_off += 20; - out_off += 16; + // SAFETY: AVX2 verified above. + let avx_ok = unsafe { try_decode_block_x8(in_chunk, out_chunk) }.is_ok(); + if avx_ok { + in_off += 40; + out_off += 32; } else { - for _ in 0..4 { + for _ in 0..8 { let block: &[u8; 5] = (&input[in_off..in_off + 5]).try_into().unwrap(); let chunk: &mut [u8; 4] = (&mut out[out_off..out_off + 4]).try_into().unwrap(); decode_block(block, chunk, in_off)?; @@ -441,8 +444,7 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { Ok(()) } -/// Pure-scalar fallback used by the x86 decode path on CPUs lacking -/// SSE4.1 + SSSE3. +/// Pure-scalar fallback used by the x86 decode path on CPUs lacking AVX2. #[cfg(target_arch = "x86_64")] fn decode_into_scalar(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { let mut in_off = 0usize; diff --git a/src/ops/x86_64.rs b/src/ops/x86_64.rs index f4cbfc3..a90efb8 100644 --- a/src/ops/x86_64.rs +++ b/src/ops/x86_64.rs @@ -1,119 +1,116 @@ -//! x86_64 SSSE3 / SSE4.1 helpers — the structural mirror of `aarch64.rs`. +//! x86_64 AVX2 helpers — structural mirror of `aarch64.rs`, doubled. //! -//! The magic constants are the same (libdivide form, computed from the -//! divisor); only the SIMD intrinsics differ. Mapping cheat-sheet: +//! Every helper operates on a `__m256i` carrying **eight** u32 lanes +//! (vs the four-lane `uint32x4_t` of NEON). The lower 128-bit half +//! handles blocks 0..3 and the upper half handles blocks 4..7; +//! AVX2's lane-restricted byte/word ops (PSHUFB, BLEND, SHUFFLE) +//! work fine because the algorithm is fully per-lane — each 128-bit +//! half runs an independent copy of the SSE4.1 algorithm we replaced. //! -//! 
| NEON | x86 (SSE2 / SSSE3 / SSE4.1) | what it does | -//! |------------------------|------------------------------|-------------------------------------------| -//! | `vmull_u32` | `_mm_mul_epu32` | u32×u32 → u64 widening (2 lanes per call) | -//! | `vshrq_n_u32` | `_mm_srli_epi32` | logical shift right per u32 lane | -//! | `vmulq_n_u32` | `_mm_mullo_epi32` (SSE4.1) | u32×u32 → u32 (low 32 bits) | -//! | `vsubq_u32` | `_mm_sub_epi32` | u32 lane-wise subtract | -//! | `vqtbl1q_u8` | `_mm_shuffle_epi8` (SSSE3) | byte-shuffle / lookup | -//! | `vbslq_u8` | `_mm_blendv_epi8` (SSE4.1) | byte-wise blend with mask | -//! | `vrev32q_u8` | `_mm_shuffle_epi8` + idx | reverse bytes in each u32 lane | +//! The magic constants are the same libdivide values used on aarch64; +//! only the SIMD intrinsics differ. AVX2 instructions used: //! -//! All SIMD entry points carry `#[target_feature(enable = "sse4.1,ssse3")]` +//! | NEON | AVX2 | what it does | +//! |------------------------|-------------------------------|-------------------------------------| +//! | `vmull_u32` | `_mm256_mul_epu32` | u32×u32 → u64 widening (4 lanes) | +//! | `vshrq_n_u32` | `_mm256_srli_epi32` | logical shift right per u32 lane | +//! | `vmulq_n_u32` | `_mm256_mullo_epi32` | u32×u32 → u32 (low 32 bits, 8 lanes)| +//! | `vsubq_u32` | `_mm256_sub_epi32` | u32 lane-wise subtract | +//! | `vqtbl1q_u8` | `_mm256_shuffle_epi8` | byte-shuffle, **per 128-bit lane** | +//! | `vrev32q_u8` | `_mm256_shuffle_epi8` + idx | reverse bytes in each u32 lane | +//! +//! All SIMD entry points carry `#[target_feature(enable = "avx2")]` //! so the file compiles on any `x86_64-*-*` target regardless of the -//! default target-feature set (`x86_64-unknown-linux-gnu` defaults to -//! just SSE2). Callers must perform a runtime feature detection via -//! `std::is_x86_feature_detected!` before invoking. +//! default target-feature set. Callers must perform a runtime +//! `is_x86_feature_detected!("avx2")` check before invoking; on hosts +//! lacking AVX2 the crate routes to the scalar fallback. //! -//! The inner `unsafe { ... }` blocks below are required on Rust 1.85 -//! (the declared MSRV) because `unsafe_op_in_unsafe_fn` still applies -//! to target-feature-gated intrinsics there. Newer toolchains relax -//! this and may report the blocks as redundant — hence the +//! Inner `unsafe { ... }` blocks are required on Rust 1.85 (MSRV) for +//! calls to target-feature-gated intrinsics inside an `unsafe fn`. +//! Newer toolchains may flag those blocks as redundant, hence the //! module-level `unused_unsafe` allow. #![allow(unused_unsafe)] use std::arch::x86_64::{ - __m128i, _mm_blend_epi16, _mm_mul_epu32, _mm_set1_epi32, _mm_shuffle_epi32, _mm_srli_epi32, - _mm_srli_epi64, + __m256i, _mm256_blend_epi16, _mm256_mul_epu32, _mm256_set1_epi32, _mm256_shuffle_epi32, + _mm256_srli_epi32, _mm256_srli_epi64, }; -/// Lane-wise `input / 85`. Magic constants verified for the full u32 range. +/// Lane-wise `input / 85` (8 u32 lanes). Magic verified for full u32 range. /// /// # Safety -/// Requires SSE4.1 + SSSE3 to be available on the host CPU. +/// Requires AVX2 to be available on the host CPU. #[inline] -#[target_feature(enable = "sse4.1,ssse3")] -pub(crate) unsafe fn div_85(input: __m128i) -> __m128i { +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn div_85(input: __m256i) -> __m256i { // m = ceil(2^38 / 85) = 3_233_857_729; m * 85 - 2^38 = 21 < 2^6. - // SAFETY: caller upholds SSE4.1+SSSE3 (function-level target_feature). 
+ // SAFETY: caller upholds AVX2 (function-level target_feature). unsafe { div_magic::<3_233_857_729, 6>(input) } } /// Lane-wise `input / 7225` (i.e. `input / 85²`). Valid for all u32 inputs. /// /// # Safety -/// Requires SSE4.1 + SSSE3 to be available on the host CPU. +/// Requires AVX2 to be available on the host CPU. #[inline] -#[target_feature(enable = "sse4.1,ssse3")] -pub(crate) unsafe fn div_85_sq(input: __m128i) -> __m128i { +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn div_85_sq(input: __m256i) -> __m256i { // m = ceil(2^44 / 7225) = 2_434_904_643; m * 7225 - 2^44 = 1259 < 2^12. - // SAFETY: caller upholds SSE4.1+SSSE3. + // SAFETY: caller upholds AVX2. unsafe { div_magic::<2_434_904_643, 12>(input) } } /// Lane-wise `input / 614125` (i.e. `input / 85³`). Valid for all u32 inputs. /// /// # Safety -/// Requires SSE4.1 + SSSE3 to be available on the host CPU. +/// Requires AVX2 to be available on the host CPU. #[inline] -#[target_feature(enable = "sse4.1,ssse3")] -pub(crate) unsafe fn div_85_cube(input: __m128i) -> __m128i { +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn div_85_cube(input: __m256i) -> __m256i { // m = ceil(2^51 / 614125) = 3_666_679_933; bound check passes for u32. - // SAFETY: caller upholds SSE4.1+SSSE3. + // SAFETY: caller upholds AVX2. unsafe { div_magic::<3_666_679_933, 19>(input) } } -/// Generic libdivide-style div: `q = (n * MAGIC) >> (32 + SHIFT)`. -/// -/// `_mm_mul_epu32` multiplies the **low u32 of each u64 lane**, so it -/// only touches input lanes 0 and 2 per call. We do two multiplies — -/// one as-is (for even lanes 0, 2), one after `_mm_srli_epi64::<32>` -/// (which moves odd lanes 1, 3 into the low u32 of each u64 lane). -/// -/// After the two multiplies we have: -/// even = [(in[0]·m)lo, (in[0]·m)hi, (in[2]·m)lo, (in[2]·m)hi] -/// odd = [(in[1]·m)lo, (in[1]·m)hi, (in[3]·m)lo, (in[3]·m)hi] +/// Generic libdivide-style div over 8 u32 lanes: +/// `q = (n * MAGIC) >> (32 + SHIFT)`. /// -/// We want u32x4 = [(in[k]·m)>>32 for k=0..3] = the four `hi` lanes. -/// One `_mm_shuffle_epi32::<0xF5>` per side broadcasts the `hi` u32 -/// across each u64 half (so even_hi = [hi0, hi0, hi2, hi2] etc.), and -/// `_mm_blend_epi16::<0xCC>` interleaves them into [hi0, hi1, hi2, hi3]. -/// Finally `_mm_srli_epi32::` applies the libdivide shift. +/// Same algorithm as the SSE4.1 version it replaces, just doubled. +/// `_mm256_mul_epu32` multiplies the low u32 of each u64 lane (so it +/// touches input lanes 0, 2, 4, 6 per call); we do two multiplies — +/// one as-is, one after `_mm256_srli_epi64::<32>` — and combine the +/// high u32 of each u64 product via `_mm256_shuffle_epi32::<0xF5>` + +/// `_mm256_blend_epi16::<0xCC>`. Both of those are lane-restricted ops +/// (per 128-bit half), but our blend pattern lives entirely within each +/// half so that's fine. /// /// # Safety -/// Requires SSE4.1 + SSSE3 (the function-level `target_feature` -/// satisfies this for any caller that has done the runtime check). +/// Requires AVX2 (the function-level `target_feature` satisfies this +/// for any caller that has done the runtime check). #[inline] -#[target_feature(enable = "sse4.1,ssse3")] -unsafe fn div_magic(input: __m128i) -> __m128i { - // SAFETY (1.85 MSRV): caller has SSE4.1+SSSE3; all intrinsics below - // are gated on those features. +#[target_feature(enable = "avx2")] +unsafe fn div_magic(input: __m256i) -> __m256i { + // SAFETY (1.85 MSRV): caller has AVX2; all intrinsics below are + // gated on AVX2 (or its prerequisites). 
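+    // Worked example with the div_85 constants (MAGIC = 0xC0C0_C0C1,
+    // SHIFT = 6): n = u32::MAX gives (n·MAGIC) >> 38 = 50_529_027,
+    // which is exactly u32::MAX / 85.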
unsafe { - let magic = _mm_set1_epi32(MAGIC as i32); - - // Even-lane multiply (touches input lanes 0 and 2). - let even = _mm_mul_epu32(input, magic); - // Shift right by 32 in u64 view to bring lanes 1 and 3 into - // the low u32 of each u64 lane, then multiply. - let input_odd = _mm_srli_epi64::<32>(input); - let odd = _mm_mul_epu32(input_odd, magic); - - // 0b11_11_01_01 picks lane 1 / lane 1 / lane 3 / lane 3 — i.e. - // broadcasts the high u32 of each u64 product across the pair. - let even_hi = _mm_shuffle_epi32::<0b11_11_01_01>(even); - let odd_hi = _mm_shuffle_epi32::<0b11_11_01_01>(odd); - - // Blend at u16 granularity: 0xCC = 0b1100_1100 picks the upper - // u16 pair (i.e. odd_hi) for lanes 1 and 3 of the u32 view, and - // the lower (even_hi) for lanes 0 and 2. - let combined = _mm_blend_epi16::<0xCC>(even_hi, odd_hi); - - _mm_srli_epi32::(combined) + let magic = _mm256_set1_epi32(MAGIC as i32); + + let even = _mm256_mul_epu32(input, magic); + let input_odd = _mm256_srli_epi64::<32>(input); + let odd = _mm256_mul_epu32(input_odd, magic); + + // Per-128-bit-lane shuffle: broadcasts hi u32 of each u64 + // product across the pair, in each lane independently. + let even_hi = _mm256_shuffle_epi32::<0b11_11_01_01>(even); + let odd_hi = _mm256_shuffle_epi32::<0b11_11_01_01>(odd); + + // Per-lane blend at u16 granularity. The 8-bit MASK is applied + // to each 128-bit lane, so 0xCC interleaves [hi0, hi1, hi2, hi3] + // within each lane independently. + let combined = _mm256_blend_epi16::<0xCC>(even_hi, odd_hi); + + _mm256_srli_epi32::(combined) } } @@ -122,12 +119,12 @@ mod tests { use super::*; use quickcheck::{Arbitrary, quickcheck}; use std::{ - arch::x86_64::{_mm_loadu_si128, _mm_storeu_si128}, + arch::x86_64::{_mm256_loadu_si256, _mm256_storeu_si256}, array, }; #[derive(Debug, Clone, Copy)] - struct InputBlock([u32; 4]); + struct InputBlock([u32; 8]); impl Arbitrary for InputBlock { fn arbitrary(g: &mut quickcheck::Gen) -> Self { @@ -135,41 +132,49 @@ mod tests { } } - fn lanes(v: __m128i) -> [u32; 4] { - let mut out = [0u32; 4]; - unsafe { _mm_storeu_si128(out.as_mut_ptr().cast::<__m128i>(), v) }; + fn lanes(v: __m256i) -> [u32; 8] { + let mut out = [0u32; 8]; + unsafe { _mm256_storeu_si256(out.as_mut_ptr().cast::<__m256i>(), v) }; out } - fn load(input: &[u32; 4]) -> __m128i { - unsafe { _mm_loadu_si128(input.as_ptr().cast::<__m128i>()) } + fn load(input: &[u32; 8]) -> __m256i { + unsafe { _mm256_loadu_si256(input.as_ptr().cast::<__m256i>()) } + } + + fn host_has_avx2() -> bool { + std::is_x86_feature_detected!("avx2") } quickcheck! 
{ fn div_85_matches_scalar(block: InputBlock) -> bool { + if !host_has_avx2() { return true; } let InputBlock(input) = block; let q_out = lanes(unsafe { div_85(load(&input)) }); - (0..4).all(|i| q_out[i] == input[i] / 85) + (0..8).all(|i| q_out[i] == input[i] / 85) } fn div_85_sq_matches_scalar(block: InputBlock) -> bool { + if !host_has_avx2() { return true; } let InputBlock(input) = block; let q_out = lanes(unsafe { div_85_sq(load(&input)) }); - (0..4).all(|i| q_out[i] == input[i] / (85 * 85)) + (0..8).all(|i| q_out[i] == input[i] / (85 * 85)) } fn div_85_cube_matches_scalar(block: InputBlock) -> bool { + if !host_has_avx2() { return true; } let InputBlock(input) = block; let q_out = lanes(unsafe { div_85_cube(load(&input)) }); - (0..4).all(|i| q_out[i] == input[i] / (85 * 85 * 85)) + (0..8).all(|i| q_out[i] == input[i] / (85 * 85 * 85)) } fn div_85_to_the_4_via_composition(block: InputBlock) -> bool { + if !host_has_avx2() { return true; } let InputBlock(input) = block; let q2 = unsafe { div_85_sq(load(&input)) }; let q4 = unsafe { div_85_sq(q2) }; let q4_out = lanes(q4); - (0..4).all(|i| q4_out[i] == input[i] / (85u32.pow(4))) + (0..8).all(|i| q4_out[i] == input[i] / (85u32.pow(4))) } } } From 0abb7f4ad4c21376ac1686c6f346738e64450243 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Sat, 2 May 2026 14:38:44 +1000 Subject: [PATCH 5/7] docs: README updated with x86 AVX2 numbers + arch summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Splits the benchmarks section into per-architecture tables (aarch64 NEON / x86_64 AVX2) with the steady-state numbers from the GHA Ubuntu runner. Also rewords the top-of-README description so it mentions both SIMD targets up front instead of just NEON. Steady-state at ≥ 256 B: aarch64 NEON: 4.40 GiB/s encode (1.61× ref), 4.49 GiB/s decode (10.5× ref) x86_64 AVX2: 2.07 GiB/s encode (2.71× ref), 2.32 GiB/s decode (9.6× ref) NEON sustains roughly 2× the absolute throughput of AVX2 because its `vqtbl4q_u8` does a 64-entry table lookup in one instruction, where PSHUFB is limited to 16 entries (so the alphabet lookup expands to ~6 PSHUFB+OR per chunk on x86). AVX-512 VBMI's `vpermb` would close that gap but isn't on the GHA fleet. --- README.md | 121 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 82 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index c8286d6..93bef02 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,11 @@ Fast Base85 (RFC 1924 / Z85-style) encoder and decoder for Rust. -On `aarch64` the encode path uses NEON intrinsics to process 16 input bytes -per iteration; the decode path and the fallback for other architectures use -a portable scalar implementation. Output is byte-for-byte compatible with -the [`base85`](https://crates.io/crates/base85) crate. +SIMD-accelerated on aarch64 (NEON, 4 blocks per iteration) and x86_64 +(AVX2, 8 blocks per iteration), with a portable scalar fallback for +everything else and for x86_64 hosts lacking AVX2 (rare on server +hardware after ~2013). Output is byte-for-byte compatible with the +[`base85`](https://crates.io/crates/base85) crate. ## Usage @@ -33,7 +34,10 @@ for decoding. ## Status - Public API: `encode(&[u8]) -> String`, `decode(&str) -> Result, DecodeError>`. -- Encode and decode are both NEON-accelerated on `aarch64`, scalar elsewhere. +- aarch64: NEON-accelerated (4 blocks at a time, always available on aarch64). +- x86_64 with AVX2: AVX2-accelerated (8 blocks at a time). 
Runtime feature + detection at the public API entry — hosts without AVX2 fall back to scalar. +- Other architectures: portable scalar implementation. - The decode path validates char range and detects `u32` overflow lane-wise; any invalid input falls back to the scalar path so the resulting `DecodeError` carries a precise byte position. @@ -74,43 +78,82 @@ strictly more diagnostic information. Code that pattern-matches on ## Benchmarks -Apple M-series (aarch64), `cargo bench --bench encode`, criterion, release -profile, single-threaded. Times are the criterion-reported median; throughput -is computed from the median. - -### Encode - -| size | `base85` time | `base85-simd` time | speedup | `base85-simd` throughput | -|-------|--------------:|-------------------:|--------:|-------------------------:| -| 16 B | 17.4 ns | 16.3 ns | 1.07× | ~940 MiB/s | -| 64 B | 33.0 ns | 22.6 ns | 1.46× | 2.71 GiB/s | -| 256 B | 115.7 ns | 73.2 ns | 1.58× | 3.26 GiB/s | -| 1 KiB | 378.3 ns | 225.1 ns | 1.68× | 4.24 GiB/s | -| 16 KiB| 5.56 µs | 3.60 µs | 1.55× | 4.24 GiB/s | -| 256 KiB| 89.3 µs | 55.4 µs | 1.61× | 4.41 GiB/s | -| 1 MiB | 356 µs | 222 µs | 1.61× | 4.40 GiB/s | - -### Decode - -| size | `base85` time | `base85-simd` time | speedup | `base85-simd` throughput | -|-------|--------------:|-------------------:|--------:|-------------------------:| -| 16 B | 32.4 ns | 14.8 ns | 2.18× | ~1.0 GiB/s | -| 64 B | 123.5 ns | 24.6 ns | 5.02× | 2.42 GiB/s | -| 256 B | 579 ns | 57.6 ns | 10.05× | 4.14 GiB/s | -| 1 KiB | 2.27 µs | 226 ns | 10.06× | 4.22 GiB/s | -| 16 KiB| 36.8 µs | 3.49 µs | 10.55× | 4.38 GiB/s | -| 256 KiB| 591 µs | 54.3 µs | 10.89× | 4.50 GiB/s | -| 1 MiB | 2.28 ms | 217.6 µs | 10.49× | 4.49 GiB/s | +`cargo bench --bench encode`, criterion, release profile, single-threaded. +Times are the criterion-reported median; throughput computed from it. + +### aarch64 (Apple M-series) + +#### Encode + +| size | `base85` | `base85-simd` (NEON) | speedup | throughput | +|-------|---------:|---------------------:|--------:|-----------:| +| 16 B | 17.4 ns | 16.3 ns | 1.07× | ~940 MiB/s | +| 64 B | 33.0 ns | 22.6 ns | 1.46× | 2.71 GiB/s | +| 256 B | 115.7 ns | 73.2 ns | 1.58× | 3.26 GiB/s | +| 1 KiB | 378.3 ns | 225.1 ns | 1.68× | 4.24 GiB/s | +| 16 KiB| 5.56 µs | 3.60 µs | 1.55× | 4.24 GiB/s | +| 256 KiB| 89.3 µs | 55.4 µs | 1.61× | 4.41 GiB/s | +| 1 MiB | 356 µs | 222 µs | 1.61× | 4.40 GiB/s | + +#### Decode + +| size | `base85` | `base85-simd` (NEON) | speedup | throughput | +|-------|---------:|---------------------:|--------:|-----------:| +| 16 B | 32.4 ns | 14.8 ns | 2.18× | ~1.0 GiB/s | +| 64 B | 123.5 ns | 24.6 ns | 5.02× | 2.42 GiB/s | +| 256 B | 579 ns | 57.6 ns | 10.05× | 4.14 GiB/s | +| 1 KiB | 2.27 µs | 226 ns | 10.06× | 4.22 GiB/s | +| 16 KiB| 36.8 µs | 3.49 µs | 10.55× | 4.38 GiB/s | +| 256 KiB| 591 µs | 54.3 µs | 10.89× | 4.50 GiB/s | +| 1 MiB | 2.28 ms | 217.6 µs | 10.49× | 4.49 GiB/s | + +### x86_64 (AMD EPYC 7763, Zen 3) + +Numbers from a GitHub Actions hosted Ubuntu runner — shared/virtualised +hardware so noise is higher than aarch64 (~5–15% variance), but the +relative speedups are stable. 
+ +#### Encode + +| size | `base85` | `base85-simd` (AVX2) | speedup | throughput | +|-------|---------:|---------------------:|--------:|-----------:| +| 16 B | 41.0 ns | 57.8 ns | 0.71× | (scalar fallback; chunk doesn't fit) | +| 64 B | 100.5 ns | 61.7 ns | 1.63× | 989 MiB/s | +| 256 B | 341.9 ns | 165.9 ns | 2.06× | 1.44 GiB/s | +| 1 KiB | 1.32 µs | 507.0 ns | 2.61× | 1.88 GiB/s | +| 16 KiB| 20.4 µs | 7.48 µs | 2.72× | 2.04 GiB/s | +| 256 KiB| 323.9 µs| 118.9 µs | 2.72× | 2.05 GiB/s | +| 1 MiB | 1.31 ms | 482.9 µs | 2.71× | 2.07 GiB/s | + +#### Decode + +| size | `base85` | `base85-simd` (AVX2) | speedup | throughput | +|-------|---------:|---------------------:|--------:|-----------:| +| 16 B | 70.2 ns | 61.2 ns | 1.15× | (scalar fallback) | +| 64 B | 244.8 ns | 67.6 ns | 3.62× | 903 MiB/s | +| 256 B | 1.046 µs | 164.4 ns | 6.36× | 1.45 GiB/s | +| 1 KiB | 4.19 µs | 466.5 ns | 8.98× | 2.04 GiB/s | +| 16 KiB| 65.7 µs | 6.75 µs | 9.73× | 2.26 GiB/s | +| 256 KiB| 1.058 ms| 107.2 µs | 9.86× | 2.28 GiB/s | +| 1 MiB | 4.14 ms | 431.6 µs | 9.59× | 2.32 GiB/s | ### Steady-state summary -At sizes large enough to amortise loop setup (≥ 256 B), `base85-simd` -sustains **~4.4 GiB/s** for both encode and decode on Apple M-series, -roughly **1.6× faster** than the reference for encode and **~10× -faster** for decode. The decode advantage comes from `TBL`-based ASCII -→ digit lookup that replaces the reference's per-character branchy -match; the encode advantage comes from 4-lane parallel `divmod 85` -plus `TBX`-based digit → ASCII / output permutation. +At sizes large enough to amortise the SIMD loop setup (≥ 256 B): + +| arch / ISA | encode throughput | encode speedup | decode throughput | decode speedup | +|-------------|------------------:|---------------:|------------------:|---------------:| +| aarch64 NEON| 4.40 GiB/s | 1.61× | 4.49 GiB/s | 10.49× | +| x86_64 AVX2 | 2.07 GiB/s | 2.71× | 2.32 GiB/s | 9.59× | + +The decode speedup ratio is roughly the same on both architectures +(~10×), driven by SIMD-accelerated ASCII → digit table lookup +replacing the reference's per-character branchy match. NEON sustains +roughly 2× the absolute throughput of AVX2 because its `vqtbl4q_u8` +does a 64-entry lookup in a single instruction, where x86 PSHUFB is +limited to 16 entries (so the lookup expands to ~6 PSHUFB+OR per +chunk on x86). AVX-512 VBMI's `vpermb` would close that gap but +isn't available on the AMD silicon used by GitHub's runner fleet. Reproduce with: From 73f1be6f9f22edfbf4c35644bab495ed6475ce54 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Sat, 2 May 2026 18:49:02 +1000 Subject: [PATCH 6/7] =?UTF-8?q?refactor:=20address=20PR=20#6=20review=20?= =?UTF-8?q?=E2=80=94=20drop=20redundant=20unsafe=20wrappers,=20consolidate?= =?UTF-8?q?=20scalar=20paths,=20hoist=20shared=20tables?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AVX2 module + ops::x86_64 use `#![allow(unsafe_op_in_unsafe_fn)]` at module scope, dropping every inner `unsafe { … }` block. Same code compiles on Rust 1.85 (MSRV) and on toolchains that would otherwise flag the wrappers as unused. Mirrors the comment thread on PR #6. - Delete duplicated quickcheck tests in `ops::aarch64` and `ops::x86_64`. `tests/base85_parity.rs` round-trips through these on every iteration, and the dedicated tests added zero unique coverage (and required a runtime AVX2 skip that we'd rather not have). 
- Collapse `encode_into_scalar` / `decode_into_scalar` (x86 fallback) and the `not(any(aarch64, x86_64))` `encode_into` / `decode_into` into a single `encode_scalar` / `decode_scalar` gated on `not(aarch64)`. The non-aarch64-non-x86_64 arm becomes a one-line wrapper. - Hoist `IDX_OUT_1`, `IDX_OUT_2`, and `TAIL_VALID_MASK` to file scope in `block.rs` so NEON and AVX2 share one definition each. `REV32_IDX` and `IDX_DEC` stay in `mod avx2` — their shape is x86-specific (NEON uses `vrev32q_u8` and a 32-byte `vqtbl2q_u8` source instead). No behavioural change. Verified: cargo fmt, cargo clippy --all-targets, cargo test (24 lib + 18 parity + 1 doctest) on stable + 1.85, both aarch64-apple-darwin and x86_64-apple-darwin. --- src/block.rs | 612 ++++++++++++++++++++------------------------- src/lib.rs | 82 ++---- src/ops/aarch64.rs | 63 +---- src/ops/x86_64.rs | 124 +++------ 4 files changed, 330 insertions(+), 551 deletions(-) diff --git a/src/block.rs b/src/block.rs index 2c2040a..096e473 100644 --- a/src/block.rs +++ b/src/block.rs @@ -150,6 +150,48 @@ pub(crate) fn decode_tail( Ok(()) } +// ───────────────────────────────────────────────────────────────────────── +// SIMD shared tables (used by both NEON and AVX2 paths) +// ───────────────────────────────────────────────────────────────────────── + +// Per-digit splice indexes for the 4-block encode shuffle. Output position +// of digit k of block i is `5*i + k`; positions ≥ 16 spill into out_2 (with +// offset −16). Each v_dk is a u32x4 with the digit in byte 0/4/8/12 of its +// lane, hence the source offsets `4*i`. Both NEON (vqtbx1q_u8) and AVX2 +// (PSHUFB, broadcast to both 128-bit halves) consume the identical patterns. + +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] +#[rustfmt::skip] +pub(super) static IDX_OUT_1: [[u8; 16]; 5] = [ + // k=0: out_1[0,5,10,15] ← src bytes 0,4,8,12 + [0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, 12], + // k=1: out_1[1,6,11] ← 0,4,8 (block 3 → out_2) + [0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF], + // k=2: out_1[2,7,12] ← 0,4,8 + [0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF], + // k=3: out_1[3,8,13] ← 0,4,8 + [0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF], + // k=4: out_1[4,9,14] ← 0,4,8 + [0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF], +]; + +// Indexes for digits 1..4 of block 3 (src byte 12) spilling into out_2. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] +#[rustfmt::skip] +pub(super) static IDX_OUT_2: [[u8; 16]; 4] = [ + [12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], + [0xFF, 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF], +]; + +// Mask used by the SIMD decoders: 0xFF for the 4 real-input lanes of the +// padded tail register, 0 for the 12 padding lanes. Lets us suppress +// validation noise from the padding bytes. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] +pub(super) static TAIL_VALID_MASK: [u8; 16] = + [0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + // ───────────────────────────────────────────────────────────────────────── // NEON encoder (aarch64) // ───────────────────────────────────────────────────────────────────────── @@ -172,52 +214,7 @@ mod neon { vreinterpretq_u8_u32, vreinterpretq_u32_u8, vrev32q_u8, vst1q_u8, vsubq_u8, }; - // Per-digit splice indexes, computed at compile time and loaded - // from .rodata each call. Output position of digit k of block i is - // `5*i + k`; positions ≥ 16 spill into out_2 (with offset −16). - // Each v_dk is a u32x4 with the digit in byte 0/4/8/12 of its lane, - // hence the source offsets `4*i`. - static IDX_OUT_1: [[u8; 16]; 5] = [ - // k=0: out_1[0,5,10,15] ← src bytes 0,4,8,12 - [ - 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, 12, - ], - // k=1: out_1[1,6,11] ← 0,4,8 (block 3 → out_2) - [ - 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, - ], - // k=2: out_1[2,7,12] ← 0,4,8 - [ - 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, - ], - // k=3: out_1[3,8,13] ← 0,4,8 - [ - 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, - ], - // k=4: out_1[4,9,14] ← 0,4,8 - [ - 0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, - ], - ]; - // Indexes for digits 1..4 spilling into out_2 (block 3, src byte 12). - static IDX_OUT_2: [[u8; 16]; 4] = [ - [ - 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - [ - 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - [ - 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - [ - 0xFF, 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - ]; + use super::{IDX_OUT_1, IDX_OUT_2, TAIL_VALID_MASK}; /// Encodes 16 input bytes into 20 output characters using NEON. /// @@ -339,10 +336,6 @@ mod neon { tail_buf[..4].copy_from_slice(&input[16..20]); let chars_tail = unsafe { vld1q_u8(tail_buf.as_ptr()) }; - // Mask: 0xFF for the 4 real-input lanes of `chars_tail`, 0 for the - // 12 padding lanes — used to suppress validation noise from padding. - static TAIL_VALID_MASK: [u8; 16] = - [0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let tail_valid_mask = unsafe { vld1q_u8(TAIL_VALID_MASK.as_ptr()) }; // Char range validation: any byte not in [33, 126] is invalid. @@ -547,13 +540,14 @@ mod avx2 { //! `is_x86_feature_detected!("avx2")` check; on hosts lacking //! AVX2 the crate routes to the scalar fallback. //! - //! Inner `unsafe { ... }` blocks are required on Rust 1.85 (MSRV) - //! for calls to target-feature-gated intrinsics inside an - //! `unsafe fn`. Newer toolchains relax this and may flag those - //! blocks as redundant — hence the module-level `unused_unsafe` - //! allow. - - #![allow(unused_unsafe)] + //! `#![allow(unsafe_op_in_unsafe_fn)]` lets each `unsafe fn` body + //! act as an implicit unsafe scope so individual intrinsic calls + //! don't need their own `unsafe { … }` wrapper. Without it, Rust + //! 1.85 (MSRV) would require those wrappers; newer toolchains + //! would then flag them as `unused_unsafe`. The override side- + //! steps both ends. 
+ + #![allow(unsafe_op_in_unsafe_fn)] #![allow(clippy::indexing_slicing)] use crate::ops::{div_85, div_85_cube, div_85_sq}; @@ -566,47 +560,13 @@ mod avx2 { _mm256_sub_epi32, _mm256_xor_si256, }; - // Per-128-bit-lane index vectors. Loaded as 16-byte arrays from - // .rodata and broadcast to both lanes via VBROADCASTI128 — both - // halves of the 256-bit register share the same index pattern - // because each half is doing an independent 4-block computation. - - static IDX_OUT_1: [[u8; 16]; 5] = [ - [ - 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, 12, - ], - [ - 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, 0xFF, - ], - [ - 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, 0xFF, - ], - [ - 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, 0xFF, - ], - [ - 0xFF, 0xFF, 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 4, 0xFF, 0xFF, 0xFF, 0xFF, 8, 0xFF, - ], - ]; - static IDX_OUT_2: [[u8; 16]; 4] = [ - [ - 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - [ - 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - [ - 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - [ - 0xFF, 0xFF, 0xFF, 12, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, - ], - ]; + use super::{IDX_OUT_1, IDX_OUT_2, TAIL_VALID_MASK}; + // x86-only: NEON has `vrev32q_u8` for the byte-swap (no table needed), + // but PSHUFB needs an explicit shuffle index. Same for the per-position + // decode index pattern below — its shape is x86-specific (single 16-byte + // PSHUFB index per digit), where the NEON path uses `vqtbl2q_u8` over a + // 32-byte source and a different index layout. static REV32_IDX: [u8; 16] = [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]; #[rustfmt::skip] @@ -647,8 +607,6 @@ mod avx2 { ), ]; - static TAIL_VALID_MASK: [u8; 16] = [0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - /// Loads a 16-byte index from .rodata and broadcasts it to both /// 128-bit lanes of an `__m256i`. /// @@ -657,8 +615,7 @@ mod avx2 { #[inline] #[target_feature(enable = "avx2")] unsafe fn broadcast16(ptr: *const u8) -> __m256i { - // SAFETY: caller upholds the read-validity precondition. - unsafe { _mm256_broadcastsi128_si256(_mm_loadu_si128(ptr.cast::<__m128i>())) } + _mm256_broadcastsi128_si256(_mm_loadu_si128(ptr.cast::<__m128i>())) } /// AVX2 8-block encoder. Mirrors the structure of NEON's @@ -679,79 +636,74 @@ mod avx2 { #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn encode_block_x8(&self, in_bytes: &[u8; 32], out_bytes: &mut [u8; 40]) { - // SAFETY: AVX2 verified by caller; user-defined unsafe fns - // (div_85*, byte_to_char85_avx2) inherit the target_feature. - // Pointer writes at the end land within the [u8; 40] array. - unsafe { - // Load 32 bytes (8 × 4-byte blocks) and reverse each - // 4-byte group to get 8 BE u32s. - let raw = _mm256_loadu_si256(in_bytes.as_ptr().cast::<__m256i>()); - let n = _mm256_shuffle_epi8(raw, broadcast16(REV32_IDX.as_ptr())); - - // Parallel-magic divides — 8 lanes wide. - let q1 = div_85(n); - let q2 = div_85_sq(n); - let q3 = div_85_cube(n); - let q4 = div_85_sq(q2); - - // Five digits per block via lane-wise multiply-subtract. 
- let m85 = _mm256_set1_epi32(85); - let d0 = q4; - let d1 = _mm256_sub_epi32(q3, _mm256_mullo_epi32(q4, m85)); - let d2 = _mm256_sub_epi32(q2, _mm256_mullo_epi32(q3, m85)); - let d3 = _mm256_sub_epi32(q1, _mm256_mullo_epi32(q2, m85)); - let d4 = _mm256_sub_epi32(n, _mm256_mullo_epi32(q1, m85)); - - // Splice 5 digits into out_1 (32 bytes; 16 per lane — - // each lane holds the first 16 chars of its 4-block group). - // PSHUFB is per-lane; the same index broadcast to both - // lanes does the right thing. - let idx0 = broadcast16(IDX_OUT_1[0].as_ptr()); - let idx1 = broadcast16(IDX_OUT_1[1].as_ptr()); - let idx2 = broadcast16(IDX_OUT_1[2].as_ptr()); - let idx3 = broadcast16(IDX_OUT_1[3].as_ptr()); - let idx4 = broadcast16(IDX_OUT_1[4].as_ptr()); - let p0 = _mm256_shuffle_epi8(d0, idx0); - let p1 = _mm256_shuffle_epi8(d1, idx1); - let p2 = _mm256_shuffle_epi8(d2, idx2); - let p3 = _mm256_shuffle_epi8(d3, idx3); - let p4 = _mm256_shuffle_epi8(d4, idx4); - let out_1 = _mm256_or_si256( - _mm256_or_si256(p0, p1), - _mm256_or_si256(p2, _mm256_or_si256(p3, p4)), - ); - - // Same for out_2 (8 bytes total; 4 per lane — the - // trailing 4 chars of each 4-block group). - let i20 = broadcast16(IDX_OUT_2[0].as_ptr()); - let i21 = broadcast16(IDX_OUT_2[1].as_ptr()); - let i22 = broadcast16(IDX_OUT_2[2].as_ptr()); - let i23 = broadcast16(IDX_OUT_2[3].as_ptr()); - let q21 = _mm256_shuffle_epi8(d1, i20); - let q22 = _mm256_shuffle_epi8(d2, i21); - let q23 = _mm256_shuffle_epi8(d3, i22); - let q24 = _mm256_shuffle_epi8(d4, i23); - let out_2 = _mm256_or_si256(_mm256_or_si256(q21, q22), _mm256_or_si256(q23, q24)); - - // Digit value (0..84) → ASCII char. - let out_1 = byte_to_char85_avx2(out_1); - let out_2 = byte_to_char85_avx2(out_2); - - // Store. Output layout is non-contiguous between lanes: - // bytes 0..16 — lane 0's 16-char main - // bytes 16..20 — lane 0's 4-char tail - // bytes 20..36 — lane 1's 16-char main - // bytes 36..40 — lane 1's 4-char tail - let lo_main = _mm256_extracti128_si256::<0>(out_1); - let hi_main = _mm256_extracti128_si256::<1>(out_1); - let lo_tail = _mm256_extract_epi32::<0>(out_2) as u32; - let hi_tail = _mm256_extract_epi32::<4>(out_2) as u32; - - _mm_storeu_si128(out_bytes.as_mut_ptr().cast::<__m128i>(), lo_main); - std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(16).cast::(), lo_tail); - _mm_storeu_si128(out_bytes.as_mut_ptr().add(20).cast::<__m128i>(), hi_main); - std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(36).cast::(), hi_tail); - } + // Load 32 bytes (8 × 4-byte blocks) and reverse each 4-byte + // group to get 8 BE u32s. + let raw = _mm256_loadu_si256(in_bytes.as_ptr().cast::<__m256i>()); + let n = _mm256_shuffle_epi8(raw, broadcast16(REV32_IDX.as_ptr())); + + // Parallel-magic divides — 8 lanes wide. + let q1 = div_85(n); + let q2 = div_85_sq(n); + let q3 = div_85_cube(n); + let q4 = div_85_sq(q2); + + // Five digits per block via lane-wise multiply-subtract. + let m85 = _mm256_set1_epi32(85); + let d0 = q4; + let d1 = _mm256_sub_epi32(q3, _mm256_mullo_epi32(q4, m85)); + let d2 = _mm256_sub_epi32(q2, _mm256_mullo_epi32(q3, m85)); + let d3 = _mm256_sub_epi32(q1, _mm256_mullo_epi32(q2, m85)); + let d4 = _mm256_sub_epi32(n, _mm256_mullo_epi32(q1, m85)); + + // Splice 5 digits into out_1 (32 bytes; 16 per lane — each + // lane holds the first 16 chars of its 4-block group). + // PSHUFB is per-lane; the same index broadcast to both lanes + // does the right thing. 
+ let idx0 = broadcast16(IDX_OUT_1[0].as_ptr()); + let idx1 = broadcast16(IDX_OUT_1[1].as_ptr()); + let idx2 = broadcast16(IDX_OUT_1[2].as_ptr()); + let idx3 = broadcast16(IDX_OUT_1[3].as_ptr()); + let idx4 = broadcast16(IDX_OUT_1[4].as_ptr()); + let p0 = _mm256_shuffle_epi8(d0, idx0); + let p1 = _mm256_shuffle_epi8(d1, idx1); + let p2 = _mm256_shuffle_epi8(d2, idx2); + let p3 = _mm256_shuffle_epi8(d3, idx3); + let p4 = _mm256_shuffle_epi8(d4, idx4); + let out_1 = _mm256_or_si256( + _mm256_or_si256(p0, p1), + _mm256_or_si256(p2, _mm256_or_si256(p3, p4)), + ); + + // Same for out_2 (8 bytes total; 4 per lane — the trailing + // 4 chars of each 4-block group). + let i20 = broadcast16(IDX_OUT_2[0].as_ptr()); + let i21 = broadcast16(IDX_OUT_2[1].as_ptr()); + let i22 = broadcast16(IDX_OUT_2[2].as_ptr()); + let i23 = broadcast16(IDX_OUT_2[3].as_ptr()); + let q21 = _mm256_shuffle_epi8(d1, i20); + let q22 = _mm256_shuffle_epi8(d2, i21); + let q23 = _mm256_shuffle_epi8(d3, i22); + let q24 = _mm256_shuffle_epi8(d4, i23); + let out_2 = _mm256_or_si256(_mm256_or_si256(q21, q22), _mm256_or_si256(q23, q24)); + + // Digit value (0..84) → ASCII char. + let out_1 = byte_to_char85_avx2(out_1); + let out_2 = byte_to_char85_avx2(out_2); + + // Store. Output layout is non-contiguous between lanes: + // bytes 0..16 — lane 0's 16-char main + // bytes 16..20 — lane 0's 4-char tail + // bytes 20..36 — lane 1's 16-char main + // bytes 36..40 — lane 1's 4-char tail + let lo_main = _mm256_extracti128_si256::<0>(out_1); + let hi_main = _mm256_extracti128_si256::<1>(out_1); + let lo_tail = _mm256_extract_epi32::<0>(out_2) as u32; + let hi_tail = _mm256_extract_epi32::<4>(out_2) as u32; + + _mm_storeu_si128(out_bytes.as_mut_ptr().cast::<__m128i>(), lo_main); + std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(16).cast::(), lo_tail); + _mm_storeu_si128(out_bytes.as_mut_ptr().add(20).cast::<__m128i>(), hi_main); + std::ptr::write_unaligned(out_bytes.as_mut_ptr().add(36).cast::(), hi_tail); } } @@ -762,41 +714,37 @@ mod avx2 { #[inline] #[target_feature(enable = "avx2")] unsafe fn byte_to_char85_avx2(x85: __m256i) -> __m256i { - // SAFETY: ALPHABET_PADDED is 96 bytes; the six 16-byte windows - // we read are all in-bounds. Caller has AVX2. 
- unsafe { - let high_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(x85), _mm256_set1_epi8(0x0F)); - let low_nib = _mm256_and_si256(x85, _mm256_set1_epi8(0x0F)); - - let t0 = broadcast16(ALPHABET_PADDED.as_ptr()); - let t1 = broadcast16(ALPHABET_PADDED.as_ptr().add(16)); - let t2 = broadcast16(ALPHABET_PADDED.as_ptr().add(32)); - let t3 = broadcast16(ALPHABET_PADDED.as_ptr().add(48)); - let t4 = broadcast16(ALPHABET_PADDED.as_ptr().add(64)); - let t5 = broadcast16(ALPHABET_PADDED.as_ptr().add(80)); - - let r0 = _mm256_shuffle_epi8(t0, low_nib); - let r1 = _mm256_shuffle_epi8(t1, low_nib); - let r2 = _mm256_shuffle_epi8(t2, low_nib); - let r3 = _mm256_shuffle_epi8(t3, low_nib); - let r4 = _mm256_shuffle_epi8(t4, low_nib); - let r5 = _mm256_shuffle_epi8(t5, low_nib); - - let m0 = _mm256_cmpeq_epi8(high_nib, _mm256_setzero_si256()); - let m1 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(1)); - let m2 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(2)); - let m3 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(3)); - let m4 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(4)); - let m5 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(5)); - + let high_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(x85), _mm256_set1_epi8(0x0F)); + let low_nib = _mm256_and_si256(x85, _mm256_set1_epi8(0x0F)); + + let t0 = broadcast16(ALPHABET_PADDED.as_ptr()); + let t1 = broadcast16(ALPHABET_PADDED.as_ptr().add(16)); + let t2 = broadcast16(ALPHABET_PADDED.as_ptr().add(32)); + let t3 = broadcast16(ALPHABET_PADDED.as_ptr().add(48)); + let t4 = broadcast16(ALPHABET_PADDED.as_ptr().add(64)); + let t5 = broadcast16(ALPHABET_PADDED.as_ptr().add(80)); + + let r0 = _mm256_shuffle_epi8(t0, low_nib); + let r1 = _mm256_shuffle_epi8(t1, low_nib); + let r2 = _mm256_shuffle_epi8(t2, low_nib); + let r3 = _mm256_shuffle_epi8(t3, low_nib); + let r4 = _mm256_shuffle_epi8(t4, low_nib); + let r5 = _mm256_shuffle_epi8(t5, low_nib); + + let m0 = _mm256_cmpeq_epi8(high_nib, _mm256_setzero_si256()); + let m1 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(1)); + let m2 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(2)); + let m3 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(3)); + let m4 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(4)); + let m5 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(5)); + + _mm256_or_si256( _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256(_mm256_and_si256(r0, m0), _mm256_and_si256(r1, m1)), - _mm256_or_si256(_mm256_and_si256(r2, m2), _mm256_and_si256(r3, m3)), - ), - _mm256_or_si256(_mm256_and_si256(r4, m4), _mm256_and_si256(r5, m5)), - ) - } + _mm256_or_si256(_mm256_and_si256(r0, m0), _mm256_and_si256(r1, m1)), + _mm256_or_si256(_mm256_and_si256(r2, m2), _mm256_and_si256(r3, m3)), + ), + _mm256_or_si256(_mm256_and_si256(r4, m4), _mm256_and_si256(r5, m5)), + ) } /// Maps 32 ASCII chars to base85 digit values (0..84) lane-wise. @@ -804,43 +752,38 @@ mod avx2 { #[inline] #[target_feature(enable = "avx2")] unsafe fn char85_to_byte_avx2(chars: __m256i) -> __m256i { - // SAFETY: caller has AVX2; CHAR_TO_85_PADDED is 96 bytes with - // six 16-byte readable windows. 
- unsafe { - let normalised = _mm256_sub_epi32(chars, _mm256_set1_epi8(33)); - let high_nib = - _mm256_and_si256(_mm256_srli_epi16::<4>(normalised), _mm256_set1_epi8(0x0F)); - let low_nib = _mm256_and_si256(normalised, _mm256_set1_epi8(0x0F)); - - let t0 = broadcast16(CHAR_TO_85_PADDED.as_ptr()); - let t1 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(16)); - let t2 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(32)); - let t3 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(48)); - let t4 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(64)); - let t5 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(80)); - - let r0 = _mm256_shuffle_epi8(t0, low_nib); - let r1 = _mm256_shuffle_epi8(t1, low_nib); - let r2 = _mm256_shuffle_epi8(t2, low_nib); - let r3 = _mm256_shuffle_epi8(t3, low_nib); - let r4 = _mm256_shuffle_epi8(t4, low_nib); - let r5 = _mm256_shuffle_epi8(t5, low_nib); - - let m0 = _mm256_cmpeq_epi8(high_nib, _mm256_setzero_si256()); - let m1 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(1)); - let m2 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(2)); - let m3 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(3)); - let m4 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(4)); - let m5 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(5)); - + let normalised = _mm256_sub_epi32(chars, _mm256_set1_epi8(33)); + let high_nib = _mm256_and_si256(_mm256_srli_epi16::<4>(normalised), _mm256_set1_epi8(0x0F)); + let low_nib = _mm256_and_si256(normalised, _mm256_set1_epi8(0x0F)); + + let t0 = broadcast16(CHAR_TO_85_PADDED.as_ptr()); + let t1 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(16)); + let t2 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(32)); + let t3 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(48)); + let t4 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(64)); + let t5 = broadcast16(CHAR_TO_85_PADDED.as_ptr().add(80)); + + let r0 = _mm256_shuffle_epi8(t0, low_nib); + let r1 = _mm256_shuffle_epi8(t1, low_nib); + let r2 = _mm256_shuffle_epi8(t2, low_nib); + let r3 = _mm256_shuffle_epi8(t3, low_nib); + let r4 = _mm256_shuffle_epi8(t4, low_nib); + let r5 = _mm256_shuffle_epi8(t5, low_nib); + + let m0 = _mm256_cmpeq_epi8(high_nib, _mm256_setzero_si256()); + let m1 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(1)); + let m2 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(2)); + let m3 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(3)); + let m4 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(4)); + let m5 = _mm256_cmpeq_epi8(high_nib, _mm256_set1_epi8(5)); + + _mm256_or_si256( _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256(_mm256_and_si256(r0, m0), _mm256_and_si256(r1, m1)), - _mm256_or_si256(_mm256_and_si256(r2, m2), _mm256_and_si256(r3, m3)), - ), - _mm256_or_si256(_mm256_and_si256(r4, m4), _mm256_and_si256(r5, m5)), - ) - } + _mm256_or_si256(_mm256_and_si256(r0, m0), _mm256_and_si256(r1, m1)), + _mm256_or_si256(_mm256_and_si256(r2, m2), _mm256_and_si256(r3, m3)), + ), + _mm256_or_si256(_mm256_and_si256(r4, m4), _mm256_and_si256(r5, m5)), + ) } /// AVX2 8-block decoder. 40 chars → 32 bytes. @@ -858,101 +801,98 @@ mod avx2 { input: &[u8; 40], out: &mut [u8; 32], ) -> Result<(), ()> { - // SAFETY: caller has AVX2. - unsafe { - // Layout the 40 chars into the 256-bit register so each - // 128-bit lane handles 4 blocks. Lane 0 ← chars 0..15 - // (first 16 chars of blocks 0..3); lane 1 ← chars 20..35 - // (first 16 chars of blocks 4..7). The 4-char tails (chars - // 16..19 and 36..39) go into a separate "chars_tail" with - // 12 zero-padding bytes per lane. 
- let chars_main_lo = _mm_loadu_si128(input.as_ptr().cast::<__m128i>()); - let chars_main_hi = _mm_loadu_si128(input.as_ptr().add(20).cast::<__m128i>()); - let chars_main = _mm256_set_m128i(chars_main_hi, chars_main_lo); - - let mut tail_buf_lo = [0u8; 16]; - let mut tail_buf_hi = [0u8; 16]; - tail_buf_lo[..4].copy_from_slice(&input[16..20]); - tail_buf_hi[..4].copy_from_slice(&input[36..40]); - let chars_tail = _mm256_set_m128i( - _mm_loadu_si128(tail_buf_hi.as_ptr().cast::<__m128i>()), - _mm_loadu_si128(tail_buf_lo.as_ptr().cast::<__m128i>()), - ); - let tail_valid_mask = broadcast16(TAIL_VALID_MASK.as_ptr()); - - // Range validation: chars must be in [33, 126]. - let too_low_main = _mm256_cmpgt_epi8(_mm256_set1_epi8(33), chars_main); - let too_high_main = _mm256_cmpgt_epi8(chars_main, _mm256_set1_epi8(126)); - let invalid_range_main = _mm256_or_si256(too_low_main, too_high_main); - - let too_low_tail = _mm256_cmpgt_epi8(_mm256_set1_epi8(33), chars_tail); - let too_high_tail = _mm256_cmpgt_epi8(chars_tail, _mm256_set1_epi8(126)); - let invalid_range_tail = _mm256_and_si256( - _mm256_or_si256(too_low_tail, too_high_tail), - tail_valid_mask, - ); - - // ASCII → digit lookup (0xFF for in-range invalid chars). - let digits_main = char85_to_byte_avx2(chars_main); - let digits_tail = char85_to_byte_avx2(chars_tail); - - let invalid_dig_main = _mm256_cmpeq_epi8(digits_main, _mm256_set1_epi8(0xFF_u8 as i8)); - let invalid_dig_tail = _mm256_and_si256( - _mm256_cmpeq_epi8(digits_tail, _mm256_set1_epi8(0xFF_u8 as i8)), - tail_valid_mask, - ); - - let any_invalid = _mm256_or_si256( - _mm256_or_si256(invalid_range_main, invalid_range_tail), - _mm256_or_si256(invalid_dig_main, invalid_dig_tail), - ); - // _mm256_movemask_epi8 returns a 32-bit mask (one bit per - // byte of the 256-bit register). - if _mm256_movemask_epi8(any_invalid) != 0 { - return Err(()); - } + // Layout the 40 chars into the 256-bit register so each + // 128-bit lane handles 4 blocks. Lane 0 ← chars 0..15 + // (first 16 chars of blocks 0..3); lane 1 ← chars 20..35 + // (first 16 chars of blocks 4..7). The 4-char tails (chars + // 16..19 and 36..39) go into a separate "chars_tail" with + // 12 zero-padding bytes per lane. + let chars_main_lo = _mm_loadu_si128(input.as_ptr().cast::<__m128i>()); + let chars_main_hi = _mm_loadu_si128(input.as_ptr().add(20).cast::<__m128i>()); + let chars_main = _mm256_set_m128i(chars_main_hi, chars_main_lo); + + let mut tail_buf_lo = [0u8; 16]; + let mut tail_buf_hi = [0u8; 16]; + tail_buf_lo[..4].copy_from_slice(&input[16..20]); + tail_buf_hi[..4].copy_from_slice(&input[36..40]); + let chars_tail = _mm256_set_m128i( + _mm_loadu_si128(tail_buf_hi.as_ptr().cast::<__m128i>()), + _mm_loadu_si128(tail_buf_lo.as_ptr().cast::<__m128i>()), + ); + let tail_valid_mask = broadcast16(TAIL_VALID_MASK.as_ptr()); + + // Range validation: chars must be in [33, 126]. + let too_low_main = _mm256_cmpgt_epi8(_mm256_set1_epi8(33), chars_main); + let too_high_main = _mm256_cmpgt_epi8(chars_main, _mm256_set1_epi8(126)); + let invalid_range_main = _mm256_or_si256(too_low_main, too_high_main); + + let too_low_tail = _mm256_cmpgt_epi8(_mm256_set1_epi8(33), chars_tail); + let too_high_tail = _mm256_cmpgt_epi8(chars_tail, _mm256_set1_epi8(126)); + let invalid_range_tail = _mm256_and_si256( + _mm256_or_si256(too_low_tail, too_high_tail), + tail_valid_mask, + ); + + // ASCII → digit lookup (0xFF for in-range invalid chars). 
+ let digits_main = char85_to_byte_avx2(chars_main); + let digits_tail = char85_to_byte_avx2(chars_tail); + + let invalid_dig_main = _mm256_cmpeq_epi8(digits_main, _mm256_set1_epi8(0xFF_u8 as i8)); + let invalid_dig_tail = _mm256_and_si256( + _mm256_cmpeq_epi8(digits_tail, _mm256_set1_epi8(0xFF_u8 as i8)), + tail_valid_mask, + ); + + let any_invalid = _mm256_or_si256( + _mm256_or_si256(invalid_range_main, invalid_range_tail), + _mm256_or_si256(invalid_dig_main, invalid_dig_tail), + ); + // _mm256_movemask_epi8 returns a 32-bit mask (one bit per + // byte of the 256-bit register). + if _mm256_movemask_epi8(any_invalid) != 0 { + return Err(()); + } - // Permute digits into per-position vectors. Same indexes - // as the SSE path, broadcast to both lanes. - let v_dk = |idx_main: &[u8; 16], idx_tail: &[u8; 16]| -> __m256i { - let im = broadcast16(idx_main.as_ptr()); - let it = broadcast16(idx_tail.as_ptr()); - _mm256_or_si256( - _mm256_shuffle_epi8(digits_main, im), - _mm256_shuffle_epi8(digits_tail, it), - ) - }; - let v_d0 = v_dk(&IDX_DEC[0].0, &IDX_DEC[0].1); - let v_d1 = v_dk(&IDX_DEC[1].0, &IDX_DEC[1].1); - let v_d2 = v_dk(&IDX_DEC[2].0, &IDX_DEC[2].1); - let v_d3 = v_dk(&IDX_DEC[3].0, &IDX_DEC[3].1); - let v_d4 = v_dk(&IDX_DEC[4].0, &IDX_DEC[4].1); - - // Horner: n = ((((d0·85 + d1)·85 + d2)·85 + d3)·85 + d4) - let m85 = _mm256_set1_epi32(85); - let n = _mm256_add_epi32(v_d1, _mm256_mullo_epi32(v_d0, m85)); - let n = _mm256_add_epi32(v_d2, _mm256_mullo_epi32(n, m85)); - let n = _mm256_add_epi32(v_d3, _mm256_mullo_epi32(n, m85)); - let n = _mm256_add_epi32(v_d4, _mm256_mullo_epi32(n, m85)); - - // Overflow detection (bias-and-signed-cmpgt). - let d0_85_4 = _mm256_mullo_epi32(v_d0, _mm256_set1_epi32(52_200_625)); - let bias = _mm256_set1_epi32(0x80000000_u32 as i32); - let n_b = _mm256_xor_si256(n, bias); - let d0_b = _mm256_xor_si256(d0_85_4, bias); - let overflow_mask = _mm256_cmpgt_epi32(d0_b, n_b); - if _mm256_movemask_epi8(overflow_mask) != 0 { - return Err(()); - } + // Permute digits into per-position vectors. Same indexes + // as the SSE path, broadcast to both lanes. + let v_dk = |idx_main: &[u8; 16], idx_tail: &[u8; 16]| -> __m256i { + let im = broadcast16(idx_main.as_ptr()); + let it = broadcast16(idx_tail.as_ptr()); + _mm256_or_si256( + _mm256_shuffle_epi8(digits_main, im), + _mm256_shuffle_epi8(digits_tail, it), + ) + }; + let v_d0 = v_dk(&IDX_DEC[0].0, &IDX_DEC[0].1); + let v_d1 = v_dk(&IDX_DEC[1].0, &IDX_DEC[1].1); + let v_d2 = v_dk(&IDX_DEC[2].0, &IDX_DEC[2].1); + let v_d3 = v_dk(&IDX_DEC[3].0, &IDX_DEC[3].1); + let v_d4 = v_dk(&IDX_DEC[4].0, &IDX_DEC[4].1); + + // Horner: n = ((((d0·85 + d1)·85 + d2)·85 + d3)·85 + d4) + let m85 = _mm256_set1_epi32(85); + let n = _mm256_add_epi32(v_d1, _mm256_mullo_epi32(v_d0, m85)); + let n = _mm256_add_epi32(v_d2, _mm256_mullo_epi32(n, m85)); + let n = _mm256_add_epi32(v_d3, _mm256_mullo_epi32(n, m85)); + let n = _mm256_add_epi32(v_d4, _mm256_mullo_epi32(n, m85)); + + // Overflow detection (bias-and-signed-cmpgt). + let d0_85_4 = _mm256_mullo_epi32(v_d0, _mm256_set1_epi32(52_200_625)); + let bias = _mm256_set1_epi32(0x80000000_u32 as i32); + let n_b = _mm256_xor_si256(n, bias); + let d0_b = _mm256_xor_si256(d0_85_4, bias); + let overflow_mask = _mm256_cmpgt_epi32(d0_b, n_b); + if _mm256_movemask_epi8(overflow_mask) != 0 { + return Err(()); + } - // Byte-swap each u32 lane to BE and store. Output is 32 - // contiguous bytes (16 from each lane), so a single - // _mm256_storeu_si256 works. 
- let bytes_be = _mm256_shuffle_epi8(n, broadcast16(REV32_IDX.as_ptr())); - _mm256_storeu_si256(out.as_mut_ptr().cast::<__m256i>(), bytes_be); + // Byte-swap each u32 lane to BE and store. Output is 32 + // contiguous bytes (16 from each lane), so a single + // _mm256_storeu_si256 works. + let bytes_be = _mm256_shuffle_epi8(n, broadcast16(REV32_IDX.as_ptr())); + _mm256_storeu_si256(out.as_mut_ptr().cast::<__m256i>(), bytes_be); - Ok(()) - } + Ok(()) } } diff --git a/src/lib.rs b/src/lib.rs index e297ce5..4077b35 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -206,7 +206,7 @@ fn encode_into(input: &[u8], out: &mut [u8]) { // back to the scalar path otherwise. The branch predicts // perfectly across calls so the cost is negligible. if !std::is_x86_feature_detected!("avx2") { - encode_into_scalar(input, out); + encode_scalar(input, out); return; } @@ -252,39 +252,16 @@ fn encode_into(input: &[u8], out: &mut [u8]) { } } -/// Pure-scalar fallback used by the x86 path on CPUs lacking AVX2 -/// (rare on x86 server hardware after ~2013, but possible). -#[cfg(target_arch = "x86_64")] -fn encode_into_scalar(input: &[u8], out: &mut [u8]) { - let mut in_off = 0usize; - let mut out_off = 0usize; - - while in_off.saturating_add(4) <= input.len() { - let Some(block) = chunk_at::<4>(input, in_off) else { - return; - }; - let Some(chunk) = chunk_at_mut::<5>(out, out_off) else { - return; - }; - encode_block(block, chunk); - in_off = in_off.saturating_add(4); - out_off = out_off.saturating_add(5); - } - - let tail_in_len = input.len().saturating_sub(in_off); - if tail_in_len > 0 { - let tail_out_len = tail_in_len.saturating_add(1); - if let (Some(tail_in), Some(tail_out)) = ( - input.get(in_off..in_off.saturating_add(tail_in_len)), - out.get_mut(out_off..out_off.saturating_add(tail_out_len)), - ) { - encode_tail(tail_in, tail_out); - } - } -} - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn encode_into(input: &[u8], out: &mut [u8]) { + encode_scalar(input, out); +} + +/// Pure-scalar encode loop. Used by the x86 path on CPUs lacking AVX2 +/// (rare on x86 server hardware after ~2013, but possible) and as the +/// only encoder on non-aarch64, non-x86_64 targets. +#[cfg(not(target_arch = "aarch64"))] +fn encode_scalar(input: &[u8], out: &mut [u8]) { let mut in_off = 0usize; let mut out_off = 0usize; @@ -389,7 +366,7 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { // Runtime feature gate: AVX2 needed for the fast path; otherwise // route to scalar fallback. if !std::is_x86_feature_detected!("avx2") { - return decode_into_scalar(input, out); + return decode_scalar(input, out); } let mut in_off = 0usize; @@ -444,40 +421,15 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { Ok(()) } -/// Pure-scalar fallback used by the x86 decode path on CPUs lacking AVX2. 
-#[cfg(target_arch = "x86_64")] -fn decode_into_scalar(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { - let mut in_off = 0usize; - let mut out_off = 0usize; - - while in_off.saturating_add(5) <= input.len() { - let Some(block) = chunk_at::<5>(input, in_off) else { - return Ok(()); - }; - let Some(chunk) = chunk_at_mut::<4>(out, out_off) else { - return Ok(()); - }; - decode_block(block, chunk, in_off)?; - in_off = in_off.saturating_add(5); - out_off = out_off.saturating_add(4); - } - - let tail_in_len = input.len().saturating_sub(in_off); - if tail_in_len > 0 { - let tail_out_len = tail_in_len.saturating_sub(1); - if let (Some(tail_in), Some(tail_out)) = ( - input.get(in_off..in_off.saturating_add(tail_in_len)), - out.get_mut(out_off..out_off.saturating_add(tail_out_len)), - ) { - decode_tail(tail_in, tail_out, in_off)?; - } - } - - Ok(()) -} - #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))] fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { + decode_scalar(input, out) +} + +/// Pure-scalar decode loop. Used by the x86 path on CPUs lacking AVX2 +/// and as the only decoder on non-aarch64, non-x86_64 targets. +#[cfg(not(target_arch = "aarch64"))] +fn decode_scalar(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { let mut in_off = 0usize; let mut out_off = 0usize; diff --git a/src/ops/aarch64.rs b/src/ops/aarch64.rs index 1fa0a2f..f9b1aed 100644 --- a/src/ops/aarch64.rs +++ b/src/ops/aarch64.rs @@ -44,61 +44,8 @@ fn div_magic(input: uint32x4_t) -> uint32x4_ } } -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{Arbitrary, quickcheck}; - use std::{ - arch::aarch64::{vld1q_u32, vst1q_u32}, - array, - }; - - #[derive(Debug, Clone, Copy)] - struct InputBlock([u32; 4]); - - impl Arbitrary for InputBlock { - fn arbitrary(g: &mut quickcheck::Gen) -> Self { - InputBlock(array::from_fn(|_| u32::arbitrary(g))) - } - } - - fn lanes(v: uint32x4_t) -> [u32; 4] { - let mut out = [0u32; 4]; - unsafe { vst1q_u32(out.as_mut_ptr(), v) }; - out - } - - fn load(input: &[u32; 4]) -> uint32x4_t { - unsafe { vld1q_u32(input.as_ptr()) } - } - - quickcheck! { - fn div_85_matches_scalar(block: InputBlock) -> bool { - let InputBlock(input) = block; - let q_out = lanes(div_85(load(&input))); - (0..4).all(|i| q_out[i] == input[i] / 85) - } - - fn div_85_sq_matches_scalar(block: InputBlock) -> bool { - let InputBlock(input) = block; - let q_out = lanes(div_85_sq(load(&input))); - (0..4).all(|i| q_out[i] == input[i] / (85 * 85)) - } - - fn div_85_cube_matches_scalar(block: InputBlock) -> bool { - let InputBlock(input) = block; - let q_out = lanes(div_85_cube(load(&input))); - (0..4).all(|i| q_out[i] == input[i] / (85 * 85 * 85)) - } - - // q4 = q2 / 85^2 — the composed path that yields n / 85^4. Direct - // /85^4 magic would not satisfy the libdivide precision bound. - fn div_85_to_the_4_via_composition(block: InputBlock) -> bool { - let InputBlock(input) = block; - let q2 = div_85_sq(load(&input)); - let q4 = div_85_sq(q2); - let q4_out = lanes(q4); - (0..4).all(|i| q4_out[i] == input[i] / (85u32.pow(4))) - } - } -} +// Correctness of `div_85*` is exercised transitively by the parity +// tests in `tests/base85_parity.rs` (round-trips through encode/decode +// on aarch64 hit this code on every iteration). The dedicated +// quickchecks that previously lived here added no coverage worth the +// duplication. 
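On the transitive-coverage argument above: `tests/base85_parity.rs` is not touched by this series, so for context the property it is being leaned on for is, in rough shape, a quickcheck round-trip like the sketch below. Illustration only — `base85_simd` is the assumed crate name taken from the benchmark tables, `encode`/`decode` are the public API described in the README, and the real parity file presumably also compares output against the reference `base85` crate (omitted here).

// Sketch (assumed shape) of the round-trip property relied on above.
use quickcheck::quickcheck;

quickcheck! {
    fn encode_decode_round_trip(data: Vec<u8>) -> bool {
        // Every full SIMD-width chunk of `data` goes through the NEON
        // path on aarch64 or the AVX2 path on AVX2-capable x86_64, so
        // the div_85* helpers are exercised on each iteration.
        let encoded = base85_simd::encode(&data);
        match base85_simd::decode(&encoded) {
            Ok(decoded) => decoded == data,
            Err(_) => false,
        }
    }
}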
diff --git a/src/ops/x86_64.rs b/src/ops/x86_64.rs index a90efb8..bcbeb17 100644 --- a/src/ops/x86_64.rs +++ b/src/ops/x86_64.rs @@ -25,12 +25,16 @@ //! `is_x86_feature_detected!("avx2")` check before invoking; on hosts //! lacking AVX2 the crate routes to the scalar fallback. //! -//! Inner `unsafe { ... }` blocks are required on Rust 1.85 (MSRV) for -//! calls to target-feature-gated intrinsics inside an `unsafe fn`. -//! Newer toolchains may flag those blocks as redundant, hence the -//! module-level `unused_unsafe` allow. +//! `#![allow(unsafe_op_in_unsafe_fn)]` at the module level lets every +//! `unsafe fn` body act as an implicit `unsafe` scope — without it, +//! Rust 1.85 (MSRV) requires explicit `unsafe { … }` around each +//! target-feature-gated intrinsic call even inside an `unsafe fn` +//! with matching `target_feature`. Newer toolchains relax this, so +//! the alternative would be 1.85-only inner blocks plus an +//! `unused_unsafe` allow elsewhere; the override keeps the code +//! readable across the full toolchain range. -#![allow(unused_unsafe)] +#![allow(unsafe_op_in_unsafe_fn)] use std::arch::x86_64::{ __m256i, _mm256_blend_epi16, _mm256_mul_epu32, _mm256_set1_epi32, _mm256_shuffle_epi32, @@ -45,8 +49,7 @@ use std::arch::x86_64::{ #[target_feature(enable = "avx2")] pub(crate) unsafe fn div_85(input: __m256i) -> __m256i { // m = ceil(2^38 / 85) = 3_233_857_729; m * 85 - 2^38 = 21 < 2^6. - // SAFETY: caller upholds AVX2 (function-level target_feature). - unsafe { div_magic::<3_233_857_729, 6>(input) } + div_magic::<3_233_857_729, 6>(input) } /// Lane-wise `input / 7225` (i.e. `input / 85²`). Valid for all u32 inputs. @@ -57,8 +60,7 @@ pub(crate) unsafe fn div_85(input: __m256i) -> __m256i { #[target_feature(enable = "avx2")] pub(crate) unsafe fn div_85_sq(input: __m256i) -> __m256i { // m = ceil(2^44 / 7225) = 2_434_904_643; m * 7225 - 2^44 = 1259 < 2^12. - // SAFETY: caller upholds AVX2. - unsafe { div_magic::<2_434_904_643, 12>(input) } + div_magic::<2_434_904_643, 12>(input) } /// Lane-wise `input / 614125` (i.e. `input / 85³`). Valid for all u32 inputs. @@ -69,8 +71,7 @@ pub(crate) unsafe fn div_85_sq(input: __m256i) -> __m256i { #[target_feature(enable = "avx2")] pub(crate) unsafe fn div_85_cube(input: __m256i) -> __m256i { // m = ceil(2^51 / 614125) = 3_666_679_933; bound check passes for u32. - // SAFETY: caller upholds AVX2. - unsafe { div_magic::<3_666_679_933, 19>(input) } + div_magic::<3_666_679_933, 19>(input) } /// Generic libdivide-style div over 8 u32 lanes: @@ -91,90 +92,29 @@ pub(crate) unsafe fn div_85_cube(input: __m256i) -> __m256i { #[inline] #[target_feature(enable = "avx2")] unsafe fn div_magic(input: __m256i) -> __m256i { - // SAFETY (1.85 MSRV): caller has AVX2; all intrinsics below are - // gated on AVX2 (or its prerequisites). - unsafe { - let magic = _mm256_set1_epi32(MAGIC as i32); + let magic = _mm256_set1_epi32(MAGIC as i32); - let even = _mm256_mul_epu32(input, magic); - let input_odd = _mm256_srli_epi64::<32>(input); - let odd = _mm256_mul_epu32(input_odd, magic); + let even = _mm256_mul_epu32(input, magic); + let input_odd = _mm256_srli_epi64::<32>(input); + let odd = _mm256_mul_epu32(input_odd, magic); - // Per-128-bit-lane shuffle: broadcasts hi u32 of each u64 - // product across the pair, in each lane independently. 
- let even_hi = _mm256_shuffle_epi32::<0b11_11_01_01>(even); - let odd_hi = _mm256_shuffle_epi32::<0b11_11_01_01>(odd); + // Per-128-bit-lane shuffle: broadcasts hi u32 of each u64 product + // across the pair, in each lane independently. + let even_hi = _mm256_shuffle_epi32::<0b11_11_01_01>(even); + let odd_hi = _mm256_shuffle_epi32::<0b11_11_01_01>(odd); - // Per-lane blend at u16 granularity. The 8-bit MASK is applied - // to each 128-bit lane, so 0xCC interleaves [hi0, hi1, hi2, hi3] - // within each lane independently. - let combined = _mm256_blend_epi16::<0xCC>(even_hi, odd_hi); + // Per-lane blend at u16 granularity. The 8-bit MASK is applied to + // each 128-bit lane, so 0xCC interleaves [hi0, hi1, hi2, hi3] + // within each lane independently. + let combined = _mm256_blend_epi16::<0xCC>(even_hi, odd_hi); - _mm256_srli_epi32::(combined) - } + _mm256_srli_epi32::(combined) } -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{Arbitrary, quickcheck}; - use std::{ - arch::x86_64::{_mm256_loadu_si256, _mm256_storeu_si256}, - array, - }; - - #[derive(Debug, Clone, Copy)] - struct InputBlock([u32; 8]); - - impl Arbitrary for InputBlock { - fn arbitrary(g: &mut quickcheck::Gen) -> Self { - InputBlock(array::from_fn(|_| u32::arbitrary(g))) - } - } - - fn lanes(v: __m256i) -> [u32; 8] { - let mut out = [0u32; 8]; - unsafe { _mm256_storeu_si256(out.as_mut_ptr().cast::<__m256i>(), v) }; - out - } - - fn load(input: &[u32; 8]) -> __m256i { - unsafe { _mm256_loadu_si256(input.as_ptr().cast::<__m256i>()) } - } - - fn host_has_avx2() -> bool { - std::is_x86_feature_detected!("avx2") - } - - quickcheck! { - fn div_85_matches_scalar(block: InputBlock) -> bool { - if !host_has_avx2() { return true; } - let InputBlock(input) = block; - let q_out = lanes(unsafe { div_85(load(&input)) }); - (0..8).all(|i| q_out[i] == input[i] / 85) - } - - fn div_85_sq_matches_scalar(block: InputBlock) -> bool { - if !host_has_avx2() { return true; } - let InputBlock(input) = block; - let q_out = lanes(unsafe { div_85_sq(load(&input)) }); - (0..8).all(|i| q_out[i] == input[i] / (85 * 85)) - } - - fn div_85_cube_matches_scalar(block: InputBlock) -> bool { - if !host_has_avx2() { return true; } - let InputBlock(input) = block; - let q_out = lanes(unsafe { div_85_cube(load(&input)) }); - (0..8).all(|i| q_out[i] == input[i] / (85 * 85 * 85)) - } - - fn div_85_to_the_4_via_composition(block: InputBlock) -> bool { - if !host_has_avx2() { return true; } - let InputBlock(input) = block; - let q2 = unsafe { div_85_sq(load(&input)) }; - let q4 = unsafe { div_85_sq(q2) }; - let q4_out = lanes(q4); - (0..8).all(|i| q4_out[i] == input[i] / (85u32.pow(4))) - } - } -} +// `div_85*` correctness is exercised transitively by the parity tests +// in `tests/base85_parity.rs`, which round-trip every encoded byte +// through the AVX2 path on hosts where the runtime feature gate +// admits it. A separate quickcheck here would either need a runtime +// `is_x86_feature_detected!` skip (a smell) or RUSTFLAGS-based +// compile-time gating for CI — neither earns its keep against the +// existing transitive coverage. 
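For checking the div_magic constants by hand: the scalar model of the same computation is q = (n * m) >> (32 + SHIFT) with m = ceil(2^(32+SHIFT) / d), exactly the multiply-high-then-shift the AVX2 intrinsics implement. Sketch only; the helper and test below are illustrative and not part of the patch.

// Scalar model of div_85 above: m = ceil(2^38 / 85) = 3_233_857_729,
// and m * 85 - 2^38 = 21 < 2^6 is the precision bound that keeps the
// result exact for every u32 input.
fn div_85_scalar_magic(n: u32) -> u32 {
    const MAGIC: u64 = 3_233_857_729;
    ((u64::from(n) * MAGIC) >> 38) as u32
}

#[test]
fn scalar_magic_matches_division() {
    for n in [0u32, 84, 85, 86, 7_224, 7_225, 614_124, 614_125, u32::MAX] {
        assert_eq!(div_85_scalar_magic(n), n / 85);
    }
}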
From 9f18b98ffb2942ef40e85603b860f45032381f35 Mon Sep 17 00:00:00 2001 From: Dan Draper Date: Sat, 2 May 2026 19:20:26 +1000 Subject: [PATCH 7/7] chore: trim narration / tombstone comments after PR #6 cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Shorten the `#![allow(unsafe_op_in_unsafe_fn)]` rationale in mod avx2 and in src/ops/x86_64.rs to one sentence — the original prose read as change-history narration. - Drop the "tests removed because…" tombstones at the bottom of src/ops/{aarch64,x86_64}.rs. - Drop the explanatory comment on REV32_IDX in mod avx2; the location already says "x86 only". - Drop redundant doc comments on encode_scalar / decode_scalar that just restated the cfg gate. - Fix stale `crate::encode_into_scalar / crate::decode_into_scalar` cross-reference in block.rs (now `encode_scalar / decode_scalar`). --- src/block.rs | 17 ++++------------- src/lib.rs | 5 ----- src/ops/aarch64.rs | 6 ------ src/ops/x86_64.rs | 19 +++---------------- 4 files changed, 7 insertions(+), 40 deletions(-) diff --git a/src/block.rs b/src/block.rs index 096e473..3a66f35 100644 --- a/src/block.rs +++ b/src/block.rs @@ -518,8 +518,7 @@ mod neon { // each lane handles 4 independent blocks. // // On hosts without AVX2 the crate's runtime feature gate routes to -// the scalar fallback (see `crate::encode_into_scalar` / -// `crate::decode_into_scalar`). +// the scalar fallback (see `crate::encode_scalar` / `crate::decode_scalar`). #[cfg(target_arch = "x86_64")] pub(crate) use avx2::{Avx2Encoder, try_decode_block_x8}; @@ -540,12 +539,9 @@ mod avx2 { //! `is_x86_feature_detected!("avx2")` check; on hosts lacking //! AVX2 the crate routes to the scalar fallback. //! - //! `#![allow(unsafe_op_in_unsafe_fn)]` lets each `unsafe fn` body - //! act as an implicit unsafe scope so individual intrinsic calls - //! don't need their own `unsafe { … }` wrapper. Without it, Rust - //! 1.85 (MSRV) would require those wrappers; newer toolchains - //! would then flag them as `unused_unsafe`. The override side- - //! steps both ends. + //! The module-level `#![allow(unsafe_op_in_unsafe_fn)]` keeps the + //! body of each `unsafe fn` an implicit unsafe scope on MSRV 1.85 + //! (without it, each intrinsic call needs its own `unsafe { … }`). #![allow(unsafe_op_in_unsafe_fn)] #![allow(clippy::indexing_slicing)] @@ -562,11 +558,6 @@ mod avx2 { use super::{IDX_OUT_1, IDX_OUT_2, TAIL_VALID_MASK}; - // x86-only: NEON has `vrev32q_u8` for the byte-swap (no table needed), - // but PSHUFB needs an explicit shuffle index. Same for the per-position - // decode index pattern below — its shape is x86-specific (single 16-byte - // PSHUFB index per digit), where the NEON path uses `vqtbl2q_u8` over a - // 32-byte source and a different index layout. static REV32_IDX: [u8; 16] = [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]; #[rustfmt::skip] diff --git a/src/lib.rs b/src/lib.rs index 4077b35..6a759cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -257,9 +257,6 @@ fn encode_into(input: &[u8], out: &mut [u8]) { encode_scalar(input, out); } -/// Pure-scalar encode loop. Used by the x86 path on CPUs lacking AVX2 -/// (rare on x86 server hardware after ~2013, but possible) and as the -/// only encoder on non-aarch64, non-x86_64 targets. 
#[cfg(not(target_arch = "aarch64"))] fn encode_scalar(input: &[u8], out: &mut [u8]) { let mut in_off = 0usize; @@ -426,8 +423,6 @@ fn decode_into(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { decode_scalar(input, out) } -/// Pure-scalar decode loop. Used by the x86 path on CPUs lacking AVX2 -/// and as the only decoder on non-aarch64, non-x86_64 targets. #[cfg(not(target_arch = "aarch64"))] fn decode_scalar(input: &[u8], out: &mut [u8]) -> Result<(), DecodeError> { let mut in_off = 0usize; diff --git a/src/ops/aarch64.rs b/src/ops/aarch64.rs index f9b1aed..0f017d8 100644 --- a/src/ops/aarch64.rs +++ b/src/ops/aarch64.rs @@ -43,9 +43,3 @@ fn div_magic(input: uint32x4_t) -> uint32x4_ vshrq_n_u32(high_u32, SHIFT) } } - -// Correctness of `div_85*` is exercised transitively by the parity -// tests in `tests/base85_parity.rs` (round-trips through encode/decode -// on aarch64 hit this code on every iteration). The dedicated -// quickchecks that previously lived here added no coverage worth the -// duplication. diff --git a/src/ops/x86_64.rs b/src/ops/x86_64.rs index bcbeb17..91dfac5 100644 --- a/src/ops/x86_64.rs +++ b/src/ops/x86_64.rs @@ -25,14 +25,9 @@ //! `is_x86_feature_detected!("avx2")` check before invoking; on hosts //! lacking AVX2 the crate routes to the scalar fallback. //! -//! `#![allow(unsafe_op_in_unsafe_fn)]` at the module level lets every -//! `unsafe fn` body act as an implicit `unsafe` scope — without it, -//! Rust 1.85 (MSRV) requires explicit `unsafe { … }` around each -//! target-feature-gated intrinsic call even inside an `unsafe fn` -//! with matching `target_feature`. Newer toolchains relax this, so -//! the alternative would be 1.85-only inner blocks plus an -//! `unused_unsafe` allow elsewhere; the override keeps the code -//! readable across the full toolchain range. +//! The module-level `#![allow(unsafe_op_in_unsafe_fn)]` keeps the body +//! of each `unsafe fn` an implicit unsafe scope on MSRV 1.85 (without +//! it, each intrinsic call needs its own `unsafe { … }`). #![allow(unsafe_op_in_unsafe_fn)] @@ -110,11 +105,3 @@ unsafe fn div_magic(input: __m256i) -> __m25 _mm256_srli_epi32::(combined) } - -// `div_85*` correctness is exercised transitively by the parity tests -// in `tests/base85_parity.rs`, which round-trip every encoded byte -// through the AVX2 path on hosts where the runtime feature gate -// admits it. A separate quickcheck here would either need a runtime -// `is_x86_feature_detected!` skip (a smell) or RUSTFLAGS-based -// compile-time gating for CI — neither earns its keep against the -// existing transitive coverage.
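A closing note on patch 6's decoder: the "bias-and-signed-cmpgt" overflow check is named in a comment but never spelled out. AVX2 has no unsigned 32-bit compare, so `try_decode_block_x8` flips the sign bit of both operands with an XOR against 0x8000_0000 and then uses the signed `_mm256_cmpgt_epi32`; any lane that trips the check makes the function return `Err(())`, sending the whole 40-char group back to the scalar path. Scalar equivalent of the comparison (illustrative only — this helper does not exist in the crate):

// a > b as unsigned u32, computed the way the AVX2 path has to:
// flipping the top bit maps [0, 2^31) onto the negative i32 range and
// [2^31, 2^32) onto the non-negative range, preserving order, so a
// signed greater-than on the biased values equals the unsigned one.
fn unsigned_gt_via_sign_bias(a: u32, b: u32) -> bool {
    ((a ^ 0x8000_0000) as i32) > ((b ^ 0x8000_0000) as i32)
}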