diff --git a/.claude/board/AGENT_LOG.md b/.claude/board/AGENT_LOG.md index b6948deb..cf9133a9 100644 --- a/.claude/board/AGENT_LOG.md +++ b/.claude/board/AGENT_LOG.md @@ -28,6 +28,60 @@ ## Entries (append below; newest first) +## 2026-05-21T16:00 — substrate-graduation batch 3 (opus 4.7) + +**Branch:** `claude/continue-ndarray-x0Oaw` +**Continues:** PR #194 batch of 5 (`bitwise`/`heel_f64x8`/`distance`/`byte_scan`/`spatial_hash`) + #193 (`simd_caps`). +**Verdict:** SHIP — `cargo check`, `cargo clippy --features approx,serde,rayon -- -D warnings`, doctest suite (15 graduated-module doctests pass), and unit tests (104 lib tests pass) all green. + +**Modules graduated (4):** + +| Module | Old path | New path | Internal hpc/ deps? | +|---|---|---|---| +| `aabb` | `src/hpc/aabb.rs` | `src/aabb.rs` | None — only `super::simd_caps` (now resolves via crate root) | +| `nibble` | `src/hpc/nibble.rs` | `src/nibble.rs` | None — only `super::simd_caps` | +| `palette_codec` | `src/hpc/palette_codec.rs` | `src/palette_codec.rs` | None — pure logic | +| `property_mask` | `src/hpc/property_mask.rs` | `src/property_mask.rs` | None — only `super::simd_caps` | + +**Why these four, why now (criteria carried over from #194 wrap-up):** +1. No internal `hpc/` dependencies. All four only reach into `crate::simd::*` (the polyfill surface) and `super::simd_caps` (itself at crate root post-#192). +2. Already polyfill-clean — no raw-intrinsic refactor required before the move. +3. Single in-tree downstream caller (`hpc::framebuffer` imports `palette_codec`) → the `pub use crate::palette_codec;` back-compat shim in `hpc/mod.rs` keeps that resolution working zero-touch. + +**Changes:** +- `git mv src/hpc/{aabb,nibble,palette_codec,property_mask}.rs src/` +- Added `pub mod {aabb, nibble, palette_codec, property_mask};` to `src/lib.rs` (with `# Example` rustdoc blocks per CLAUDE.md hard rule "all public APIs need /// doc comments with examples"). 
+- Replaced the four `pub mod` declarations in `src/hpc/mod.rs` with `pub use crate::{aabb, nibble, palette_codec, property_mask};` back-compat re-exports. + +**Lint follow-ups (graduated modules lose the `#![allow(clippy::all, …)]` umbrella that `hpc/mod.rs` carries):** + +17 clippy errors surfaced under `-D warnings`. All fixed at the canonical Rust idiom rather than re-applying the umbrella, per the #194 cleanup precedent (417131bc): + +- **`manual_div_ceil` (7 sites)**: `(n + d - 1) / d` → `n.div_ceil(d)` in `nibble.rs` (×2), `palette_codec.rs` (×4), `property_mask.rs` (×1). +- **`needless_range_loop` (10 sites)**: `for i in start..vec.len() { vec[i] }` → `for x in &vec[start..]` or `for (i, &x) in iter().enumerate()` depending on whether the index is used. Sites in `aabb.rs` (×4), `nibble.rs` (×3), `palette_codec.rs` (×1), `property_mask.rs` (×2). +- **`missing_docs` (4 sites; a rustc lint, separate from the 17 clippy errors)**: Added field doc comments on `pub struct Aabb { min, max }` and `pub struct Ray { origin, inv_dir }` — these were previously suppressed by the `hpc/mod.rs` umbrella's `#![allow(missing_docs)]`. + +**Doctest fix:** Initial `bits_for_palette_size(1) → 1` in the `lib.rs` `# Example` block was wrong — the actual impl returns 0 for `palette_size <= 1` (trivial-palette special case; the bits/indices table in `palette_codec.rs`'s module docstring overpromises). Changed example to `bits_for_palette_size(2) → 1`. + +**Verification:** + +``` +cargo check --lib → clean +cargo clippy --lib -- -D warnings → clean +cargo clippy --lib --features rayon -- -D warnings → clean +cargo clippy --features approx,serde,rayon -- -D warnings → clean +cargo test --doc (filtered: graduated modules) → 15 doctests pass +cargo test --lib aabb::tests nibble::tests palette_codec::tests property_mask::tests → 104 unit tests pass +``` + +**No back-compat break:** every existing `use ndarray::hpc::{aabb, nibble, palette_codec, property_mask}::*` continues to resolve via the `pub use crate::*` shims in `hpc/mod.rs`. 
Verified via `cargo check` of the full workspace — `framebuffer.rs:29` (the one in-tree downstream consumer of `palette_codec`) compiles unchanged. + +**Remaining hpc/ inventory after this batch:** ~55 → ~51 modules still under `crate::hpc::*`. Next-batch candidates (still low-hanging by the same criteria) — to be audited in a separate pass before move: `framebuffer` (depends on `palette_codec` shim, otherwise pure crate-root), `ocr_simd`/`ocr_felt` (need dep audit), `audio` (depends on `crate::simd`). + +**Commit:** TBD (pending push). + +--- + ## 2026-05-13T00:00 — agent #3 polyfill-ops (sonnet) **File:** `src/simd_ops.rs` (288 lines) diff --git a/src/hpc/aabb.rs b/src/aabb.rs similarity index 97% rename from src/hpc/aabb.rs rename to src/aabb.rs index 43770fe8..180dbf1a 100644 --- a/src/hpc/aabb.rs +++ b/src/aabb.rs @@ -17,7 +17,9 @@ #[derive(Debug, Clone, Copy, PartialEq)] #[repr(C)] pub struct Aabb { + /// Minimum corner of the bounding box (x, y, z). pub min: [f32; 3], + /// Maximum corner of the bounding box (x, y, z). pub max: [f32; 3], } @@ -97,7 +99,10 @@ impl Aabb { #[derive(Debug, Clone, Copy, PartialEq)] #[repr(C)] pub struct Ray { + /// Ray origin point (x, y, z). pub origin: [f32; 3], + /// Per-axis reciprocal of the ray direction (1 / dx, 1 / dy, 1 / dz); + /// `inf` is valid (encodes a zero-component direction, slab test skips it). pub inv_dir: [f32; 3], } @@ -122,8 +127,7 @@ impl Ray { #[inline] fn sq_dist_point_aabb(point: [f32; 3], aabb: &Aabb) -> f32 { let mut dist_sq = 0.0f32; - for axis in 0..3 { - let v = point[axis]; + for (axis, &v) in point.iter().enumerate() { if v < aabb.min[axis] { let d = aabb.min[axis] - v; dist_sq += d * d; @@ -230,8 +234,8 @@ unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec< } // Scalar tail - for i in (chunks * 16)..candidates.len() { - result.push(query.intersects(&candidates[i])); + for cand in &candidates[chunks * 16..] 
{ + result.push(query.intersects(cand)); } result @@ -403,16 +407,15 @@ unsafe fn ray_aabb_slab_test_avx512(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Ve let t_enter_clamped = t_enter.simd_max(zero); let t_arr = t_enter_clamped.to_array(); - for i in 0..16 { + for (i, &t) in t_arr.iter().enumerate() { let hit = (hit_mask >> i) & 1 != 0; hits.push(hit); - t_values.push(if hit { t_arr[i] } else { f32::MAX }); + t_values.push(if hit { t } else { f32::MAX }); } } // Scalar tail for remainder - for i in (chunks * 16)..aabbs.len() { - let aabb = &aabbs[i]; + for aabb in &aabbs[chunks * 16..] { let mut t_enter = f32::NEG_INFINITY; let mut t_exit = f32::INFINITY; diff --git a/src/hpc/bitwise.rs b/src/bitwise.rs similarity index 99% rename from src/hpc/bitwise.rs rename to src/bitwise.rs index 870314e8..0d849dde 100644 --- a/src/hpc/bitwise.rs +++ b/src/bitwise.rs @@ -107,7 +107,7 @@ unsafe fn hamming_avx512bw(a: &[u8], b: &[u8]) -> u64 { let hi = xor.shr_epi16(4) & low_mask; let popcnt_lo = lookup.shuffle_bytes(lo); let popcnt_hi = lookup.shuffle_bytes(hi); - acc = acc + (popcnt_lo + popcnt_hi); + acc += popcnt_lo + popcnt_hi; i += 64; inner_count += 1; @@ -152,7 +152,7 @@ unsafe fn popcount_avx512bw(a: &[u8]) -> u64 { let hi = va.shr_epi16(4) & low_mask; let popcnt_lo = lookup.shuffle_bytes(lo); let popcnt_hi = lookup.shuffle_bytes(hi); - acc = acc + (popcnt_lo + popcnt_hi); + acc += popcnt_lo + popcnt_hi; i += 64; inner_count += 1; diff --git a/src/hpc/byte_scan.rs b/src/byte_scan.rs similarity index 97% rename from src/hpc/byte_scan.rs rename to src/byte_scan.rs index 4f692cc6..6667bf74 100644 --- a/src/hpc/byte_scan.rs +++ b/src/byte_scan.rs @@ -35,9 +35,9 @@ pub(crate) mod simd_impl { i += 32; } // Scalar tail - for j in i..n { - if haystack[j] == needle { - result.push(j); + for (offset, &byte) in haystack[i..n].iter().enumerate() { + if byte == needle { + result.push(i + offset); } } result @@ -68,9 +68,9 @@ pub(crate) mod simd_impl { i += 64; } // Scalar tail - for j in 
i..n { - if haystack[j] == needle { - result.push(j); + for (offset, &byte) in haystack[i..n].iter().enumerate() { + if byte == needle { + result.push(i + offset); } } result @@ -98,8 +98,8 @@ pub(crate) mod simd_impl { } i += 32; } - for j in i..n { - if haystack[j] == needle { + for &byte in &haystack[i..n] { + if byte == needle { total += 1; } } @@ -126,8 +126,8 @@ pub(crate) mod simd_impl { total += mask.count_ones() as usize; i += 64; } - for j in i..n { - if haystack[j] == needle { + for &byte in &haystack[i..n] { + if byte == needle { total += 1; } } diff --git a/src/hpc/distance.rs b/src/distance.rs similarity index 99% rename from src/hpc/distance.rs rename to src/distance.rs index 79f4229e..d85e3242 100644 --- a/src/hpc/distance.rs +++ b/src/distance.rs @@ -96,10 +96,10 @@ pub(crate) mod simd_impl { } // Scalar tail - for j in i..n { - let dx = query[0] - points[j][0]; - let dy = query[1] - points[j][1]; - let dz = query[2] - points[j][2]; + for p in &points[i..n] { + let dx = query[0] - p[0]; + let dy = query[1] - p[1]; + let dz = query[2] - p[2]; out.push(dx * dx + dy * dy + dz * dz); } } @@ -211,7 +211,7 @@ pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 { for i in 0..chunks { let va = F64x8::from_slice(&a[i * 8..]); let vb = F64x8::from_slice(&b[i * 8..]); - acc = acc + (va - vb).abs(); + acc += (va - vb).abs(); } let mut sum = acc.reduce_sum(); let offset = chunks * 8; diff --git a/src/hpc/heel_f64x8.rs b/src/heel_f64x8.rs similarity index 100% rename from src/hpc/heel_f64x8.rs rename to src/heel_f64x8.rs diff --git a/src/hpc/linalg/mod.rs b/src/hpc/linalg/mod.rs index 58608f40..a2963c94 100644 --- a/src/hpc/linalg/mod.rs +++ b/src/hpc/linalg/mod.rs @@ -40,7 +40,8 @@ //! //! - **No SIMD primitives** — use `crate::simd::{F32x16, …}` directly. //! - **No `#[target_feature]` annotations** — those live in `simd_avx512.rs`. -//! - **No distance metrics** — those live in `crate::hpc::distance`. +//! 
- **No distance metrics** — those live in `crate::distance` (graduated +//! from `crate::hpc::distance`; back-compat re-export in `crate::hpc::*`). mod matrix; pub use matrix::{Mat2, Mat3, Mat4, MatN, Spd2, Spd3}; diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index ff7981fc..3e195b8b 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -27,7 +27,8 @@ pub mod reductions; pub mod statistics; pub mod activations; pub mod hdc; -pub mod bitwise; +// Bitwise SIMD primitives — graduated to crate root. Back-compat re-export. +pub use crate::bitwise; pub mod projection; pub mod cogrecord; pub mod graph; @@ -56,8 +57,8 @@ pub mod soa; pub mod node; #[allow(missing_docs)] pub mod cascade; -#[allow(missing_docs)] -pub mod heel_f64x8; +// HEEL F64x8 distance kernels — graduated to crate root. Back-compat re-export. +pub use crate::heel_f64x8; // AMX is an x86_64-only ISA (Intel Sapphire Rapids+); both modules use // `asm!` with `rcx`/`rax` register names that don't exist on other // architectures (rejected at parse time on s390x / aarch64 / wasm32). @@ -169,22 +170,21 @@ pub mod parallel_search; // ZeckF64 progressive edge encoding + batch/top-k pub mod zeck; -// SIMD-accelerated spatial / byte-scan / hash utilities -pub mod distance; -pub mod byte_scan; -pub mod spatial_hash; - -// Variable-width palette index codec (Minecraft-style bit packing) -#[allow(missing_docs)] -pub mod palette_codec; - -// SIMD-accelerated HPC modules (block properties, nibble light data, AABB collision) -#[allow(missing_docs)] -pub mod property_mask; -#[allow(missing_docs)] -pub mod nibble; -#[allow(missing_docs)] -pub mod aabb; +// SIMD-accelerated spatial / byte-scan / hash utilities — graduated to crate root. +// Back-compat re-exports for existing `use ndarray::hpc::{distance,byte_scan,spatial_hash}::*`. +pub use crate::byte_scan; +pub use crate::distance; +pub use crate::spatial_hash; + +// Variable-width palette index codec — graduated to crate root. 
+// Back-compat re-export for existing `use ndarray::hpc::palette_codec::*`. +pub use crate::palette_codec; + +// SIMD-accelerated HPC modules (block properties, nibble light data, AABB +// collision) — all three graduated to crate root. Back-compat re-exports. +pub use crate::aabb; +pub use crate::nibble; +pub use crate::property_mask; // Holographic phase-space operations (ported from rustynum-holo) #[allow(missing_docs)] diff --git a/src/lib.rs b/src/lib.rs index 5b5851fd..c19d6fc7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -272,6 +272,143 @@ pub mod simd_amx; #[cfg(feature = "std")] pub mod simd_caps; +/// Bitwise SIMD primitives — popcount, Hamming distance over byte slices. +/// Graduated from `crate::hpc::bitwise::*` (substrate-tier; uses +/// `crate::simd::U64x8` polyfill internally). Back-compat re-export in +/// `crate::hpc::*` preserves existing import paths. +/// +/// # Example +/// +/// ``` +/// use ndarray::bitwise::{hamming_distance_raw, popcount_raw}; +/// let a = [0xFFu8; 16]; +/// let b = [0x00u8; 16]; +/// assert_eq!(hamming_distance_raw(&a, &b), 128); +/// assert_eq!(popcount_raw(&a), 128); +/// ``` +#[cfg(feature = "std")] +pub mod bitwise; + +/// F64x8 HEEL distance kernels — 8-plane weighted Hamming, f64 SIMD +/// dot / cosine / sum-of-squares. Graduated from `crate::hpc::heel_f64x8::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::heel_f64x8::{cosine_f64_simd, dot_f64_simd}; +/// let a = vec![1.0_f64; 32]; +/// let b = vec![1.0_f64; 32]; +/// assert!((cosine_f64_simd(&a, &b) - 1.0).abs() < 1e-10); +/// assert!((dot_f64_simd(&a, &b) - 32.0).abs() < 1e-10); +/// ``` +#[cfg(feature = "std")] +pub mod heel_f64x8; + +/// Batch distance computations — spatial 3D-point queries + +/// slice-shape L1 / L2 / L∞ (PR-X10 A6). Graduated from +/// `crate::hpc::distance::*`. 
+/// +/// # Example +/// +/// ``` +/// use ndarray::distance::{l1_f64_simd, l2_f64_simd, linf_f64_simd}; +/// let a = vec![3.0_f64, 0.0]; +/// let b = vec![0.0_f64, 4.0]; +/// assert!((l1_f64_simd(&a, &b) - 7.0).abs() < 1e-12); +/// assert!((l2_f64_simd(&a, &b) - 5.0).abs() < 1e-12); +/// assert!((linf_f64_simd(&a, &b) - 4.0).abs() < 1e-12); +/// ``` +#[cfg(feature = "std")] +pub mod distance; + +/// SIMD-accelerated byte-scan utilities — needle search, delimiter +/// finding, parallel byte comparison. Graduated from +/// `crate::hpc::byte_scan::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::byte_scan::byte_find_all; +/// let haystack = b"hello world, hello rust"; +/// let hits = byte_find_all(haystack, b'l'); +/// assert_eq!(hits, vec![2, 3, 9, 15, 16]); +/// ``` +#[cfg(feature = "std")] +pub mod byte_scan; + +/// SIMD-accelerated spatial hash — bucketing, candidate gather, hash +/// collision detection. Graduated from `crate::hpc::spatial_hash::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::spatial_hash::SpatialHash; +/// let mut grid = SpatialHash::new(1.0); +/// grid.insert(0, 0.0, 0.0, 0.0); +/// grid.insert(1, 10.0, 10.0, 10.0); +/// assert_eq!(grid.len(), 2); +/// ``` +#[cfg(feature = "std")] +pub mod spatial_hash; + +/// Axis-aligned bounding box batch operations — SIMD-accelerated +/// intersection, expansion, distance queries. Graduated from +/// `crate::hpc::aabb::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::aabb::Aabb; +/// let a = Aabb::new([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]); +/// let b = Aabb::new([0.5, 0.5, 0.5], [1.5, 1.5, 1.5]); +/// assert!(a.intersects(&b)); +/// ``` +#[cfg(feature = "std")] +pub mod aabb; + +/// Nibble batch operations for 4-bit packed data (light levels, palettes). +/// Graduated from `crate::hpc::nibble::*`. 
+/// +/// # Example +/// +/// ``` +/// use ndarray::nibble::nibble_unpack; +/// let unpacked = nibble_unpack(&[0x3A], 2); +/// assert_eq!(unpacked, vec![0xA, 0x3]); +/// ``` +#[cfg(feature = "std")] +pub mod nibble; + +/// Variable-width palette index codec (Minecraft-style bit packing). +/// Packs/unpacks palette indices (0–255) into 1–8 bit widths. +/// Graduated from `crate::hpc::palette_codec::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::palette_codec::bits_for_palette_size; +/// assert_eq!(bits_for_palette_size(2), 1); +/// assert_eq!(bits_for_palette_size(16), 4); +/// assert_eq!(bits_for_palette_size(256), 8); +/// ``` +#[cfg(feature = "std")] +pub mod palette_codec; + +/// Block property mask — compiled bitset queries on block state bits. +/// AVX-512 VPTERNLOGD tests 3 conditions in 1 cycle. Graduated from +/// `crate::hpc::property_mask::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::property_mask::PropertyMask; +/// let mask = PropertyMask::new().require_bit(0).forbid_bit(3); +/// assert!(mask.test(0b0000_0001)); // bit 0 set, bit 3 clear → match +/// assert!(!mask.test(0b0000_1001)); // bit 3 set → no match +/// ``` +#[cfg(feature = "std")] +pub mod property_mask; + #[cfg(feature = "std")] #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] pub mod simd_neon; diff --git a/src/hpc/nibble.rs b/src/nibble.rs similarity index 96% rename from src/hpc/nibble.rs rename to src/nibble.rs index 05659f95..560c7c72 100644 --- a/src/hpc/nibble.rs +++ b/src/nibble.rs @@ -21,7 +21,7 @@ /// assert_eq!(nibble_unpack(packed, 2), vec![0xA, 0x3]); /// ``` pub fn nibble_unpack(packed: &[u8], count: usize) -> Vec { - assert!(packed.len() >= (count + 1) / 2, "packed buffer too small"); + assert!(packed.len() >= count.div_ceil(2), "packed buffer too small"); let mut out = Vec::with_capacity(count); @@ -105,7 +105,7 @@ pub(crate) unsafe fn nibble_unpack_avx2(packed: &[u8], count: usize, out: &mut V /// assert_eq!(packed, 
vec![0x3A]); /// ``` pub fn nibble_pack(values: &[u8]) -> Vec { - let out_len = (values.len() + 1) / 2; + let out_len = values.len().div_ceil(2); let mut out = vec![0u8; out_len]; for (i, &v) in values.iter().enumerate() { @@ -175,10 +175,10 @@ unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) { let mut data = [0u8; 32]; data.copy_from_slice(&packed[offset..offset + 32]); - for j in 0..32 { - let lo = (data[j] & 0x0F).saturating_sub(delta); - let hi = ((data[j] >> 4) & 0x0F).saturating_sub(delta); - data[j] = lo | (hi << 4); + for byte in &mut data { + let lo = (*byte & 0x0F).saturating_sub(delta); + let hi = ((*byte >> 4) & 0x0F).saturating_sub(delta); + *byte = lo | (hi << 4); } packed[offset..offset + 32].copy_from_slice(&data); @@ -263,9 +263,9 @@ pub(crate) unsafe fn nibble_above_threshold_avx2(packed: &[u8], threshold: u8) - let base_byte = c * 32; let chunk = &packed[base_byte..base_byte + 32]; - for j in 0..32 { - let lo = chunk[j] & 0x0F; - let hi = (chunk[j] >> 4) & 0x0F; + for (j, &b) in chunk.iter().enumerate() { + let lo = b & 0x0F; + let hi = (b >> 4) & 0x0F; if lo > threshold { result.push((base_byte + j) * 2); } @@ -277,9 +277,9 @@ pub(crate) unsafe fn nibble_above_threshold_avx2(packed: &[u8], threshold: u8) - // Scalar tail let tail_start = chunks * 32; - for byte_idx in tail_start..packed.len() { - let lo = packed[byte_idx] & 0x0F; - let hi = packed[byte_idx] >> 4; + for (byte_idx, &b) in packed.iter().enumerate().skip(tail_start) { + let lo = b & 0x0F; + let hi = b >> 4; if lo > threshold { result.push(byte_idx * 2); } diff --git a/src/hpc/palette_codec.rs b/src/palette_codec.rs similarity index 98% rename from src/hpc/palette_codec.rs rename to src/palette_codec.rs index 9dc4d8a5..7447a76f 100644 --- a/src/hpc/palette_codec.rs +++ b/src/palette_codec.rs @@ -56,7 +56,7 @@ pub fn pack_indices(indices: &[u8], bits_per_index: usize) -> Vec { assert!(bits_per_index > 0 && bits_per_index <= 8, "bits_per_index must be 1..=8"); let 
indices_per_word = 64 / bits_per_index; - let n_words = (indices.len() + indices_per_word - 1) / indices_per_word; + let n_words = indices.len().div_ceil(indices_per_word); let mut packed = vec![0u64; n_words]; let mask = (1u64 << bits_per_index) - 1; @@ -110,7 +110,7 @@ pub fn pack_indices_bytes(indices: &[u8], bits_per_index: usize) -> Vec { /// /// Inverse of [`pack_indices_bytes`]. pub fn unpack_indices_bytes(packed: &[u8], bits_per_index: usize, count: usize) -> Vec { - let n_words = (packed.len() + 7) / 8; + let n_words = packed.len().div_ceil(8); let mut words = Vec::with_capacity(n_words); for chunk in packed.chunks(8) { let mut buf = [0u8; 8]; @@ -145,7 +145,7 @@ pub fn transcode(packed: &[u64], old_bits: usize, new_bits: usize, count: usize) let old_per_word = 64 / old_bits; let new_per_word = 64 / new_bits; - let n_new_words = (count + new_per_word - 1) / new_per_word; + let n_new_words = count.div_ceil(new_per_word); let old_mask = (1u64 << old_bits) - 1; let new_mask = (1u64 << new_bits) - 1; @@ -309,8 +309,7 @@ unsafe fn unpack_generic_avx512(packed: &[u64], bits_per_index: usize, count: us let mut result = Vec::with_capacity(count); let mut emitted = 0usize; - for word_idx in 0..packed.len() { - let word = packed[word_idx]; + for &word in packed { for slot in 0..indices_per_word { if emitted >= count { return result; @@ -336,7 +335,7 @@ unsafe fn unpack_generic_avx512(packed: &[u64], bits_per_index: usize, count: us unsafe fn pack_generic_avx512(indices: &[u8], bits_per_index: usize) -> Vec { assert!(bits_per_index > 0 && bits_per_index <= 8); let indices_per_word = 64 / bits_per_index; - let n_words = (indices.len() + indices_per_word - 1) / indices_per_word; + let n_words = indices.len().div_ceil(indices_per_word); let mask = (1u64 << bits_per_index) - 1; let mut packed = vec![0u64; n_words]; diff --git a/src/hpc/property_mask.rs b/src/property_mask.rs similarity index 99% rename from src/hpc/property_mask.rs rename to src/property_mask.rs index 
f16f6dab..063b709e 100644 --- a/src/hpc/property_mask.rs +++ b/src/property_mask.rs @@ -90,7 +90,7 @@ impl PropertyMask { /// The returned vector has `ceil(states.len() / 64)` entries. pub fn test_section(&self, states: &[u64]) -> Vec { let n = states.len(); - let result_len = (n + 63) / 64; + let result_len = n.div_ceil(64); let mut result = vec![0u64; result_len]; #[cfg(target_arch = "x86_64")] @@ -233,8 +233,8 @@ impl PropertyMask { } // Scalar tail - for i in (chunks * 8)..states.len() { - if self.test(states[i]) { + for &state in &states[chunks * 8..] { + if self.test(state) { total += 1; } } @@ -376,8 +376,7 @@ unsafe fn count_section_multi_avx512(masks: &[PropertyMask], states: &[u64]) -> } // Scalar tail - for i in (chunks * 8)..states.len() { - let state = states[i]; + for &state in &states[chunks * 8..] { for (m_idx, mask) in masks.iter().enumerate() { if mask.test(state) { counts[m_idx] += 1; diff --git a/src/hpc/spatial_hash.rs b/src/spatial_hash.rs similarity index 98% rename from src/hpc/spatial_hash.rs rename to src/spatial_hash.rs index ae4303a2..84a4f117 100644 --- a/src/hpc/spatial_hash.rs +++ b/src/spatial_hash.rs @@ -331,18 +331,19 @@ pub(crate) unsafe fn batch_sq_dist_avx2(query: [f32; 3], candidates: &[[f32; 3]] // Compare: d2 <= radius_sq (scalar array comparison — no F32x8 cmp polyfill) let d2_arr = d2.to_array(); - for lane in 0..8 { - if d2_arr[lane] <= radius_sq { - result.push((base + lane, d2_arr[lane])); + for (lane, &d2_lane) in d2_arr.iter().enumerate() { + if d2_lane <= radius_sq { + result.push((base + lane, d2_lane)); } } } // Scalar tail - for i in (chunks * 8)..candidates.len() { - let d2 = sq_dist_f32(query, candidates[i]); + let tail_start = chunks * 8; + for (offset, &cand) in candidates[tail_start..].iter().enumerate() { + let d2 = sq_dist_f32(query, cand); if d2 <= radius_sq { - result.push((i, d2)); + result.push((tail_start + offset, d2)); } }