Skip to content

Commit 5ee172a

Browse files
committed
feat(pr-x2): soa_struct! gains #[soa(pad_to_lanes = N)] attribute
Worker B of PR-X2 per .claude/knowledge/pr-x2-design.md § "Worker decomposition" line 459.

SIMD-staged kernels need each SoA field's underlying Vec length to be a multiple of the lane width N so the consumer walks the buffer with one uniform N-lane loop — no scalar tail-case branch. Pre-PR-X2 callers achieved this by hand (W3-W6 GaussianBatch::with_capacity + eager-zero fill); this PR makes it declarative on the field.

Macro surface:

    soa_struct! {
        pub struct Cells {
            #[soa(pad_to_lanes = 8)]
            pub palette: u8,
            pub label: u32, // unpadded
        }
    }
    let mut c = Cells::new();
    c.push(7, 100);
    assert_eq!(c.len(), 1);          // logical row count
    assert_eq!(c.palette.len(), 8);  // physical, rounded to lane 8
    assert_eq!(c.label.len(), 1);    // unpadded: physical == logical

Implementation:
- Added optional `$(#[soa(pad_to_lanes = $pad:literal)])?` per field in the macro_rules! head — Rust 1.32+ optional-meta repetition.
- Generated struct grows a private `_logical_len: usize` so `len()` / `is_empty()` return the **semantic** row count independent of any field's lane padding.
- `push()` dispatches per-field through internal `@push_field` arms:
  • padded arm grows the Vec to `(logical+1).div_ceil(N)*N` filling with `<$ty as Default>::default()`, then writes the new value at `[logical]`
  • plain arm is the pre-PR-X2 `Vec::push` call
  The dispatch uses macro_rules! tt-munching with literal-token separators (`pad = $pad`) so a single repetition handles both shapes.
- Compile-time guard: `const _: () = { assert!($pad > 0, ...) };` inside the padded arm — `pad_to_lanes = 0` is rejected at compile time, not at runtime.
- `with_capacity(cap)` reserves `cap` on each field but does NOT pre-pad; padding happens lazily on push (matches the original `with_capacity` semantics modulo the lane-tail).
- `clear()` resets `_logical_len` + `.clear()` on each field. Re-pushing rebuilds padding from scratch.

Breaking change: `len()` no longer mirrors `self.<field>.len()` after direct field mutation (e.g. `s.x.push(...)` bypasses `_logical_len`). The canonical entry point is the macro-generated `push`. Pre-existing `macro_public_visibility_passthrough` test updated to use `push`.

New tests (`src/hpc/soa.rs`, 5 added):
- pad_to_lanes_single_push_grows_to_lane — mixed cadence 8+16+none
- pad_to_lanes_crosses_lane_boundary — 9 pushes against lane 8
- pad_to_lanes_clear_resets_both — clear() round-trips
- pad_to_lanes_uniform_cadence — all-padded variant
- pad_to_lanes_with_capacity_empty — empty state invariants

Plus a `# Example — #[soa(pad_to_lanes = N)] field attribute` doctest on the `soa_struct!` macro itself.

Verified:
    cargo test -p ndarray --lib hpc::soa                               38 passed
    cargo test --doc -p ndarray hpc::soa                               14 passed
    cargo fmt --check                                                  clean
    cargo clippy --features approx,serde,rayon -- -D warnings         clean
1 parent fb95cb3 commit 5ee172a

1 file changed

Lines changed: 231 additions & 19 deletions

File tree

src/hpc/soa.rs

Lines changed: 231 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -323,59 +323,145 @@ impl<'a, T, const N: usize> Iterator for SoaChunks<'a, T, N> {
323323
/// assert_eq!(b.means_y.as_slice(), &[2.0, 5.0]);
324324
/// assert_eq!(b.means_z.as_slice(), &[3.0, 6.0]);
325325
/// ```
326+
///
327+
/// # Example — `#[soa(pad_to_lanes = N)]` field attribute (PR-X2 Worker B)
328+
///
329+
/// Tag a field with `#[soa(pad_to_lanes = N)]` to make `push` pad the
330+
/// underlying `Vec` up to the next multiple of `N` (filling with
331+
/// `Default::default()`). SIMD-staged kernels then walk the field with
332+
/// one uniform N-lane loop — no tail-case branch.
333+
///
334+
/// `len()` returns the **logical** row count (unchanged by padding);
335+
/// `self.<field>.len()` returns the **physical** Vec length. The difference
336+
/// is the lane-alignment tail.
337+
///
338+
/// ```
339+
/// use ndarray::soa_struct;
340+
///
341+
/// soa_struct! {
342+
/// pub struct Cells {
343+
/// #[soa(pad_to_lanes = 8)]
344+
/// pub palette: u8,
345+
/// pub label: u32, // unpadded
346+
/// }
347+
/// }
348+
///
349+
/// let mut c = Cells::new();
350+
/// c.push(7, 100);
351+
/// assert_eq!(c.len(), 1); // logical: 1 row
352+
/// assert_eq!(c.palette.len(), 8); // physical: rounded up to lane 8
353+
/// assert_eq!(c.label.len(), 1); // unpadded: physical == logical
354+
/// assert_eq!(c.palette[0], 7);
355+
/// assert_eq!(c.palette[1..8], [0u8; 7]); // padded tail is Default::default()
356+
/// ```
326357
#[macro_export]
327358
macro_rules! soa_struct {
328359
(
329360
$(#[$meta:meta])*
330361
$vis:vis struct $name:ident {
331-
$($field_vis:vis $field:ident : $ty:ty),* $(,)?
362+
$(
363+
$(#[soa(pad_to_lanes = $pad:literal)])?
364+
$field_vis:vis $field:ident : $ty:ty
365+
),* $(,)?
332366
}
333367
) => {
334368
$(#[$meta])*
335369
$vis struct $name {
336-
$($field_vis $field: ::std::vec::Vec<$ty>),*
370+
$($field_vis $field: ::std::vec::Vec<$ty>,)*
371+
/// Shared logical row count across all fields. Padded fields may
372+
/// have `self.<field>.len() > _logical_len` after `push`.
373+
/// Updated by `push` / `clear`; treat as private.
374+
#[doc(hidden)]
375+
_logical_len: usize,
337376
}
338377

339378
impl $name {
340379
/// Construct an empty instance.
341380
pub fn new() -> Self {
342-
Self { $($field: ::std::vec::Vec::new()),* }
381+
Self {
382+
$($field: ::std::vec::Vec::new(),)*
383+
_logical_len: 0,
384+
}
343385
}
344386

345387
/// Construct with each field pre-allocated to `cap`.
388+
///
389+
/// Padded fields per `#[soa(pad_to_lanes = N)]` get
390+
/// `cap` worth of physical capacity, not `cap.div_ceil(N) * N` —
391+
/// the lane padding happens lazily inside `push` so the up-front
392+
/// reservation is a hint, not a hard size guarantee.
346393
pub fn with_capacity(cap: usize) -> Self {
347-
Self { $($field: ::std::vec::Vec::with_capacity(cap)),* }
394+
Self {
395+
$($field: ::std::vec::Vec::with_capacity(cap),)*
396+
_logical_len: 0,
397+
}
348398
}
349399

350400
/// Append one row across all fields.
401+
///
402+
/// For fields tagged `#[soa(pad_to_lanes = N)]`, the underlying
403+
/// `Vec` is padded with `<$ty as Default>::default()` up to the
404+
/// next multiple of `N` before the new value is written. Padded
405+
/// elements occupy slots `[len() .. padded_len)` once `push` returns and
406+
/// are guaranteed to compare equal to `Default::default()`.
351407
#[allow(clippy::too_many_arguments)]
352408
pub fn push(&mut self, $($field: $ty),*) {
353-
$(self.$field.push($field);)*
409+
let logical = self._logical_len;
410+
$(
411+
$crate::soa_struct!(@push_field
412+
self, $field, $field, $ty, logical
413+
$(, pad = $pad)?
414+
);
415+
)*
416+
self._logical_len = logical + 1;
354417
}
355418

356-
/// Length (all fields share this length; debug-asserted).
419+
/// Logical row count (shared across all fields).
420+
///
421+
/// For padded fields this may be **less than** `self.<field>.len()`;
422+
/// the difference is the lane-alignment tail. Use `len()` for the
423+
/// semantic count, `self.<field>.len()` for the physical Vec length.
357424
pub fn len(&self) -> usize {
358-
let lens = [$(self.$field.len()),*];
359-
debug_assert!(
360-
lens.iter().all(|&l| l == lens[0]),
361-
concat!(stringify!($name), ": field-length invariant violated")
362-
);
363-
lens[0]
425+
self._logical_len
364426
}
365427

366-
/// Returns `true` if there are zero rows.
367-
pub fn is_empty(&self) -> bool { self.len() == 0 }
428+
/// Returns `true` if there are zero logical rows.
429+
pub fn is_empty(&self) -> bool { self._logical_len == 0 }
368430

369-
/// Clear all fields. Capacity is retained.
431+
/// Clear all fields. Capacity is retained; logical length resets to 0.
432+
///
433+
/// Padded fields' physical `Vec`s are cleared along with the
434+
/// unpadded ones — re-pushing into a cleared struct rebuilds the
435+
/// padding from scratch.
370436
pub fn clear(&mut self) {
371437
$(self.$field.clear();)*
438+
self._logical_len = 0;
372439
}
373440
}
374441

375442
impl ::std::default::Default for $name {
376443
fn default() -> Self { Self::new() }
377444
}
378445
};
446+
447+
// Internal — padded field push: grow Vec to the next multiple of $pad
448+
// with Default::default() before writing the new value at `logical`.
449+
(@push_field $self:ident, $vec:ident, $val:ident, $ty:ty, $logical:ident, pad = $pad:literal) => {{
450+
const _: () = {
451+
// Compile-time guard: pad_to_lanes = 0 is nonsensical.
452+
assert!($pad > 0, "soa_struct! #[soa(pad_to_lanes = N)] requires N > 0");
453+
};
454+
let needed = ($logical + 1).div_ceil($pad) * $pad;
455+
while $self.$vec.len() < needed {
456+
$self.$vec.push(<$ty as ::std::default::Default>::default());
457+
}
458+
$self.$vec[$logical] = $val;
459+
}};
460+
461+
// Internal — plain (unpadded) field push.
462+
(@push_field $self:ident, $vec:ident, $val:ident, $ty:ty, $logical:ident) => {{
463+
$self.$vec.push($val);
464+
}};
379465
}
380466

381467
/// Deinterleave an AoS slice into a [`SoaVec<U, N>`] by extracting `N`
@@ -791,12 +877,16 @@ mod tests {
791877
#[test]
792878
fn macro_public_visibility_passthrough() {
793879
// Soa3 has `pub` fields; verify the field is accessible
794-
// (compilation alone proves visibility).
880+
// (compilation alone proves visibility). Use the macro-generated
881+
// `push` (canonical entry point) so `_logical_len` stays in sync;
882+
// direct `s.x.push(...)` would bypass `_logical_len` since PR-X2 B.
795883
let mut s = Soa3::new();
796-
s.x.push(1.0);
797-
s.y.push(2.0);
798-
s.z.push(3.0);
884+
s.push(1.0, 2.0, 3.0);
799885
assert_eq!(s.len(), 1);
886+
// Field access works (visibility test):
887+
assert_eq!(s.x.as_slice(), &[1.0]);
888+
assert_eq!(s.y.as_slice(), &[2.0]);
889+
assert_eq!(s.z.as_slice(), &[3.0]);
800890
}
801891

802892
#[test]
@@ -994,6 +1084,128 @@ mod tests {
9941084
assert_eq!(back, aos);
9951085
}
9961086

1087+
// ------------------------------------------------------------------
1088+
// PR-X2 Worker B — `#[soa(pad_to_lanes = N)]` field attribute
1089+
// ------------------------------------------------------------------
1090+
1091+
soa_struct! {
1092+
/// 3-field SoA with two padded fields at different lane widths and
1093+
/// one unpadded field. Exercises the mixed-cadence macro arm.
1094+
pub struct PadMixed {
1095+
#[soa(pad_to_lanes = 8)]
1096+
pub palette: u8,
1097+
#[soa(pad_to_lanes = 16)]
1098+
pub depth: u16,
1099+
pub label: u32,
1100+
}
1101+
}
1102+
1103+
/// Single push into a `pad_to_lanes = 8` field rounds the physical Vec
1104+
/// up to 8 elements; logical len is 1.
1105+
#[test]
1106+
fn pad_to_lanes_single_push_grows_to_lane() {
1107+
let mut s = PadMixed::new();
1108+
s.push(7u8, 0x1234u16, 99u32);
1109+
assert_eq!(s.len(), 1, "logical len = 1");
1110+
assert_eq!(s.palette.len(), 8, "palette padded to lane 8");
1111+
assert_eq!(s.depth.len(), 16, "depth padded to lane 16");
1112+
assert_eq!(s.label.len(), 1, "label unpadded — physical = logical");
1113+
assert_eq!(s.palette[0], 7);
1114+
assert_eq!(s.depth[0], 0x1234);
1115+
assert_eq!(s.label[0], 99);
1116+
// Padded tail is Default::default().
1117+
for &b in &s.palette[1..8] {
1118+
assert_eq!(b, 0u8);
1119+
}
1120+
for &d in &s.depth[1..16] {
1121+
assert_eq!(d, 0u16);
1122+
}
1123+
}
1124+
1125+
/// Crossing a lane boundary on a padded field grows the Vec by another N.
1126+
#[test]
1127+
fn pad_to_lanes_crosses_lane_boundary() {
1128+
let mut s = PadMixed::new();
1129+
for i in 0..9u8 {
1130+
s.push(i, i as u16, i as u32);
1131+
}
1132+
assert_eq!(s.len(), 9);
1133+
// palette: 9 pushes → next multiple of 8 is 16
1134+
assert_eq!(s.palette.len(), 16);
1135+
// depth: 9 pushes → still inside lane 16
1136+
assert_eq!(s.depth.len(), 16);
1137+
// label: unpadded
1138+
assert_eq!(s.label.len(), 9);
1139+
// first 9 slots carry user values
1140+
for i in 0..9 {
1141+
assert_eq!(s.palette[i], i as u8);
1142+
assert_eq!(s.depth[i], i as u16);
1143+
assert_eq!(s.label[i], i as u32);
1144+
}
1145+
// tail is default-zeroed
1146+
for &b in &s.palette[9..16] {
1147+
assert_eq!(b, 0u8);
1148+
}
1149+
}
1150+
1151+
/// `clear()` resets `_logical_len` and clears the physical Vecs.
1152+
#[test]
1153+
fn pad_to_lanes_clear_resets_both() {
1154+
let mut s = PadMixed::new();
1155+
s.push(1, 2, 3);
1156+
s.push(4, 5, 6);
1157+
assert_eq!(s.len(), 2);
1158+
s.clear();
1159+
assert_eq!(s.len(), 0);
1160+
assert!(s.is_empty());
1161+
assert_eq!(s.palette.len(), 0);
1162+
assert_eq!(s.depth.len(), 0);
1163+
assert_eq!(s.label.len(), 0);
1164+
// Reuse after clear works — padding rebuilds from scratch.
1165+
s.push(99, 0xFFFF, 7);
1166+
assert_eq!(s.len(), 1);
1167+
assert_eq!(s.palette.len(), 8);
1168+
assert_eq!(s.depth.len(), 16);
1169+
}
1170+
1171+
soa_struct! {
1172+
/// All-padded variant — every field gets the same lane width.
1173+
pub struct PadUniform {
1174+
#[soa(pad_to_lanes = 4)]
1175+
pub a: i32,
1176+
#[soa(pad_to_lanes = 4)]
1177+
pub b: i32,
1178+
}
1179+
}
1180+
1181+
/// All-padded struct: every field grows in sync with the lane cadence.
1182+
#[test]
1183+
fn pad_to_lanes_uniform_cadence() {
1184+
let mut s = PadUniform::new();
1185+
s.push(10, 20);
1186+
s.push(30, 40);
1187+
s.push(50, 60);
1188+
assert_eq!(s.len(), 3);
1189+
// 3 pushes → next multiple of 4 is 4
1190+
assert_eq!(s.a.len(), 4);
1191+
assert_eq!(s.b.len(), 4);
1192+
assert_eq!(s.a[0..3], [10, 30, 50]);
1193+
assert_eq!(s.b[0..3], [20, 40, 60]);
1194+
assert_eq!(s.a[3], 0);
1195+
assert_eq!(s.b[3], 0);
1196+
}
1197+
1198+
/// `with_capacity` initialises an empty padded struct correctly.
1199+
#[test]
1200+
fn pad_to_lanes_with_capacity_empty() {
1201+
let s = PadMixed::with_capacity(64);
1202+
assert_eq!(s.len(), 0);
1203+
assert!(s.is_empty());
1204+
assert_eq!(s.palette.len(), 0);
1205+
assert_eq!(s.depth.len(), 0);
1206+
assert_eq!(s.label.len(), 0);
1207+
}
1208+
9971209
/// Inference-only entry: caller relies on closure return-type ascription,
9981210
/// no turbofish at all.
9991211
#[test]

0 commit comments

Comments
 (0)