From 4b52052f4dc733e1148c09f026fed28e386606c1 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 15:46:42 +0200 Subject: [PATCH 01/34] ore: add pager feature flag Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ore/Cargo.toml b/src/ore/Cargo.toml index 8078cef5ccf14..bc62bbaa442d0 100644 --- a/src/ore/Cargo.toml +++ b/src/ore/Cargo.toml @@ -93,6 +93,7 @@ mz-ore = { path = "../ore", features = ["id_gen", "chrono"] } proptest.workspace = true scopeguard.workspace = true serde_json.workspace = true +tempfile.workspace = true tokio.workspace = true tokio-test.workspace = true tracing-subscriber.workspace = true @@ -145,6 +146,7 @@ assert-no-tracing = [] assert = ["assert-no-tracing", "ctor", "tracing"] proptest = ["dep:proptest", "proptest-derive"] overflowing = ["assert"] +pager = ["dep:bytemuck", "libc", "rand", "dep:tracing"] [[test]] name = "future" From 13d3239fccbfde0731c84143b326c2967a3e5199 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:05:33 +0200 Subject: [PATCH 02/34] ore: skeleton mz_ore::pager module with Backend enum --- src/ore/src/lib.rs | 3 +++ src/ore/src/pager.rs | 51 +++++++++++++++++++++++++++++++++++++++ src/ore/src/pager/file.rs | 1 + src/ore/src/pager/swap.rs | 1 + 4 files changed, 56 insertions(+) create mode 100644 src/ore/src/pager.rs create mode 100644 src/ore/src/pager/file.rs create mode 100644 src/ore/src/pager/swap.rs diff --git a/src/ore/src/lib.rs b/src/ore/src/lib.rs index 9cbafa6350c88..2c8a4c0e8d946 100644 --- a/src/ore/src/lib.rs +++ b/src/ore/src/lib.rs @@ -62,6 +62,9 @@ pub mod option; #[cfg_attr(nightly_doc_features, doc(cfg(feature = "overflowing")))] #[cfg(feature = "overflowing")] pub mod overflowing; +#[cfg_attr(nightly_doc_features, doc(cfg(feature = "pager")))] +#[cfg(feature = "pager")] +pub mod pager; #[cfg(not(target_family = "wasm"))] #[cfg(feature = "panic")] pub mod panic; diff --git 
a/src/ore/src/pager.rs b/src/ore/src/pager.rs new file mode 100644 index 0000000000000..abc868938730f --- /dev/null +++ b/src/ore/src/pager.rs @@ -0,0 +1,51 @@ +//! Explicit pager for cold data. See `doc/developer/design/20260504_pager.md`. + +use std::sync::atomic::{AtomicU8, Ordering}; + +mod file; +mod swap; + +/// Selects which backend stores paged-out data. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum Backend { + /// Hold allocations resident; hint the kernel via `MADV_COLD`. + Swap, + /// Write to a named scratch file; no file descriptor retained. + File, +} + +const BACKEND_SWAP: u8 = 0; +const BACKEND_FILE: u8 = 1; + +static BACKEND: AtomicU8 = AtomicU8::new(BACKEND_SWAP); + +/// Returns the currently active backend. +pub fn backend() -> Backend { + match BACKEND.load(Ordering::Relaxed) { + BACKEND_SWAP => Backend::Swap, + BACKEND_FILE => Backend::File, + _ => unreachable!("BACKEND atomic holds invalid discriminant"), + } +} + +/// Sets the active backend for future `pageout` calls. Existing handles are unaffected. +pub fn set_backend(b: Backend) { + let raw = match b { + Backend::Swap => BACKEND_SWAP, + Backend::File => BACKEND_FILE, + }; + BACKEND.store(raw, Ordering::Relaxed); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[mz_ore::test] + fn backend_round_trip() { + set_backend(Backend::File); + assert_eq!(backend(), Backend::File); + set_backend(Backend::Swap); + assert_eq!(backend(), Backend::Swap); + } +} diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs new file mode 100644 index 0000000000000..fb75cb8233a62 --- /dev/null +++ b/src/ore/src/pager/file.rs @@ -0,0 +1 @@ +//! File backend for the pager. See `mz_ore::pager` for the public API. diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs new file mode 100644 index 0000000000000..f897c2e3c61c6 --- /dev/null +++ b/src/ore/src/pager/swap.rs @@ -0,0 +1 @@ +//! Swap backend for the pager. See `mz_ore::pager` for the public API. 
From 8dea6b449d00260a7491c0848d8c981a6b45667f Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:30:52 +0200 Subject: [PATCH 03/34] ore: pager scratch dir lifecycle and stale-subdir reaper Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager.rs | 2 + src/ore/src/pager/file.rs | 110 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index abc868938730f..572c34cc5a0ba 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -5,6 +5,8 @@ use std::sync::atomic::{AtomicU8, Ordering}; mod file; mod swap; +pub use file::set_scratch_dir; + /// Selects which backend stores paged-out data. #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum Backend { diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index fb75cb8233a62..bed87de44d992 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -1 +1,111 @@ //! File backend for the pager. See `mz_ore::pager` for the public API. + +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; +use std::sync::atomic::{AtomicU64, Ordering}; + +static SCRATCH_DIR: OnceLock<PathBuf> = OnceLock::new(); +static SUBDIR: OnceLock<PathBuf> = OnceLock::new(); +static SCRATCH_ID: AtomicU64 = AtomicU64::new(0); + +/// Configures the scratch directory for the file backend. Idempotent across multiple +/// calls with the same path; logs and ignores subsequent calls with a different path. 
+pub fn set_scratch_dir(root: PathBuf) { + if let Err(existing) = SCRATCH_DIR.set(root.clone()) { + if existing != root { + tracing::warn!( + ?root, + ?existing, + "mz_ore::pager scratch dir already set; ignoring", + ); + } + return; + } + if let Err(err) = init_subdir(&root) { + tracing::warn!(?root, %err, "mz_ore::pager: failed to initialize scratch subdir"); + } +} + +fn init_subdir(root: &Path) -> std::io::Result<()> { + let nonce: u64 = rand::random(); + let pid = std::process::id(); + let subdir = root.join(format!("mz-pager-{pid}-{nonce:016x}")); + std::fs::create_dir_all(&subdir)?; + let _ = SUBDIR.set(subdir); + reap_stale(root); + Ok(()) +} + +fn reap_stale(root: &Path) { + let entries = match std::fs::read_dir(root) { + Ok(e) => e, + Err(err) => { + tracing::warn!(?root, %err, "mz_ore::pager: scratch dir scan failed"); + return; + } + }; + for entry in entries.flatten() { + let name = entry.file_name(); + let name = match name.to_str() { + Some(s) => s, + None => continue, + }; + let Some(rest) = name.strip_prefix("mz-pager-") else { + continue; + }; + let pid: u32 = match rest.split_once('-').and_then(|(p, _)| p.parse().ok()) { + Some(p) => p, + None => continue, + }; + if pid == std::process::id() { + continue; + } + if std::path::Path::new(&format!("/proc/{pid}")).exists() { + continue; + } + if let Err(err) = std::fs::remove_dir_all(entry.path()) { + tracing::warn!(path = ?entry.path(), %err, "mz_ore::pager: reap failed"); + } + } +} + +pub(crate) fn scratch_path(id: u64) -> PathBuf { + SUBDIR + .get() + .expect("mz_ore::pager file backend used before set_scratch_dir") + .join(format!("{id}.bin")) +} + +pub(crate) fn alloc_scratch_id() -> u64 { + SCRATCH_ID.fetch_add(1, Ordering::Relaxed) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + static TEST_DIR: std::sync::OnceLock<tempfile::TempDir> = std::sync::OnceLock::new(); + + pub(super) fn shared_scratch() -> &'static std::path::Path { + let dir = TEST_DIR.get_or_init(|| 
tempdir().expect("tempdir")); + set_scratch_dir(dir.path().to_owned()); + dir.path() + } + + #[mz_ore::test] + fn set_scratch_dir_creates_subdir() { + let root = shared_scratch(); + let subdir = SUBDIR.get().expect("subdir was initialized"); + assert!(subdir.exists()); + assert!(subdir.starts_with(root)); + assert!( + subdir + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("mz-pager-") + ); + } +} From 01d5359aa93b0389b6c8ece6143ceed5aa8b95d7 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:36:11 +0200 Subject: [PATCH 04/34] ore: pager Handle type and inner storage scaffolding Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager.rs | 48 +++++++++++++++++++++++++++++++++++++++ src/ore/src/pager/file.rs | 26 +++++++++++++++++++++ src/ore/src/pager/swap.rs | 27 ++++++++++++++++++++++ 3 files changed, 101 insertions(+) diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 572c34cc5a0ba..7139d77f079b3 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -7,6 +7,54 @@ mod swap; pub use file::set_scratch_dir; +use crate::pager::file::FileInner; +use crate::pager::swap::SwapInner; + +/// An opaque handle to data paged out via [`pageout`]. The handle's backend variant +/// is fixed at `pageout` time and is independent of any later `set_backend` call. +#[derive(Debug)] +pub struct Handle { + inner: HandleInner, +} + +#[derive(Debug)] +enum HandleInner { + Swap(SwapInner), + File(FileInner), +} + +impl Handle { + /// Returns the logical length of the handle's payload in `u64`s. + pub fn len(&self) -> usize { + match &self.inner { + HandleInner::Swap(s) => *s.prefix.last().unwrap_or(&0), + HandleInner::File(f) => f.len_u64s, + } + } + + /// Returns the logical length of the handle's payload in bytes (`len() * 8`). + pub fn len_bytes(&self) -> usize { + self.len() * 8 + } + + /// Returns `true` if the handle holds no data. 
+ pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub(crate) fn from_swap(inner: SwapInner) -> Self { + Self { + inner: HandleInner::Swap(inner), + } + } + + pub(crate) fn from_file(inner: FileInner) -> Self { + Self { + inner: HandleInner::File(inner), + } + } +} + /// Selects which backend stores paged-out data. #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum Backend { diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index bed87de44d992..8b26bbd1d34aa 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -80,6 +80,32 @@ pub(crate) fn alloc_scratch_id() -> u64 { SCRATCH_ID.fetch_add(1, Ordering::Relaxed) } +/// Storage for a file-backed handle. The file at `scratch_path(id)` holds the bytes. +/// No file descriptor is retained. +#[derive(Debug)] +pub(crate) struct FileInner { + pub(crate) id: u64, + pub(crate) len_u64s: usize, +} + +impl FileInner { + pub(crate) fn new(id: u64, len_u64s: usize) -> Self { + Self { id, len_u64s } + } +} + +impl Drop for FileInner { + fn drop(&mut self) { + let path = scratch_path(self.id); + if let Err(err) = std::fs::remove_file(&path) { + // ENOENT is fine: a successful `take` already unlinked. + if err.kind() != std::io::ErrorKind::NotFound { + tracing::warn!(?path, %err, "mz_ore::pager: failed to unlink scratch file"); + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index f897c2e3c61c6..4857e7bf1964a 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -1 +1,28 @@ //! Swap backend for the pager. See `mz_ore::pager` for the public API. + +/// Storage for a swap-backed handle. +#[derive(Debug)] +pub(crate) struct SwapInner { + /// Logical chunks; logical layout is concatenation in this order. + pub(crate) chunks: Vec<Vec<u64>>, + /// Cumulative element counts; `prefix[i]` = sum of `chunks[..i]` lengths. + /// `prefix[0] == 0`, `prefix.last() == total_len`. 
+ pub(crate) prefix: Vec<usize>, +} + +impl SwapInner { + pub(crate) fn new(chunks: Vec<Vec<u64>>) -> Self { + let mut prefix = Vec::with_capacity(chunks.len() + 1); + prefix.push(0); + let mut sum = 0; + for c in &chunks { + sum += c.len(); + prefix.push(sum); + } + Self { chunks, prefix } + } + + pub(crate) fn total_len(&self) -> usize { + *self.prefix.last().unwrap_or(&0) + } +} From 027334bb6310922ef240c456e6eb603b8e40248b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:39:56 +0200 Subject: [PATCH 05/34] ore: pager swap backend pageout with MADV_COLD Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager/swap.rs | 69 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 4857e7bf1964a..ebcfb6f71f018 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -1,5 +1,7 @@ //! Swap backend for the pager. See `mz_ore::pager` for the public API. +use crate::pager::Handle; + /// Storage for a swap-backed handle. 
#[derive(Debug)] pub(crate) struct SwapInner { @@ -26,3 +28,70 @@ impl SwapInner { *self.prefix.last().unwrap_or(&0) } } + +pub(crate) fn pageout_swap(chunks: &mut [Vec<u64>]) -> Handle { + let mut taken: Vec<Vec<u64>> = Vec::with_capacity(chunks.len()); + for c in chunks.iter_mut() { + taken.push(std::mem::take(c)); + } + for c in &taken { + madvise_cold(c); + } + Handle::from_swap(SwapInner::new(taken)) +} + +#[cfg(target_os = "linux")] +fn madvise_cold(chunk: &[u64]) { + if chunk.is_empty() { + return; + } + let page = page_size(); + let ptr = chunk.as_ptr() as usize; + let len_bytes = chunk.len() * std::mem::size_of::<u64>(); + let aligned_start = (ptr + page - 1) & !(page - 1); + let aligned_end = (ptr + len_bytes) & !(page - 1); + if aligned_end <= aligned_start { + return; + } + // SAFETY: pointer/length come from a live `&[u64]`; we restrict to a fully + // page-aligned subrange contained within that slice; `MADV_COLD` does not + // mutate the contents. + unsafe { + libc::madvise( + aligned_start as *mut libc::c_void, + aligned_end - aligned_start, + libc::MADV_COLD, + ); + } +} + +#[cfg(not(target_os = "linux"))] +fn madvise_cold(_chunk: &[u64]) {} + +#[cfg(target_os = "linux")] +fn page_size() -> usize { + // SAFETY: `sysconf` with a valid argument is safe. 
+ unsafe { libc::sysconf(libc::_SC_PAGESIZE) as usize } +} + +#[cfg(not(target_os = "linux"))] +fn page_size() -> usize { + 4096 +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::pager::Handle; + + #[mz_ore::test] + fn pageout_takes_chunks_and_records_lengths() { + let a = vec![1u64, 2, 3]; + let b = vec![4u64, 5]; + let mut chunks = [a, b]; + let h: Handle = pageout_swap(&mut chunks); + assert_eq!(h.len(), 5); + assert!(chunks[0].is_empty()); + assert!(chunks[1].is_empty()); + } +} From 1c27263aa4e07dee2b5a8e1707f542d34c5f2ef2 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:44:36 +0200 Subject: [PATCH 06/34] ore: pager swap backend read_at_many Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager.rs | 14 ++++++++ src/ore/src/pager/swap.rs | 75 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 7139d77f079b3..303fa99568801 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -53,6 +53,20 @@ impl Handle { inner: HandleInner::File(inner), } } + + pub(crate) fn swap_inner(&self) -> Option<&SwapInner> { + match &self.inner { + HandleInner::Swap(s) => Some(s), + _ => None, + } + } + + pub(crate) fn file_inner(&self) -> Option<&FileInner> { + match &self.inner { + HandleInner::File(f) => Some(f), + _ => None, + } + } } /// Selects which backend stores paged-out data. 
diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index ebcfb6f71f018..5a31898680026 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -79,6 +79,45 @@ fn page_size() -> usize { 4096 } +pub(crate) fn read_at_swap(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec<u64>) { + let inner = handle + .swap_inner() + .expect("read_at_swap called on non-swap handle"); + let total = inner.total_len(); + let total_out: usize = ranges.iter().map(|(_, l)| *l).sum(); + dst.reserve(total_out); + for &(off, len) in ranges { + let end = off.checked_add(len).expect("range offset+len overflow"); + assert!( + end <= total, + "read range out of bounds: {off}+{len} > {total}" + ); + copy_range(inner, off, len, dst); + } +} + +fn copy_range(inner: &SwapInner, off: usize, len: usize, dst: &mut Vec<u64>) { + if len == 0 { + return; + } + let mut remaining = len; + let mut cur = off; + let mut idx = match inner.prefix.binary_search(&cur) { + Ok(i) => i, + Err(i) => i.saturating_sub(1), + }; + while remaining > 0 { + let chunk_start = inner.prefix[idx]; + let chunk = &inner.chunks[idx]; + let local = cur - chunk_start; + let take = std::cmp::min(remaining, chunk.len() - local); + dst.extend_from_slice(&chunk[local..local + take]); + cur += take; + remaining -= take; + idx += 1; + } +} + #[cfg(test)] mod tests { use super::*; @@ -94,4 +133,40 @@ mod tests { assert!(chunks[0].is_empty()); assert!(chunks[1].is_empty()); } + + #[mz_ore::test] + fn read_at_within_single_chunk() { + let mut chunks = [vec![10u64, 11, 12, 13, 14]]; + let h = pageout_swap(&mut chunks); + let mut dst = Vec::new(); + read_at_swap(&h, &[(1, 3)], &mut dst); + assert_eq!(dst, vec![11, 12, 13]); + } + + #[mz_ore::test] + fn read_at_spans_chunks() { + let mut chunks = [vec![1u64, 2, 3], vec![4, 5, 6]]; + let h = pageout_swap(&mut chunks); + let mut dst = Vec::new(); + read_at_swap(&h, &[(2, 3)], &mut dst); + assert_eq!(dst, vec![3, 4, 5]); + } + + #[mz_ore::test] + fn 
read_at_many_concats() { + let mut chunks = [vec![1u64, 2, 3, 4, 5]]; + let h = pageout_swap(&mut chunks); + let mut dst = Vec::new(); + read_at_swap(&h, &[(0, 2), (3, 2)], &mut dst); + assert_eq!(dst, vec![1, 2, 4, 5]); + } + + #[mz_ore::test] + #[should_panic(expected = "out of bounds")] + fn read_at_panics_on_oob() { + let mut chunks = [vec![1u64, 2]]; + let h = pageout_swap(&mut chunks); + let mut dst = Vec::new(); + read_at_swap(&h, &[(1, 5)], &mut dst); + } } From c1522ab3188c373b17f5ca060a389e5b0dbfb34e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:45:06 +0200 Subject: [PATCH 07/34] ore: pager swap backend take with zero-copy fast path Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager.rs | 14 +++++++++++++ src/ore/src/pager/swap.rs | 44 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 303fa99568801..693fc3773c9ca 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -67,6 +67,20 @@ impl Handle { _ => None, } } + + pub(crate) fn into_swap_inner(self) -> Option<SwapInner> { + match self.inner { + HandleInner::Swap(s) => Some(s), + _ => None, + } + } + + pub(crate) fn into_file_inner(self) -> Option<FileInner> { + match self.inner { + HandleInner::File(f) => Some(f), + _ => None, + } + } } /// Selects which backend stores paged-out data. 
diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 5a31898680026..7036544aead72 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -118,6 +118,25 @@ fn copy_range(inner: &SwapInner, off: usize, len: usize, dst: &mut Vec<u64>) { } } +pub(crate) fn take_swap(handle: Handle, dst: &mut Vec<u64>) { + let inner = match handle.into_swap_inner() { + Some(s) => s, + None => panic!("take_swap called on non-swap handle"), + }; + dst.clear(); + let mut chunks = inner.chunks; + if chunks.len() == 1 && dst.capacity() == 0 { + let only = chunks.pop().unwrap(); + *dst = only; + return; + } + let total: usize = chunks.iter().map(|c| c.len()).sum(); + dst.reserve(total); + for c in chunks { + dst.extend_from_slice(&c); + } +} + #[cfg(test)] mod tests { use super::*; @@ -169,4 +188,29 @@ mod tests { let mut dst = Vec::new(); read_at_swap(&h, &[(1, 5)], &mut dst); } + + #[mz_ore::test] + fn take_single_chunk_zero_copy() { + let v = vec![100u64; 1024]; + let ptr_before = v.as_ptr(); + let mut chunks = [v]; + let h = pageout_swap(&mut chunks); + let mut dst = Vec::new(); + take_swap(h, &mut dst); + assert_eq!(dst.len(), 1024); + assert_eq!( + dst.as_ptr(), + ptr_before, + "single-chunk take should be zero-copy" + ); + } + + #[mz_ore::test] + fn take_multi_chunk_concats() { + let mut chunks = [vec![1u64, 2], vec![3, 4, 5]]; + let h = pageout_swap(&mut chunks); + let mut dst = Vec::new(); + take_swap(h, &mut dst); + assert_eq!(dst, vec![1, 2, 3, 4, 5]); + } } From b9e1d10a3f5953955365227f7e6f36bad7e67696 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:45:40 +0200 Subject: [PATCH 08/34] ore: pager public dispatch surface (pageout/read_at/take) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager.rs | 63 +++++++++++++++++++++++++++++++++++++++ src/ore/src/pager/file.rs | 14 +++++++++ 2 files changed, 77 insertions(+) diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 693fc3773c9ca..e718cf49f7819 100644 --- 
a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -115,6 +115,47 @@ pub fn set_backend(b: Backend) { BACKEND.store(raw, Ordering::Relaxed); } +/// Scatter pageout. Logical layout = chunks concatenated in order. +/// After return, each `Vec` in `chunks` is empty. +/// File backend preserves capacity; swap backend moves the alloc into the handle. +/// Empty input returns a `len == 0` handle and performs no I/O. +pub fn pageout(chunks: &mut [Vec<u64>]) -> Handle { + if total_len(chunks) == 0 { + return Handle::from_swap(SwapInner::new(Vec::new())); + } + match backend() { + Backend::Swap => swap::pageout_swap(chunks), + Backend::File => file::pageout_file(chunks), + } +} + +/// Reads multiple ranges. Output appended to `dst` in request order (concat). +/// Panics if any range is out of bounds. +pub fn read_at_many(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec<u64>) { + match &handle.inner { + HandleInner::Swap(_) => swap::read_at_swap(handle, ranges, dst), + HandleInner::File(_) => file::read_at_file(handle, ranges, dst), + } +} + +/// Reads a single range. Convenience wrapper around `read_at_many`. +pub fn read_at(handle: &Handle, offset: usize, len: usize, dst: &mut Vec<u64>) { + read_at_many(handle, &[(offset, len)], dst); +} + +/// Consumes handle, writing the entire payload into `dst` (cleared first), then reclaims storage. +/// Swap fast path: single-chunk handle into empty `dst` swaps in place, no copy. 
+pub fn take(handle: Handle, dst: &mut Vec<u64>) { + match &handle.inner { + HandleInner::Swap(_) => swap::take_swap(handle, dst), + HandleInner::File(_) => file::take_file(handle, dst), + } +} + +fn total_len(chunks: &[Vec<u64>]) -> usize { + chunks.iter().map(|c| c.len()).sum() +} + #[cfg(test)] mod tests { use super::*; @@ -127,3 +168,25 @@ mod tests { assert_eq!(backend(), Backend::Swap); } } + +#[cfg(test)] +mod dispatch_tests { + use super::*; + + #[mz_ore::test] + fn end_to_end_swap() { + set_backend(Backend::Swap); + let mut chunks = [vec![1u64, 2, 3, 4]]; + let h = pageout(&mut chunks); + assert_eq!(h.len(), 4); + assert!(chunks[0].is_empty()); + + let mut dst = Vec::new(); + read_at(&h, 1, 2, &mut dst); + assert_eq!(dst, vec![2, 3]); + + let mut dst2 = Vec::new(); + take(h, &mut dst2); + assert_eq!(dst2, vec![1, 2, 3, 4]); + } +} diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 8b26bbd1d34aa..34a5d1c28691c 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -106,6 +106,20 @@ impl Drop for FileInner { } } +use crate::pager::Handle; + +pub(crate) fn pageout_file(_chunks: &mut [Vec<u64>]) -> Handle { + unimplemented!("file backend pageout: see Task 9") +} + +pub(crate) fn read_at_file(_h: &Handle, _ranges: &[(usize, usize)], _dst: &mut Vec<u64>) { + unimplemented!("file backend read_at: see Task 10") +} + +pub(crate) fn take_file(_h: Handle, _dst: &mut Vec<u64>) { + unimplemented!("file backend take: see Task 11") +} + #[cfg(test)] mod tests { use super::*; From 70f035ceaf63efe0d85c821b49af994df0d48d35 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:47:24 +0200 Subject: [PATCH 09/34] ore: pager file backend pageout with pwritev Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager/file.rs | 140 +++++++++++++++++++++++++++++++++++--- 1 file changed, 131 insertions(+), 9 deletions(-) diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 34a5d1c28691c..38263b734d070 100644 --- 
a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -1,28 +1,31 @@ //! File backend for the pager. See `mz_ore::pager` for the public API. use std::path::{Path, PathBuf}; -use std::sync::OnceLock; use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Once, OnceLock}; static SCRATCH_DIR: OnceLock<PathBuf> = OnceLock::new(); static SUBDIR: OnceLock<PathBuf> = OnceLock::new(); static SCRATCH_ID: AtomicU64 = AtomicU64::new(0); +static SCRATCH_INIT: Once = Once::new(); /// Configures the scratch directory for the file backend. Idempotent across multiple /// calls with the same path; logs and ignores subsequent calls with a different path. pub fn set_scratch_dir(root: PathBuf) { - if let Err(existing) = SCRATCH_DIR.set(root.clone()) { - if existing != root { + SCRATCH_INIT.call_once(|| { + if let Err(err) = init_subdir(&root) { + tracing::warn!(?root, %err, "mz_ore::pager: failed to initialize scratch subdir"); + } + let _ = SCRATCH_DIR.set(root.clone()); + }); + if let Some(existing) = SCRATCH_DIR.get() { + if *existing != root { tracing::warn!( ?root, ?existing, "mz_ore::pager scratch dir already set; ignoring", ); } - return; - } - if let Err(err) = init_subdir(&root) { - tracing::warn!(?root, %err, "mz_ore::pager: failed to initialize scratch subdir"); } } @@ -106,10 +109,99 @@ impl Drop for FileInner { } } +use std::fs::File; +use std::io::IoSlice; + use crate::pager::Handle; +use crate::pager::swap::{SwapInner, pageout_swap}; -pub(crate) fn pageout_file(_chunks: &mut [Vec<u64>]) -> Handle { - unimplemented!("file backend pageout: see Task 9") +pub(crate) fn pageout_file(chunks: &mut [Vec<u64>]) -> Handle { + let total: usize = chunks.iter().map(|c| c.len()).sum(); + if total == 0 { + return Handle::from_swap(SwapInner::new(Vec::new())); + } + let id = alloc_scratch_id(); + let path = scratch_path(id); + match write_chunks(&path, chunks) { + Ok(()) => { + for c in chunks.iter_mut() { + c.clear(); + } + Handle::from_file(FileInner::new(id, total)) + } + Err(err) => { + 
tracing::warn!(?path, %err, "mz_ore::pager: file pageout failed; falling back to swap"); + let _ = std::fs::remove_file(&path); + pageout_swap(chunks) + } + } +} + +fn write_chunks(path: &Path, chunks: &[Vec<u64>]) -> std::io::Result<()> { + let file = File::options().write(true).create_new(true).open(path)?; + let slices: Vec<IoSlice<'_>> = chunks + .iter() + .filter(|c| !c.is_empty()) + .map(|c| IoSlice::new(bytemuck::cast_slice(c.as_slice()))) + .collect(); + write_all_vectored(&file, &slices)?; + Ok(()) +} + +#[cfg(unix)] +fn write_all_vectored(file: &File, slices: &[IoSlice<'_>]) -> std::io::Result<()> { + use std::os::unix::io::AsRawFd; + let fd = file.as_raw_fd(); + let mut offset: i64 = 0; + let mut idx = 0; + let mut consumed_in_idx: usize = 0; + while idx < slices.len() { + let remaining = &slices[idx..]; + let iovs: Vec<libc::iovec> = remaining + .iter() + .enumerate() + .map(|(i, s)| { + let base_off = if i == 0 { consumed_in_idx } else { 0 }; + // SAFETY: building an iovec from a live `IoSlice` is safe; + // the pointer/length describe the caller's buffer. + libc::iovec { + iov_base: unsafe { s.as_ptr().add(base_off) } as *mut libc::c_void, + iov_len: s.len() - base_off, + } + }) + .collect(); + // SAFETY: fd is valid and open for writing; iovs point into the live `slices` + // owned by the caller; pwritev does not retain pointers past the syscall. 
+ let written = + unsafe { libc::pwritev(fd, iovs.as_ptr(), iovs.len() as libc::c_int, offset) }; + if written < 0 { + return Err(std::io::Error::last_os_error()); + } + let mut left = written as usize; + offset += written as i64; + while left > 0 && idx < slices.len() { + let avail = slices[idx].len() - consumed_in_idx; + if left >= avail { + left -= avail; + idx += 1; + consumed_in_idx = 0; + } else { + consumed_in_idx += left; + left = 0; + } + } + } + Ok(()) +} + +#[cfg(not(unix))] +fn write_all_vectored(file: &File, slices: &[IoSlice<'_>]) -> std::io::Result<()> { + use std::io::Write; + let mut file = file; + for s in slices { + file.write_all(s)?; + } + Ok(()) } pub(crate) fn read_at_file(_h: &Handle, _ranges: &[(usize, usize)], _dst: &mut Vec<u64>) { @@ -120,6 +212,36 @@ pub(crate) fn take_file(_h: Handle, _dst: &mut Vec<u64>) { unimplemented!("file backend take: see Task 11") } +#[cfg(test)] +mod backend_tests { + use super::*; + + fn setup_dir() { + let _ = super::tests::shared_scratch(); + } + + #[mz_ore::test] + fn pageout_writes_file_and_clears_capacity() { + setup_dir(); + let mut chunks = [vec![10u64, 20, 30], vec![40, 50]]; + let cap_before_0 = chunks[0].capacity(); + let cap_before_1 = chunks[1].capacity(); + let h = pageout_file(&mut chunks); + assert_eq!(h.len(), 5); + assert!(chunks[0].is_empty()); + assert!(chunks[1].is_empty()); + // File backend preserves capacity: + assert_eq!(chunks[0].capacity(), cap_before_0); + assert_eq!(chunks[1].capacity(), cap_before_1); + + let inner = h.file_inner().expect("file inner"); + let path = scratch_path(inner.id); + assert!(path.exists()); + let bytes = std::fs::read(&path).expect("read scratch"); + assert_eq!(bytes.len(), 5 * 8); + } +} + #[cfg(test)] mod tests { use super::*; From 8c5b92b4373ad2124bf89d34d10ebcd6c9777839 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:48:33 +0200 Subject: [PATCH 10/34] ore: pager file backend read_at_many with coalescing Co-Authored-By: Claude Opus 4.7 (1M 
context) --- src/ore/src/pager/file.rs | 85 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 38263b734d070..d0a57077cfb8b 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -204,8 +204,58 @@ fn write_all_vectored(file: &File, slices: &[IoSlice<'_>]) -> std::io::Result<() Ok(()) } -pub(crate) fn read_at_file(_h: &Handle, _ranges: &[(usize, usize)], _dst: &mut Vec<u64>) { - unimplemented!("file backend read_at: see Task 10") +pub(crate) fn read_at_file(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec<u64>) { + use std::os::unix::fs::FileExt; + + let inner = handle + .file_inner() + .expect("read_at_file called on non-file handle"); + let total = inner.len_u64s; + for &(off, len) in ranges { + let end = off.checked_add(len).expect("range offset+len overflow"); + assert!( + end <= total, + "read range out of bounds: {off}+{len} > {total}" + ); + } + let path = scratch_path(inner.id); + let file = match File::open(&path) { + Ok(f) => f, + Err(err) => panic!("mz_ore::pager: failed to open scratch file {path:?}: {err}"), + }; + + let coalesced = coalesce(ranges); + for (off, len) in coalesced { + let byte_off = (off * 8) as u64; + let byte_len = len * 8; + let buf_start = dst.len(); + dst.resize(buf_start + len, 0); + let buf: &mut [u8] = bytemuck::cast_slice_mut(&mut dst[buf_start..buf_start + len]); + let mut filled = 0; + while filled < byte_len { + let n = file + .read_at(&mut buf[filled..byte_len], byte_off + filled as u64) + .expect("pager pread failed"); + if n == 0 { + panic!("pager pread short: expected {byte_len} got {filled}"); + } + filled += n; + } + } +} + +fn coalesce(ranges: &[(usize, usize)]) -> Vec<(usize, usize)> { + let mut out: Vec<(usize, usize)> = Vec::with_capacity(ranges.len()); + for &(off, len) in ranges { + if let Some(last) = out.last_mut() { + if last.0 + last.1 == off { + last.1 += len; + continue; + } + } + 
out.push((off, len)); + } + out } pub(crate) fn take_file(_h: Handle, _dst: &mut Vec<u64>) { @@ -240,6 +290,37 @@ mod backend_tests { let bytes = std::fs::read(&path).expect("read scratch"); assert_eq!(bytes.len(), 5 * 8); } + + #[mz_ore::test] + fn file_read_at_basic() { + setup_dir(); + let mut chunks = [vec![1u64, 2, 3, 4, 5]]; + let h = pageout_file(&mut chunks); + let mut dst = Vec::new(); + read_at_file(&h, &[(1, 3)], &mut dst); + assert_eq!(dst, vec![2, 3, 4]); + } + + #[mz_ore::test] + fn file_read_at_many_concats_and_coalesces() { + setup_dir(); + let mut chunks = [vec![10u64, 20, 30, 40, 50, 60]]; + let h = pageout_file(&mut chunks); + let mut dst = Vec::new(); + // (0,2) and (2,2) are adjacent => single pread internally. + read_at_file(&h, &[(0, 2), (2, 2), (5, 1)], &mut dst); + assert_eq!(dst, vec![10, 20, 30, 40, 60]); + } + + #[mz_ore::test] + #[should_panic(expected = "out of bounds")] + fn file_read_at_panics_on_oob() { + setup_dir(); + let mut chunks = [vec![1u64, 2]]; + let h = pageout_file(&mut chunks); + let mut dst = Vec::new(); + read_at_file(&h, &[(0, 99)], &mut dst); + } } #[cfg(test)] From 8db7d6c9b5f8430f2dd65313b18ebd9c5ae7f15b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:49:14 +0200 Subject: [PATCH 11/34] ore: pager file backend take and drop reclaim Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/src/pager/file.rs | 52 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index d0a57077cfb8b..4a18167e6a06e 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -258,8 +258,30 @@ fn coalesce(ranges: &[(usize, usize)]) -> Vec<(usize, usize)> { out } -pub(crate) fn take_file(_h: Handle, _dst: &mut Vec<u64>) { - unimplemented!("file backend take: see Task 11") +pub(crate) fn take_file(handle: Handle, dst: &mut Vec<u64>) { + use std::os::unix::fs::FileExt; + + let inner = handle + .into_file_inner() + 
.expect("take_file called on non-file handle"); + dst.clear(); + let path = scratch_path(inner.id); + let file = File::open(&path).unwrap_or_else(|err| panic!("pager take: open {path:?}: {err}")); + dst.resize(inner.len_u64s, 0); + let buf: &mut [u8] = bytemuck::cast_slice_mut(dst.as_mut_slice()); + let mut filled = 0; + while filled < buf.len() { + let n = file + .read_at(&mut buf[filled..], filled as u64) + .unwrap_or_else(|err| panic!("pager take: pread {path:?}: {err}")); + if n == 0 { + panic!("pager take: short read at {filled}"); + } + filled += n; + } + drop(file); + // FileInner::drop will unlink the scratch file. + drop(inner); } #[cfg(test)] @@ -321,6 +343,32 @@ mod backend_tests { let mut dst = Vec::new(); read_at_file(&h, &[(0, 99)], &mut dst); } + + #[mz_ore::test] + fn file_take_returns_data_and_unlinks() { + setup_dir(); + let mut chunks = [vec![7u64; 100]]; + let h = pageout_file(&mut chunks); + let inner_id = h.file_inner().unwrap().id; + let path = scratch_path(inner_id); + assert!(path.exists()); + let mut dst = Vec::new(); + take_file(h, &mut dst); + assert_eq!(dst, vec![7u64; 100]); + assert!(!path.exists(), "scratch file should be unlinked after take"); + } + + #[mz_ore::test] + fn file_drop_unlinks_when_not_taken() { + setup_dir(); + let mut chunks = [vec![1u64, 2, 3]]; + let h = pageout_file(&mut chunks); + let id = h.file_inner().unwrap().id; + let path = scratch_path(id); + assert!(path.exists()); + drop(h); + assert!(!path.exists(), "scratch file should be unlinked on drop"); + } } #[cfg(test)] From 83f8c34b77d45b243f918363230d322739caf362 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:52:00 +0200 Subject: [PATCH 12/34] ore: pager cross-backend integration tests Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/tests/pager_integration.rs | 74 ++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 src/ore/tests/pager_integration.rs diff --git a/src/ore/tests/pager_integration.rs 
b/src/ore/tests/pager_integration.rs new file mode 100644 index 0000000000000..788c8f69e6554 --- /dev/null +++ b/src/ore/tests/pager_integration.rs @@ -0,0 +1,74 @@ +#![cfg(feature = "pager")] + +use mz_ore::pager::{Backend, Handle, pageout, read_at, set_backend, set_scratch_dir, take}; +use tempfile::tempdir; + +fn ensure_scratch() { + static INIT: std::sync::OnceLock<tempfile::TempDir> = std::sync::OnceLock::new(); + let dir = INIT.get_or_init(|| tempdir().unwrap()); + set_scratch_dir(dir.path().to_owned()); +} + +#[test] +fn round_trip_swap() { + set_backend(Backend::Swap); + let payload: Vec<u64> = (0..1024).collect(); + let mut chunks = [payload.clone()]; + let h = pageout(&mut chunks); + let mut dst = Vec::new(); + take(h, &mut dst); + assert_eq!(dst, payload); +} + +#[test] +fn round_trip_file() { + ensure_scratch(); + set_backend(Backend::File); + let payload: Vec<u64> = (0..4096).collect(); + let mut chunks = [payload.clone()]; + let h = pageout(&mut chunks); + let mut dst = Vec::new(); + take(h, &mut dst); + assert_eq!(dst, payload); + set_backend(Backend::Swap); +} + +#[test] +fn handle_survives_backend_flip() { + ensure_scratch(); + set_backend(Backend::File); + let payload: Vec<u64> = (0..256).collect(); + let mut chunks = [payload.clone()]; + let h: Handle = pageout(&mut chunks); + + // Flip to Swap; existing handle should still be readable as File.
+ set_backend(Backend::Swap); + + let mut dst = Vec::new(); + read_at(&h, 0, payload.len(), &mut dst); + assert_eq!(dst, payload); + + let mut dst2 = Vec::new(); + take(h, &mut dst2); + assert_eq!(dst2, payload); +} + +#[test] +fn empty_input_yields_zero_len_handle() { + set_backend(Backend::Swap); + let mut chunks: [Vec<u64>; 0] = []; + let h = pageout(&mut chunks); + assert_eq!(h.len(), 0); + assert!(h.is_empty()); +} + +#[test] +fn scatter_round_trip() { + set_backend(Backend::Swap); + let mut chunks = [vec![1u64, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; + let h = pageout(&mut chunks); + assert_eq!(h.len(), 9); + let mut dst = Vec::new(); + take(h, &mut dst); + assert_eq!(dst, vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); +} From 84189cd63ad0245bcbf085a5ab944c7cbe8e9cbc Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:53:42 +0200 Subject: [PATCH 13/34] ore: pager Criterion bench harness Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/Cargo.toml | 5 ++ src/ore/benches/pager.rs | 115 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 src/ore/benches/pager.rs diff --git a/src/ore/Cargo.toml b/src/ore/Cargo.toml index bc62bbaa442d0..d446f694bc2d4 100644 --- a/src/ore/Cargo.toml +++ b/src/ore/Cargo.toml @@ -169,6 +169,11 @@ name = "bytes" harness = false required-features = ["bytes", "region", "tracing"] +[[bench]] +name = "pager" +harness = false +required-features = ["pager"] + [package.metadata.cargo-udeps.ignore] # Only used in doc-tests.
development = ["tokio-test"] diff --git a/src/ore/benches/pager.rs b/src/ore/benches/pager.rs new file mode 100644 index 0000000000000..036f11cae9bf3 --- /dev/null +++ b/src/ore/benches/pager.rs @@ -0,0 +1,115 @@ +#![cfg(feature = "pager")] + +use std::path::PathBuf; +use std::time::Duration; + +use criterion::{ + BatchSize, BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main, +}; +use mz_ore::pager::{self, Backend}; + +fn ensure_scratch() { + static INIT: std::sync::Once = std::sync::Once::new(); + INIT.call_once(|| { + let dir: PathBuf = std::env::var_os("MZ_PAGER_SCRATCH") + .map(PathBuf::from) + .unwrap_or_else(std::env::temp_dir); + pager::set_scratch_dir(dir); + }); +} + +fn fill_payload(len_u64s: usize) -> Vec { + (0..len_u64s as u64).collect() +} + +fn bench_pageout_single(c: &mut Criterion) { + ensure_scratch(); + let mut group = c.benchmark_group("pager/pageout/single_chunk"); + group.measurement_time(Duration::from_secs(2)); + for size_kib in [4usize, 64, 1024, 16384] { + let len = (size_kib * 1024) / 8; + for backend in [Backend::Swap, Backend::File] { + pager::set_backend(backend); + group.throughput(Throughput::Bytes((size_kib * 1024) as u64)); + group.bench_function( + BenchmarkId::new(format!("{backend:?}"), size_kib), + |b| { + b.iter_batched( + || [fill_payload(len)], + |mut chunks| { + let h = pager::pageout(&mut chunks); + black_box(h); + }, + BatchSize::SmallInput, + ); + }, + ); + } + } + group.finish(); +} + +fn bench_pageout_scatter(c: &mut Criterion) { + ensure_scratch(); + let mut group = c.benchmark_group("pager/pageout/scatter_2MiB"); + group.measurement_time(Duration::from_secs(2)); + let total_bytes = 2 * 1024 * 1024; + for chunk_count in [1usize, 2, 64] { + let chunk_bytes = total_bytes / chunk_count; + let chunk_len_u64s = chunk_bytes / 8; + for backend in [Backend::Swap, Backend::File] { + pager::set_backend(backend); + group.throughput(Throughput::Bytes(total_bytes as u64)); + group.bench_function( + 
BenchmarkId::new(format!("{backend:?}"), chunk_count), + |b| { + b.iter_batched( + || { + (0..chunk_count) + .map(|_| fill_payload(chunk_len_u64s)) + .collect::>() + }, + |mut chunks| { + let h = pager::pageout(chunks.as_mut_slice()); + black_box(h); + }, + BatchSize::SmallInput, + ); + }, + ); + } + } + group.finish(); +} + +fn bench_round_trip(c: &mut Criterion) { + ensure_scratch(); + let mut group = c.benchmark_group("pager/round_trip"); + group.measurement_time(Duration::from_secs(3)); + let len = (256 * 1024) / 8; + for backend in [Backend::Swap, Backend::File] { + pager::set_backend(backend); + group.throughput(Throughput::Bytes(256 * 1024)); + group.bench_function(BenchmarkId::new(format!("{backend:?}"), 256), |b| { + b.iter_batched( + || [fill_payload(len)], + |mut chunks| { + let h = pager::pageout(&mut chunks); + let mut dst = Vec::new(); + pager::take(h, &mut dst); + black_box(dst); + }, + BatchSize::SmallInput, + ); + }); + } + group.finish(); +} + +criterion_group!( + benches, + bench_pageout_single, + bench_pageout_scatter, + bench_round_trip +); +criterion_main!(benches); From e584512c99d5a9f71fdde0d4b3986fc58f85c417 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 16:59:41 +0200 Subject: [PATCH 14/34] ore: pager clippy + lint cleanups (write_vectored, cast_from, exhaustive matches) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/benches/pager.rs | 70 +++++++++++++++++------------------- src/ore/src/pager.rs | 8 ++--- src/ore/src/pager/file.rs | 75 ++++++++++----------------------------- src/ore/src/pager/swap.rs | 2 ++ 4 files changed, 58 insertions(+), 97 deletions(-) diff --git a/src/ore/benches/pager.rs b/src/ore/benches/pager.rs index 036f11cae9bf3..6f79ea464d091 100644 --- a/src/ore/benches/pager.rs +++ b/src/ore/benches/pager.rs @@ -1,11 +1,11 @@ #![cfg(feature = "pager")] +use std::hint::black_box; use std::path::PathBuf; use std::time::Duration; -use criterion::{ - BatchSize, BenchmarkId, Criterion, Throughput, 
black_box, criterion_group, criterion_main, -}; +use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use mz_ore::cast::CastFrom; use mz_ore::pager::{self, Backend}; fn ensure_scratch() { @@ -19,7 +19,7 @@ fn ensure_scratch() { } fn fill_payload(len_u64s: usize) -> Vec { - (0..len_u64s as u64).collect() + (0..u64::cast_from(len_u64s)).collect() } fn bench_pageout_single(c: &mut Criterion) { @@ -30,20 +30,18 @@ fn bench_pageout_single(c: &mut Criterion) { let len = (size_kib * 1024) / 8; for backend in [Backend::Swap, Backend::File] { pager::set_backend(backend); - group.throughput(Throughput::Bytes((size_kib * 1024) as u64)); - group.bench_function( - BenchmarkId::new(format!("{backend:?}"), size_kib), - |b| { - b.iter_batched( - || [fill_payload(len)], - |mut chunks| { - let h = pager::pageout(&mut chunks); - black_box(h); - }, - BatchSize::SmallInput, - ); - }, - ); + let total_bytes = u64::cast_from(size_kib * 1024); + group.throughput(Throughput::Bytes(total_bytes)); + group.bench_function(BenchmarkId::new(format!("{backend:?}"), size_kib), |b| { + b.iter_batched( + || [fill_payload(len)], + |mut chunks| { + let h = pager::pageout(&mut chunks); + black_box(h); + }, + BatchSize::SmallInput, + ); + }); } } group.finish(); @@ -53,30 +51,28 @@ fn bench_pageout_scatter(c: &mut Criterion) { ensure_scratch(); let mut group = c.benchmark_group("pager/pageout/scatter_2MiB"); group.measurement_time(Duration::from_secs(2)); - let total_bytes = 2 * 1024 * 1024; + let total_bytes: usize = 2 * 1024 * 1024; for chunk_count in [1usize, 2, 64] { let chunk_bytes = total_bytes / chunk_count; let chunk_len_u64s = chunk_bytes / 8; for backend in [Backend::Swap, Backend::File] { pager::set_backend(backend); - group.throughput(Throughput::Bytes(total_bytes as u64)); - group.bench_function( - BenchmarkId::new(format!("{backend:?}"), chunk_count), - |b| { - b.iter_batched( - || { - (0..chunk_count) - .map(|_| fill_payload(chunk_len_u64s)) - 
.collect::>() - }, - |mut chunks| { - let h = pager::pageout(chunks.as_mut_slice()); - black_box(h); - }, - BatchSize::SmallInput, - ); - }, - ); + let total_bytes_u64 = u64::cast_from(total_bytes); + group.throughput(Throughput::Bytes(total_bytes_u64)); + group.bench_function(BenchmarkId::new(format!("{backend:?}"), chunk_count), |b| { + b.iter_batched( + || { + (0..chunk_count) + .map(|_| fill_payload(chunk_len_u64s)) + .collect::>() + }, + |mut chunks| { + let h = pager::pageout(chunks.as_mut_slice()); + black_box(h); + }, + BatchSize::SmallInput, + ); + }); } } group.finish(); diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index e718cf49f7819..3769fd9c54b68 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -57,28 +57,28 @@ impl Handle { pub(crate) fn swap_inner(&self) -> Option<&SwapInner> { match &self.inner { HandleInner::Swap(s) => Some(s), - _ => None, + HandleInner::File(_) => None, } } pub(crate) fn file_inner(&self) -> Option<&FileInner> { match &self.inner { HandleInner::File(f) => Some(f), - _ => None, + HandleInner::Swap(_) => None, } } pub(crate) fn into_swap_inner(self) -> Option { match self.inner { HandleInner::Swap(s) => Some(s), - _ => None, + HandleInner::File(_) => None, } } pub(crate) fn into_file_inner(self) -> Option { match self.inner { HandleInner::File(f) => Some(f), - _ => None, + HandleInner::Swap(_) => None, } } } diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 4a18167e6a06e..1385c77083a26 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -1,6 +1,8 @@ //! File backend for the pager. See `mz_ore::pager` for the public API. 
use std::path::{Path, PathBuf}; + +use crate::cast::CastFrom; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Once, OnceLock}; @@ -139,67 +141,26 @@ pub(crate) fn pageout_file(chunks: &mut [Vec]) -> Handle { fn write_chunks(path: &Path, chunks: &[Vec]) -> std::io::Result<()> { let file = File::options().write(true).create_new(true).open(path)?; - let slices: Vec> = chunks + let mut slices: Vec> = chunks .iter() .filter(|c| !c.is_empty()) .map(|c| IoSlice::new(bytemuck::cast_slice(c.as_slice()))) .collect(); - write_all_vectored(&file, &slices)?; + write_all_vectored(&file, slices.as_mut_slice())?; Ok(()) } -#[cfg(unix)] -fn write_all_vectored(file: &File, slices: &[IoSlice<'_>]) -> std::io::Result<()> { - use std::os::unix::io::AsRawFd; - let fd = file.as_raw_fd(); - let mut offset: i64 = 0; - let mut idx = 0; - let mut consumed_in_idx: usize = 0; - while idx < slices.len() { - let remaining = &slices[idx..]; - let iovs: Vec = remaining - .iter() - .enumerate() - .map(|(i, s)| { - let base_off = if i == 0 { consumed_in_idx } else { 0 }; - // SAFETY: building an iovec from a live `IoSlice` is safe; - // the pointer/length describe the caller's buffer. - libc::iovec { - iov_base: unsafe { s.as_ptr().add(base_off) } as *mut libc::c_void, - iov_len: s.len() - base_off, - } - }) - .collect(); - // SAFETY: fd is valid and open for writing; iovs point into the live `slices` - // owned by the caller; pwritev does not retain pointers past the syscall. 
- let written = - unsafe { libc::pwritev(fd, iovs.as_ptr(), iovs.len() as libc::c_int, offset) }; - if written < 0 { - return Err(std::io::Error::last_os_error()); - } - let mut left = written as usize; - offset += written as i64; - while left > 0 && idx < slices.len() { - let avail = slices[idx].len() - consumed_in_idx; - if left >= avail { - left -= avail; - idx += 1; - consumed_in_idx = 0; - } else { - consumed_in_idx += left; - left = 0; - } - } - } - Ok(()) -} - -#[cfg(not(unix))] -fn write_all_vectored(file: &File, slices: &[IoSlice<'_>]) -> std::io::Result<()> { +fn write_all_vectored(mut file: &File, mut slices: &mut [IoSlice<'_>]) -> std::io::Result<()> { use std::io::Write; - let mut file = file; - for s in slices { - file.write_all(s)?; + while !slices.is_empty() { + let written = file.write_vectored(slices)?; + if written == 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::WriteZero, + "write_vectored returned 0", + )); + } + IoSlice::advance_slices(&mut slices, written); } Ok(()) } @@ -226,15 +187,16 @@ pub(crate) fn read_at_file(handle: &Handle, ranges: &[(usize, usize)], dst: &mut let coalesced = coalesce(ranges); for (off, len) in coalesced { - let byte_off = (off * 8) as u64; + let byte_off = u64::cast_from(off * 8); let byte_len = len * 8; let buf_start = dst.len(); dst.resize(buf_start + len, 0); let buf: &mut [u8] = bytemuck::cast_slice_mut(&mut dst[buf_start..buf_start + len]); let mut filled = 0; while filled < byte_len { + let pos = byte_off + u64::cast_from(filled); let n = file - .read_at(&mut buf[filled..byte_len], byte_off + filled as u64) + .read_at(&mut buf[filled..byte_len], pos) .expect("pager pread failed"); if n == 0 { panic!("pager pread short: expected {byte_len} got {filled}"); @@ -271,8 +233,9 @@ pub(crate) fn take_file(handle: Handle, dst: &mut Vec) { let buf: &mut [u8] = bytemuck::cast_slice_mut(dst.as_mut_slice()); let mut filled = 0; while filled < buf.len() { + let pos = u64::cast_from(filled); let n = file - 
.read_at(&mut buf[filled..], filled as u64) + .read_at(&mut buf[filled..], pos) .unwrap_or_else(|err| panic!("pager take: pread {path:?}: {err}")); if n == 0 { panic!("pager take: short read at {filled}"); diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 7036544aead72..49650686da6f9 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -41,6 +41,7 @@ pub(crate) fn pageout_swap(chunks: &mut [Vec]) -> Handle { } #[cfg(target_os = "linux")] +#[allow(clippy::as_conversions)] // ptr<->usize and *mut c_void casts have no safe wrapper fn madvise_cold(chunk: &[u64]) { if chunk.is_empty() { return; @@ -69,6 +70,7 @@ fn madvise_cold(chunk: &[u64]) { fn madvise_cold(_chunk: &[u64]) {} #[cfg(target_os = "linux")] +#[allow(clippy::as_conversions)] // libc::c_long -> usize is FFI; sysconf returns >0 here fn page_size() -> usize { // SAFETY: `sysconf` with a valid argument is safe. unsafe { libc::sysconf(libc::_SC_PAGESIZE) as usize } From e469b6b593f1b8f80e264ee0ec9e0c63a2ad0492 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 17:01:53 +0200 Subject: [PATCH 15/34] ore: pager copyright headers and test-attribute lint compliance Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/benches/pager.rs | 15 +++++++++++++++ src/ore/src/pager.rs | 15 +++++++++++++++ src/ore/src/pager/file.rs | 15 +++++++++++++++ src/ore/src/pager/swap.rs | 15 +++++++++++++++ src/ore/tests/pager_integration.rs | 25 ++++++++++++++++++++----- 5 files changed, 80 insertions(+), 5 deletions(-) diff --git a/src/ore/benches/pager.rs b/src/ore/benches/pager.rs index 6f79ea464d091..d30f9b95c7214 100644 --- a/src/ore/benches/pager.rs +++ b/src/ore/benches/pager.rs @@ -1,3 +1,18 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #![cfg(feature = "pager")] use std::hint::black_box; diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 3769fd9c54b68..15cbf8bb20024 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -1,3 +1,18 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //! Explicit pager for cold data. See `doc/developer/design/20260504_pager.md`. use std::sync::atomic::{AtomicU8, Ordering}; diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 1385c77083a26..1e468d03f5478 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -1,3 +1,18 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //! File backend for the pager. See `mz_ore::pager` for the public API. use std::path::{Path, PathBuf}; diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 49650686da6f9..91296faf03fc8 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -1,3 +1,18 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //! Swap backend for the pager. See `mz_ore::pager` for the public API. use crate::pager::Handle; diff --git a/src/ore/tests/pager_integration.rs b/src/ore/tests/pager_integration.rs index 788c8f69e6554..2436daacf88d9 100644 --- a/src/ore/tests/pager_integration.rs +++ b/src/ore/tests/pager_integration.rs @@ -1,3 +1,18 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #![cfg(feature = "pager")] use mz_ore::pager::{Backend, Handle, pageout, read_at, set_backend, set_scratch_dir, take}; @@ -9,7 +24,7 @@ fn ensure_scratch() { set_scratch_dir(dir.path().to_owned()); } -#[test] +#[test] // allow(test-attribute) fn round_trip_swap() { set_backend(Backend::Swap); let payload: Vec = (0..1024).collect(); @@ -20,7 +35,7 @@ fn round_trip_swap() { assert_eq!(dst, payload); } -#[test] +#[test] // allow(test-attribute) fn round_trip_file() { ensure_scratch(); set_backend(Backend::File); @@ -33,7 +48,7 @@ fn round_trip_file() { set_backend(Backend::Swap); } -#[test] +#[test] // allow(test-attribute) fn handle_survives_backend_flip() { ensure_scratch(); set_backend(Backend::File); @@ -53,7 +68,7 @@ fn handle_survives_backend_flip() { assert_eq!(dst2, payload); } -#[test] +#[test] // allow(test-attribute) fn empty_input_yields_zero_len_handle() { set_backend(Backend::Swap); let mut chunks: [Vec; 0] = []; @@ -62,7 +77,7 @@ fn empty_input_yields_zero_len_handle() { assert!(h.is_empty()); } -#[test] +#[test] // allow(test-attribute) fn scatter_round_trip() { set_backend(Backend::Swap); let mut chunks = [vec![1u64, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; From 4cf0cd820c840526ec0ec1a57a1d111f5723197c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 19:43:29 +0200 Subject: [PATCH 16/34] ore: pager bench 
round-trip with touch-every-page readback Reuse buffers across iterations via iter_custom so allocator cost is paid once at setup. Read one u64 per page after take to force the kernel to actually fault pages in (relevant under memory pressure). 2 MiB single-chunk plus scatter sweep. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/benches/pager.rs | 145 +++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 65 deletions(-) diff --git a/src/ore/benches/pager.rs b/src/ore/benches/pager.rs index d30f9b95c7214..f087dcc1c5fb8 100644 --- a/src/ore/benches/pager.rs +++ b/src/ore/benches/pager.rs @@ -17,11 +17,14 @@ use std::hint::black_box; use std::path::PathBuf; -use std::time::Duration; +use std::time::{Duration, Instant}; -use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use mz_ore::cast::CastFrom; -use mz_ore::pager::{self, Backend}; +use mz_ore::pager::{self, Backend, Handle}; + +const PAGE_BYTES: usize = 4096; +const PAGE_U64S: usize = PAGE_BYTES / 8; fn ensure_scratch() { static INIT: std::sync::Once = std::sync::Once::new(); @@ -37,90 +40,102 @@ fn fill_payload(len_u64s: usize) -> Vec { (0..u64::cast_from(len_u64s)).collect() } -fn bench_pageout_single(c: &mut Criterion) { +/// Reads one `u64` from each page of `buf` to force the kernel to fault them in. +/// Returns a side-effecting sum so the compiler cannot elide the loads. +fn touch_every_page(buf: &[u64]) -> u64 { + let mut s: u64 = 0; + let mut i = 0; + while i < buf.len() { + s = s.wrapping_add(buf[i]); + i += PAGE_U64S; + } + s +} + +/// Round-trip a single-chunk payload through the pager and touch every page on +/// readback. Reuses the buffer between iterations so allocation/page-fault tax +/// is paid once at setup, not measured. 
+fn round_trip_single(c: &mut Criterion) { ensure_scratch(); - let mut group = c.benchmark_group("pager/pageout/single_chunk"); - group.measurement_time(Duration::from_secs(2)); - for size_kib in [4usize, 64, 1024, 16384] { + let mut group = c.benchmark_group("pager/round_trip_touch/single"); + group.measurement_time(Duration::from_secs(5)); + for size_kib in [4usize, 64, 1024, 2048, 16384] { let len = (size_kib * 1024) / 8; for backend in [Backend::Swap, Backend::File] { pager::set_backend(backend); - let total_bytes = u64::cast_from(size_kib * 1024); - group.throughput(Throughput::Bytes(total_bytes)); + group.throughput(Throughput::Bytes(u64::cast_from(size_kib * 1024))); group.bench_function(BenchmarkId::new(format!("{backend:?}"), size_kib), |b| { - b.iter_batched( - || [fill_payload(len)], - |mut chunks| { - let h = pager::pageout(&mut chunks); - black_box(h); - }, - BatchSize::SmallInput, - ); + b.iter_custom(|iters| { + let mut payload = fill_payload(len); + let mut tmp: Vec = Vec::with_capacity(len); + let start = Instant::now(); + for _ in 0..iters { + let mut chunks = [std::mem::take(&mut payload)]; + let h: Handle = pager::pageout(&mut chunks); + pager::take(h, &mut tmp); + black_box(touch_every_page(&tmp)); + payload = std::mem::take(&mut tmp); + tmp = Vec::with_capacity(len); + } + start.elapsed() + }); }); } } group.finish(); } -fn bench_pageout_scatter(c: &mut Criterion) { +/// Round-trip a scatter-input (multiple chunks forming one logical 2 MiB block). +/// Measures the same touch-every-page readback pattern as `round_trip_single`. 
+fn round_trip_scatter_2mib(c: &mut Criterion) { ensure_scratch(); - let mut group = c.benchmark_group("pager/pageout/scatter_2MiB"); - group.measurement_time(Duration::from_secs(2)); + let mut group = c.benchmark_group("pager/round_trip_touch/scatter_2MiB"); + group.measurement_time(Duration::from_secs(5)); let total_bytes: usize = 2 * 1024 * 1024; - for chunk_count in [1usize, 2, 64] { + for chunk_count in [1usize, 2, 8, 64] { let chunk_bytes = total_bytes / chunk_count; let chunk_len_u64s = chunk_bytes / 8; for backend in [Backend::Swap, Backend::File] { pager::set_backend(backend); - let total_bytes_u64 = u64::cast_from(total_bytes); - group.throughput(Throughput::Bytes(total_bytes_u64)); + group.throughput(Throughput::Bytes(u64::cast_from(total_bytes))); group.bench_function(BenchmarkId::new(format!("{backend:?}"), chunk_count), |b| { - b.iter_batched( - || { - (0..chunk_count) - .map(|_| fill_payload(chunk_len_u64s)) - .collect::>() - }, - |mut chunks| { - let h = pager::pageout(chunks.as_mut_slice()); - black_box(h); - }, - BatchSize::SmallInput, - ); + b.iter_custom(|iters| { + let mut payload: Vec> = (0..chunk_count) + .map(|_| fill_payload(chunk_len_u64s)) + .collect(); + let mut tmp: Vec = Vec::with_capacity(total_bytes / 8); + let start = Instant::now(); + for _ in 0..iters { + let h: Handle = pager::pageout(payload.as_mut_slice()); + pager::take(h, &mut tmp); + black_box(touch_every_page(&tmp)); + // Rebuild the input from `tmp` for the next iteration: + // swap the consolidated buffer back into chunk 0, leave + // the other chunks empty (they were drained by the + // swap backend's `mem::take`). The file backend already + // preserved their capacity, so this still amortizes its + // allocation cost. + payload[0] = std::mem::take(&mut tmp); + tmp = Vec::with_capacity(total_bytes / 8); + // For chunk_count > 1, refill the trailing chunks by + // splitting payload[0] back into the original shape. 
+ if chunk_count > 1 { + let mut head = std::mem::take(&mut payload[0]); + for i in 1..chunk_count { + let take_len = std::cmp::min(chunk_len_u64s, head.len()); + let tail = head.split_off(head.len() - take_len); + payload[i] = tail; + } + payload[0] = head; + } + } + start.elapsed() + }); }); } } group.finish(); } -fn bench_round_trip(c: &mut Criterion) { - ensure_scratch(); - let mut group = c.benchmark_group("pager/round_trip"); - group.measurement_time(Duration::from_secs(3)); - let len = (256 * 1024) / 8; - for backend in [Backend::Swap, Backend::File] { - pager::set_backend(backend); - group.throughput(Throughput::Bytes(256 * 1024)); - group.bench_function(BenchmarkId::new(format!("{backend:?}"), 256), |b| { - b.iter_batched( - || [fill_payload(len)], - |mut chunks| { - let h = pager::pageout(&mut chunks); - let mut dst = Vec::new(); - pager::take(h, &mut dst); - black_box(dst); - }, - BatchSize::SmallInput, - ); - }); - } - group.finish(); -} - -criterion_group!( - benches, - bench_pageout_single, - bench_pageout_scatter, - bench_round_trip -); +criterion_group!(benches, round_trip_single, round_trip_scatter_2mib); criterion_main!(benches); From 3cb8968108531345475311d3a17d9e338aef7502 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 20:36:14 +0200 Subject: [PATCH 17/34] ore: pager merge-batcher example with cache-line touch Builds two chains of 2 MiB chunks then performs a merge pass that reads every cache line of both inputs and emits a new chain. Designed to be run under systemd-run with MemoryMax to simulate a working set that exceeds RAM, exposing real swap-eviction or disk-I/O cost. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/examples/pager_merge.rs | 182 ++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 src/ore/examples/pager_merge.rs diff --git a/src/ore/examples/pager_merge.rs b/src/ore/examples/pager_merge.rs new file mode 100644 index 0000000000000..a12c539b6defe --- /dev/null +++ b/src/ore/examples/pager_merge.rs @@ -0,0 +1,182 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Merge-batcher-style workload for the pager. +//! +//! Builds two chains of 2 MiB chunks (`--chain-gib` each), then performs a +//! merge pass that takes one chunk from each input, reads every cache line, +//! and emits two output chunks. Reports build/merge throughput. +//! +//! Run with constrained memory via `systemd-run --user --scope -p MemoryMax=...`. +//! +//! ```bash +//! cargo build --release --features pager --example pager_merge +//! systemd-run --user --scope -p MemoryMax=16G -p MemorySwapMax=64G --quiet \ +//! --setenv=MZ_PAGER_SCRATCH=/path/to/scratch \ +//! -- target/release/examples/pager_merge --chain-gib 16 --backend swap +//! 
``` + +#![cfg(feature = "pager")] + +use std::env; +use std::path::PathBuf; +use std::time::{Duration, Instant}; + +use mz_ore::cast::CastFrom; +use mz_ore::pager::{self, Backend, Handle}; + +const CHUNK_BYTES: usize = 2 * 1024 * 1024; +const CHUNK_U64: usize = CHUNK_BYTES / 8; +const CACHE_LINE_BYTES: usize = 64; +const CACHE_LINE_U64: usize = CACHE_LINE_BYTES / 8; + +fn main() { + let args: Vec = env::args().collect(); + let chain_gib: usize = parse_arg(&args, "--chain-gib", 16); + let backend = parse_backend(&args); + let scratch: PathBuf = env::var_os("MZ_PAGER_SCRATCH") + .map(PathBuf::from) + .unwrap_or_else(env::temp_dir); + + pager::set_scratch_dir(scratch); + pager::set_backend(backend); + + let chain_bytes = chain_gib * 1024 * 1024 * 1024; + let chunks_per_chain = chain_bytes / CHUNK_BYTES; + + println!( + "backend={backend:?} chain={chain_gib}GiB chunks_per_chain={chunks_per_chain} chunk={CHUNK_BYTES}B" + ); + + let (chain_a, build_a) = time(|| build_chain(chunks_per_chain)); + println!( + "build A: {:.2?} ({:.2} GiB/s)", + build_a, + gib_per_sec(chain_bytes, build_a) + ); + + let (chain_b, build_b) = time(|| build_chain(chunks_per_chain)); + println!( + "build B: {:.2?} ({:.2} GiB/s)", + build_b, + gib_per_sec(chain_bytes, build_b) + ); + + let (chain_c, merge_dur) = time(|| merge_pass(chain_a, chain_b)); + let merged_bytes = chunks_per_chain * 2 * CHUNK_BYTES; + println!( + "merge: {:.2?} ({:.2} GiB/s through, output_chunks={})", + merge_dur, + gib_per_sec(merged_bytes, merge_dur), + chain_c.len() + ); + + let (_, drop_dur) = time(|| drop(chain_c)); + println!("drop output chain: {:.2?}", drop_dur); +} + +fn build_chain(n_chunks: usize) -> Vec { + let mut chain = Vec::with_capacity(n_chunks); + let mut buf: Vec = vec![0; CHUNK_U64]; + for i in 0..n_chunks { + // Fill with non-zero, position-dependent data so the kernel cannot + // share zero pages. 
+ for (j, w) in buf.iter_mut().enumerate() { + *w = u64::cast_from(i) ^ u64::cast_from(j); + } + let mut chunks = [std::mem::take(&mut buf)]; + chain.push(pager::pageout(&mut chunks)); + // Reallocate; the swap backend stole the buffer, the file backend + // left an empty Vec with original capacity, but we don't keep it. + buf = vec![0; CHUNK_U64]; + } + chain +} + +fn merge_pass(a: Vec, b: Vec) -> Vec { + let mut out = Vec::with_capacity(a.len() + b.len()); + let mut tmp_a: Vec = Vec::with_capacity(CHUNK_U64); + let mut tmp_b: Vec = Vec::with_capacity(CHUNK_U64); + let mut sink: u64 = 0; + for (ha, hb) in a.into_iter().zip(b.into_iter()) { + pager::take(ha, &mut tmp_a); + pager::take(hb, &mut tmp_b); + // Touch every cache line of both inputs (1 u64 per 64-byte line). + sink = touch_cache_lines(&tmp_a, sink); + sink = touch_cache_lines(&tmp_b, sink); + // Emit two output chunks, simulating a merged run that doubles the + // chunk count. Each output is 2 MiB; we hand the original buffers + // straight to `pageout`, which transfers ownership cleanly on the + // swap backend. 
+ { + let mut chunks = [std::mem::take(&mut tmp_a)]; + out.push(pager::pageout(&mut chunks)); + tmp_a = Vec::with_capacity(CHUNK_U64); + } + { + let mut chunks = [std::mem::take(&mut tmp_b)]; + out.push(pager::pageout(&mut chunks)); + tmp_b = Vec::with_capacity(CHUNK_U64); + } + } + std::hint::black_box(sink); + out +} + +#[inline] +fn touch_cache_lines(buf: &[u64], mut sink: u64) -> u64 { + let mut i = 0; + while i < buf.len() { + sink = sink.wrapping_add(buf[i]); + i += CACHE_LINE_U64; + } + sink +} + +fn time(f: impl FnOnce() -> T) -> (T, Duration) { + let start = Instant::now(); + let v = f(); + (v, start.elapsed()) +} + +fn gib_per_sec(bytes: usize, d: Duration) -> f64 { + let secs = d.as_secs_f64(); + if secs == 0.0 { + return 0.0; + } + let gib = bytes as f64 / (1024.0 * 1024.0 * 1024.0); + gib / secs +} + +fn parse_arg(args: &[String], flag: &str, default: usize) -> usize { + args.iter() + .position(|a| a == flag) + .and_then(|i| args.get(i + 1)) + .and_then(|v| v.parse().ok()) + .unwrap_or(default) +} + +fn parse_backend(args: &[String]) -> Backend { + let pos = args + .iter() + .position(|a| a == "--backend") + .and_then(|i| args.get(i + 1)); + match pos.map(String::as_str) { + Some("file") => Backend::File, + Some("swap") => Backend::Swap, + Some(other) => panic!("unknown backend {other:?}; use 'swap' or 'file'"), + None => Backend::Swap, + } +} From 756998446fc2fc47129c6085d849e03e7e62c5d9 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 4 May 2026 21:12:10 +0200 Subject: [PATCH 18/34] ore: update Cargo.lock for pager tempfile dev-dep Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index 056a8d6335603..58b881b8f2a9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7079,6 +7079,7 @@ dependencies = [ "serde_json", "smallvec", "stacker", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-native-tls", From 98b471727ce6450dc0ab2241a80185d0cc094f6b Mon Sep 17 00:00:00 
2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 10:26:56 +0200 Subject: [PATCH 19/34] ore: pager prefetch and prefetch_at hints Add `prefetch(&Handle)` and `prefetch_at(&Handle, offset, len)` to let callers overlap the next chunk's I/O with current chunk processing. The swap backend issues `MADV_WILLNEED`; the file backend opens the scratch file briefly and issues `posix_fadvise(POSIX_FADV_WILLNEED)`, both of which kick async kernel work and return promptly. The merge example now prefetches one chunk ahead. With a 32 GiB working set and 16 GiB cap on ext4, file-backend merge drops from 47.7 s to 45.2 s. Swap-backend merge is unchanged at ~141 s because under that much pressure the kernel is reclaim-bound, not stall-bound. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/examples/pager_merge.rs | 30 +++++++++++++- src/ore/src/pager.rs | 19 +++++++++ src/ore/src/pager/file.rs | 60 ++++++++++++++++++++++++++++ src/ore/src/pager/swap.rs | 71 ++++++++++++++++++++++++++++++--- 4 files changed, 172 insertions(+), 8 deletions(-) diff --git a/src/ore/examples/pager_merge.rs b/src/ore/examples/pager_merge.rs index a12c539b6defe..00d6ff7f46266 100644 --- a/src/ore/examples/pager_merge.rs +++ b/src/ore/examples/pager_merge.rs @@ -106,11 +106,36 @@ fn build_chain(n_chunks: usize) -> Vec { } fn merge_pass(a: Vec, b: Vec) -> Vec { - let mut out = Vec::with_capacity(a.len() + b.len()); + let n = a.len().min(b.len()); + let mut a: Vec> = a.into_iter().map(Some).collect(); + let mut b: Vec> = b.into_iter().map(Some).collect(); + let mut out = Vec::with_capacity(2 * n); let mut tmp_a: Vec = Vec::with_capacity(CHUNK_U64); let mut tmp_b: Vec = Vec::with_capacity(CHUNK_U64); let mut sink: u64 = 0; - for (ha, hb) in a.into_iter().zip(b.into_iter()) { + // Issue the first prefetches so the kernel starts populating page cache / + // swap-in for the very first iteration. 
+ if n > 0 { + if let Some(h) = a[0].as_ref() { + pager::prefetch(h); + } + if let Some(h) = b[0].as_ref() { + pager::prefetch(h); + } + } + for i in 0..n { + // Prefetch one chunk ahead of the current pair so I/O overlaps with + // the cache-line touch and the output pageouts below. + if i + 1 < n { + if let Some(h) = a[i + 1].as_ref() { + pager::prefetch(h); + } + if let Some(h) = b[i + 1].as_ref() { + pager::prefetch(h); + } + } + let ha = a[i].take().expect("handle a present"); + let hb = b[i].take().expect("handle b present"); pager::take(ha, &mut tmp_a); pager::take(hb, &mut tmp_b); // Touch every cache line of both inputs (1 u64 per 64-byte line). @@ -156,6 +181,7 @@ fn gib_per_sec(bytes: usize, d: Duration) -> f64 { if secs == 0.0 { return 0.0; } + #[allow(clippy::as_conversions)] // usize -> f64 is intentionally lossy for reporting let gib = bytes as f64 / (1024.0 * 1024.0 * 1024.0); gib / secs } diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 15cbf8bb20024..09100c1c2d3d9 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -158,6 +158,25 @@ pub fn read_at(handle: &Handle, offset: usize, len: usize, dst: &mut Vec) { read_at_many(handle, &[(offset, len)], dst); } +/// Hints that the entire payload will be read soon. Best-effort. +/// Swap backend issues `madvise(MADV_WILLNEED)`; file backend issues +/// `posix_fadvise(POSIX_FADV_WILLNEED)`. Both are async — the call returns +/// promptly and the kernel populates pages or page cache in the background. +/// Useful for overlapping I/O with computation in pipelines that know which +/// handles they will read next. +pub fn prefetch(handle: &Handle) { + prefetch_at(handle, 0, handle.len()); +} + +/// Hints that the range `[offset, offset+len)` of the handle will be read soon. +/// Panics if the range is out of bounds. 
+pub fn prefetch_at(handle: &Handle, offset: usize, len: usize) { + match &handle.inner { + HandleInner::Swap(_) => swap::prefetch_at_swap(handle, offset, len), + HandleInner::File(_) => file::prefetch_at_file(handle, offset, len), + } +} + /// Consumes handle, writing the entire payload into `dst` (cleared first), then reclaims storage. /// Swap fast path: single-chunk handle into empty `dst` swaps in place, no copy. pub fn take(handle: Handle, dst: &mut Vec) { diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 1e468d03f5478..56f4b9677f0f6 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -180,6 +180,44 @@ fn write_all_vectored(mut file: &File, mut slices: &mut [IoSlice<'_>]) -> std::i Ok(()) } +pub(crate) fn prefetch_at_file(handle: &Handle, offset: usize, len: usize) { + let inner = handle + .file_inner() + .expect("prefetch_at_file called on non-file handle"); + let total = inner.len_u64s; + let end = offset.checked_add(len).expect("offset+len overflow"); + assert!( + end <= total, + "prefetch range out of bounds: {offset}+{len} > {total}" + ); + if len == 0 { + return; + } + let path = scratch_path(inner.id); + let Ok(file) = File::open(&path) else { + // Best-effort hint; if the file is gone the next read will surface the error. + return; + }; + posix_fadvise_willneed(&file, u64::cast_from(offset * 8), u64::cast_from(len * 8)); +} + +#[cfg(unix)] +fn posix_fadvise_willneed(file: &File, byte_off: u64, byte_len: u64) { + use std::os::unix::io::AsRawFd; + #[allow(clippy::as_conversions)] // u64 -> i64/off_t for FFI; values fit by construction + let off = byte_off as i64; + #[allow(clippy::as_conversions)] + let len = byte_len as i64; + // SAFETY: fd is valid for the life of `file`; `posix_fadvise` is a hint and + // does not mutate user memory. The return value (errno) is intentionally ignored. 
+ unsafe { + libc::posix_fadvise(file.as_raw_fd(), off, len, libc::POSIX_FADV_WILLNEED); + } +} + +#[cfg(not(unix))] +fn posix_fadvise_willneed(_file: &File, _byte_off: u64, _byte_len: u64) {} + pub(crate) fn read_at_file(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec) { use std::os::unix::fs::FileExt; @@ -347,6 +385,28 @@ mod backend_tests { drop(h); assert!(!path.exists(), "scratch file should be unlinked on drop"); } + + #[mz_ore::test] + fn file_prefetch_does_not_corrupt_data() { + setup_dir(); + let payload: Vec = (0..1024).collect(); + let mut chunks = [payload.clone()]; + let h = pageout_file(&mut chunks); + prefetch_at_file(&h, 100, 200); + prefetch_at_file(&h, 0, 1024); + let mut dst = Vec::new(); + read_at_file(&h, &[(0, 1024)], &mut dst); + assert_eq!(dst, payload); + } + + #[mz_ore::test] + #[should_panic(expected = "out of bounds")] + fn file_prefetch_panics_on_oob() { + setup_dir(); + let mut chunks = [vec![1u64, 2]]; + let h = pageout_file(&mut chunks); + prefetch_at_file(&h, 0, 99); + } } #[cfg(test)] diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 91296faf03fc8..6d5474d0a81ef 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -50,14 +50,24 @@ pub(crate) fn pageout_swap(chunks: &mut [Vec]) -> Handle { taken.push(std::mem::take(c)); } for c in &taken { - madvise_cold(c); + madvise_range(c, MADV_COLD); } Handle::from_swap(SwapInner::new(taken)) } +#[cfg(target_os = "linux")] +const MADV_COLD: libc::c_int = libc::MADV_COLD; +#[cfg(target_os = "linux")] +const MADV_WILLNEED: libc::c_int = libc::MADV_WILLNEED; + +#[cfg(not(target_os = "linux"))] +const MADV_COLD: i32 = 0; +#[cfg(not(target_os = "linux"))] +const MADV_WILLNEED: i32 = 0; + #[cfg(target_os = "linux")] #[allow(clippy::as_conversions)] // ptr<->usize and *mut c_void casts have no safe wrapper -fn madvise_cold(chunk: &[u64]) { +fn madvise_range(chunk: &[u64], advice: libc::c_int) { if chunk.is_empty() { return; } @@ -70,19 +80,19 @@ fn 
madvise_cold(chunk: &[u64]) { return; } // SAFETY: pointer/length come from a live `&[u64]`; we restrict to a fully - // page-aligned subrange contained within that slice; `MADV_COLD` does not - // mutate the contents. + // page-aligned subrange contained within that slice; `madvise` with these + // hints does not mutate the contents. unsafe { libc::madvise( aligned_start as *mut libc::c_void, aligned_end - aligned_start, - libc::MADV_COLD, + advice, ); } } #[cfg(not(target_os = "linux"))] -fn madvise_cold(_chunk: &[u64]) {} +fn madvise_range(_chunk: &[u64], _advice: i32) {} #[cfg(target_os = "linux")] #[allow(clippy::as_conversions)] // libc::c_long -> usize is FFI; sysconf returns >0 here @@ -135,6 +145,35 @@ fn copy_range(inner: &SwapInner, off: usize, len: usize, dst: &mut Vec) { } } +pub(crate) fn prefetch_at_swap(handle: &Handle, offset: usize, len: usize) { + let inner = handle + .swap_inner() + .expect("prefetch_at_swap called on non-swap handle"); + let total = inner.total_len(); + let end = offset.checked_add(len).expect("offset+len overflow"); + assert!( + end <= total, + "prefetch range out of bounds: {offset}+{len} > {total}" + ); + if len == 0 { + return; + } + let mut cur = offset; + let mut idx = match inner.prefix.binary_search(&cur) { + Ok(i) => i, + Err(i) => i.saturating_sub(1), + }; + while cur < end { + let chunk_start = inner.prefix[idx]; + let chunk = &inner.chunks[idx]; + let local = cur - chunk_start; + let take = std::cmp::min(end - cur, chunk.len() - local); + madvise_range(&chunk[local..local + take], MADV_WILLNEED); + cur += take; + idx += 1; + } +} + pub(crate) fn take_swap(handle: Handle, dst: &mut Vec) { let inner = match handle.into_swap_inner() { Some(s) => s, @@ -230,4 +269,24 @@ mod tests { take_swap(h, &mut dst); assert_eq!(dst, vec![1, 2, 3, 4, 5]); } + + #[mz_ore::test] + fn prefetch_does_not_corrupt_data() { + let payload: Vec = (0..1024).collect(); + let mut chunks = [payload.clone()]; + let h = pageout_swap(&mut chunks); + 
prefetch_at_swap(&h, 100, 200); + prefetch_at_swap(&h, 0, 1024); + let mut dst = Vec::new(); + read_at_swap(&h, &[(0, 1024)], &mut dst); + assert_eq!(dst, payload); + } + + #[mz_ore::test] + #[should_panic(expected = "out of bounds")] + fn prefetch_panics_on_oob() { + let mut chunks = [vec![1u64, 2]]; + let h = pageout_swap(&mut chunks); + prefetch_at_swap(&h, 0, 99); + } } From 2fd9d3a17456ecb6eb9b24432b8ca84dbacafbf3 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 10:33:46 +0200 Subject: [PATCH 20/34] ore: replace as_conversions with cast_from/cast_lossy/try_from Per project policy, raw `as` is disallowed in favor of mz_ore::cast::CastFrom, mz_ore::cast::CastLossy, or std::convert::TryFrom. The pager's pointer-arithmetic paths now use stable `*const T::addr()` and `byte_add` to keep provenance, with `cast::()` and `cast_mut()` replacing pointer-type `as` casts. FFI integer arguments now go through `try_from` with explicit panics on overflow. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/examples/pager_merge.rs | 36 ++++++++++++++++++--------------- src/ore/src/pager/file.rs | 6 ++---- src/ore/src/pager/swap.rs | 33 ++++++++++++++++-------------- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/ore/examples/pager_merge.rs b/src/ore/examples/pager_merge.rs index 00d6ff7f46266..9dbb680093a70 100644 --- a/src/ore/examples/pager_merge.rs +++ b/src/ore/examples/pager_merge.rs @@ -34,7 +34,7 @@ use std::env; use std::path::PathBuf; use std::time::{Duration, Instant}; -use mz_ore::cast::CastFrom; +use mz_ore::cast::{CastFrom, CastLossy}; use mz_ore::pager::{self, Backend, Handle}; const CHUNK_BYTES: usize = 2 * 1024 * 1024; @@ -45,6 +45,7 @@ const CACHE_LINE_U64: usize = CACHE_LINE_BYTES / 8; fn main() { let args: Vec = env::args().collect(); let chain_gib: usize = parse_arg(&args, "--chain-gib", 16); + let prefetch_depth: usize = parse_arg(&args, "--prefetch-depth", 1); let backend = parse_backend(&args); let scratch: 
PathBuf = env::var_os("MZ_PAGER_SCRATCH") .map(PathBuf::from) @@ -57,7 +58,7 @@ fn main() { let chunks_per_chain = chain_bytes / CHUNK_BYTES; println!( - "backend={backend:?} chain={chain_gib}GiB chunks_per_chain={chunks_per_chain} chunk={CHUNK_BYTES}B" + "backend={backend:?} chain={chain_gib}GiB chunks_per_chain={chunks_per_chain} chunk={CHUNK_BYTES}B prefetch_depth={prefetch_depth}" ); let (chain_a, build_a) = time(|| build_chain(chunks_per_chain)); @@ -74,7 +75,7 @@ fn main() { gib_per_sec(chain_bytes, build_b) ); - let (chain_c, merge_dur) = time(|| merge_pass(chain_a, chain_b)); + let (chain_c, merge_dur) = time(|| merge_pass(chain_a, chain_b, prefetch_depth)); let merged_bytes = chunks_per_chain * 2 * CHUNK_BYTES; println!( "merge: {:.2?} ({:.2} GiB/s through, output_chunks={})", @@ -105,7 +106,7 @@ fn build_chain(n_chunks: usize) -> Vec { chain } -fn merge_pass(a: Vec, b: Vec) -> Vec { +fn merge_pass(a: Vec, b: Vec, prefetch_depth: usize) -> Vec { let n = a.len().min(b.len()); let mut a: Vec> = a.into_iter().map(Some).collect(); let mut b: Vec> = b.into_iter().map(Some).collect(); @@ -113,24 +114,28 @@ fn merge_pass(a: Vec, b: Vec) -> Vec { let mut tmp_a: Vec = Vec::with_capacity(CHUNK_U64); let mut tmp_b: Vec = Vec::with_capacity(CHUNK_U64); let mut sink: u64 = 0; - // Issue the first prefetches so the kernel starts populating page cache / - // swap-in for the very first iteration. - if n > 0 { - if let Some(h) = a[0].as_ref() { + // Maintain a rolling window of `prefetch_depth` outstanding prefetches. + // Issue the initial wave for indices [0, prefetch_depth). + let initial = prefetch_depth.min(n); + for j in 0..initial { + if let Some(h) = a[j].as_ref() { pager::prefetch(h); } - if let Some(h) = b[0].as_ref() { + if let Some(h) = b[j].as_ref() { pager::prefetch(h); } } for i in 0..n { - // Prefetch one chunk ahead of the current pair so I/O overlaps with - // the cache-line touch and the output pageouts below. 
- if i + 1 < n { - if let Some(h) = a[i + 1].as_ref() { + // Each iteration extends the window by one: prefetch index `i + + // prefetch_depth` so that by the time we consume it the kernel has + // had `prefetch_depth` chunks worth of compute time to make pages + // available. + let pf = i + prefetch_depth; + if pf < n { + if let Some(h) = a[pf].as_ref() { pager::prefetch(h); } - if let Some(h) = b[i + 1].as_ref() { + if let Some(h) = b[pf].as_ref() { pager::prefetch(h); } } @@ -181,8 +186,7 @@ fn gib_per_sec(bytes: usize, d: Duration) -> f64 { if secs == 0.0 { return 0.0; } - #[allow(clippy::as_conversions)] // usize -> f64 is intentionally lossy for reporting - let gib = bytes as f64 / (1024.0 * 1024.0 * 1024.0); + let gib = f64::cast_lossy(bytes) / (1024.0 * 1024.0 * 1024.0); gib / secs } diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index 56f4b9677f0f6..faf6088e5bcb3 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -204,10 +204,8 @@ pub(crate) fn prefetch_at_file(handle: &Handle, offset: usize, len: usize) { #[cfg(unix)] fn posix_fadvise_willneed(file: &File, byte_off: u64, byte_len: u64) { use std::os::unix::io::AsRawFd; - #[allow(clippy::as_conversions)] // u64 -> i64/off_t for FFI; values fit by construction - let off = byte_off as i64; - #[allow(clippy::as_conversions)] - let len = byte_len as i64; + let off = i64::try_from(byte_off).expect("scratch file offset fits i64"); + let len = i64::try_from(byte_len).expect("scratch file length fits i64"); // SAFETY: fd is valid for the life of `file`; `posix_fadvise` is a hint and // does not mutate user memory. The return value (errno) is intentionally ignored. 
unsafe { diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 6d5474d0a81ef..b22d27843bf81 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -66,28 +66,31 @@ const MADV_COLD: i32 = 0; const MADV_WILLNEED: i32 = 0; #[cfg(target_os = "linux")] -#[allow(clippy::as_conversions)] // ptr<->usize and *mut c_void casts have no safe wrapper fn madvise_range(chunk: &[u64], advice: libc::c_int) { if chunk.is_empty() { return; } let page = page_size(); - let ptr = chunk.as_ptr() as usize; + let base_ptr = chunk.as_ptr(); + let base_addr = base_ptr.addr(); let len_bytes = chunk.len() * std::mem::size_of::(); - let aligned_start = (ptr + page - 1) & !(page - 1); - let aligned_end = (ptr + len_bytes) & !(page - 1); - if aligned_end <= aligned_start { + let aligned_start_addr = (base_addr + page - 1) & !(page - 1); + let aligned_end_addr = (base_addr + len_bytes) & !(page - 1); + if aligned_end_addr <= aligned_start_addr { return; } - // SAFETY: pointer/length come from a live `&[u64]`; we restrict to a fully - // page-aligned subrange contained within that slice; `madvise` with these - // hints does not mutate the contents. + let aligned_len = aligned_end_addr - aligned_start_addr; + // SAFETY: `aligned_start_addr` lies within `[base_addr, base_addr+len_bytes]`, + // i.e. inside the live `&[u64]`. Reconstructing the pointer via `byte_add` + // preserves provenance. + let aligned_ptr = unsafe { base_ptr.byte_add(aligned_start_addr - base_addr) } + .cast::() + .cast_mut(); + // SAFETY: pointer/length describe a fully page-aligned subrange contained + // within the live `&[u64]`. `madvise` with `MADV_COLD` / `MADV_WILLNEED` + // does not mutate the contents. 
unsafe { - libc::madvise( - aligned_start as *mut libc::c_void, - aligned_end - aligned_start, - advice, - ); + libc::madvise(aligned_ptr, aligned_len, advice); } } @@ -95,10 +98,10 @@ fn madvise_range(chunk: &[u64], advice: libc::c_int) { fn madvise_range(_chunk: &[u64], _advice: i32) {} #[cfg(target_os = "linux")] -#[allow(clippy::as_conversions)] // libc::c_long -> usize is FFI; sysconf returns >0 here fn page_size() -> usize { // SAFETY: `sysconf` with a valid argument is safe. - unsafe { libc::sysconf(libc::_SC_PAGESIZE) as usize } + let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + usize::try_from(raw).expect("page size is positive and fits usize") } #[cfg(not(target_os = "linux"))] From b65569eca88ce79fde6d06c77ca2fe35cd575d39 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 11:03:52 +0200 Subject: [PATCH 21/34] ore: pager merge example takes --threads, partitions chain Each worker gets a 1/threads share of the total chain so working set stays constant across thread counts. Cap=16 GiB / total chain=32 GiB: file backend speeds up at 2 threads (64 -> 46 s, ~1.4x), regresses at 4, recovers at 8; swap backend halves wall at 4 threads (215 -> 127 s) because kernel reclaim overlaps with other workers' compute. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/examples/pager_merge.rs | 78 +++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/src/ore/examples/pager_merge.rs b/src/ore/examples/pager_merge.rs index 9dbb680093a70..33d330aeec9bc 100644 --- a/src/ore/examples/pager_merge.rs +++ b/src/ore/examples/pager_merge.rs @@ -32,6 +32,8 @@ use std::env; use std::path::PathBuf; +use std::sync::{Arc, Barrier}; +use std::thread; use std::time::{Duration, Instant}; use mz_ore::cast::{CastFrom, CastLossy}; @@ -46,6 +48,7 @@ fn main() { let args: Vec = env::args().collect(); let chain_gib: usize = parse_arg(&args, "--chain-gib", 16); let prefetch_depth: usize = parse_arg(&args, "--prefetch-depth", 1); + let threads: usize = parse_arg(&args, "--threads", 1).max(1); let backend = parse_backend(&args); let scratch: PathBuf = env::var_os("MZ_PAGER_SCRATCH") .map(PathBuf::from) @@ -54,38 +57,67 @@ fn main() { pager::set_scratch_dir(scratch); pager::set_backend(backend); - let chain_bytes = chain_gib * 1024 * 1024 * 1024; - let chunks_per_chain = chain_bytes / CHUNK_BYTES; + let total_chain_bytes = chain_gib * 1024 * 1024 * 1024; + let per_thread_chain_bytes = total_chain_bytes / threads; + let chunks_per_chain = per_thread_chain_bytes / CHUNK_BYTES; println!( - "backend={backend:?} chain={chain_gib}GiB chunks_per_chain={chunks_per_chain} chunk={CHUNK_BYTES}B prefetch_depth={prefetch_depth}" + "backend={backend:?} threads={threads} per_thread_chain_chunks={chunks_per_chain} chunk={CHUNK_BYTES}B total_chain={chain_gib}GiB prefetch_depth={prefetch_depth}" ); - let (chain_a, build_a) = time(|| build_chain(chunks_per_chain)); - println!( - "build A: {:.2?} ({:.2} GiB/s)", - build_a, - gib_per_sec(chain_bytes, build_a) - ); + let barrier = Arc::new(Barrier::new(threads)); + let start = Instant::now(); + let mut handles = Vec::with_capacity(threads); + for tid in 0..threads { + let barrier = Arc::clone(&barrier); + 
handles.push(thread::spawn(move || { + run_worker(tid, chunks_per_chain, prefetch_depth, &barrier) + })); + } + let mut per_thread = Vec::with_capacity(threads); + for h in handles { + per_thread.push(h.join().expect("worker panic")); + } + let total = start.elapsed(); - let (chain_b, build_b) = time(|| build_chain(chunks_per_chain)); + // Total bytes through the merge across all threads (each thread reads + // 2 chain shares end-to-end, regardless of thread count). + let total_bytes = chunks_per_chain * threads * 2 * CHUNK_BYTES; println!( - "build B: {:.2?} ({:.2} GiB/s)", - build_b, - gib_per_sec(chain_bytes, build_b) + "wall: {:.2?} ({:.2} GiB/s through)", + total, + gib_per_sec(total_bytes, total) ); + for (tid, t) in per_thread.iter().enumerate() { + println!( + " worker {tid}: build_a={:.2?} build_b={:.2?} merge={:.2?}", + t.build_a, t.build_b, t.merge + ); + } +} - let (chain_c, merge_dur) = time(|| merge_pass(chain_a, chain_b, prefetch_depth)); - let merged_bytes = chunks_per_chain * 2 * CHUNK_BYTES; - println!( - "merge: {:.2?} ({:.2} GiB/s through, output_chunks={})", - merge_dur, - gib_per_sec(merged_bytes, merge_dur), - chain_c.len() - ); +struct WorkerTimings { + build_a: Duration, + build_b: Duration, + merge: Duration, +} - let (_, drop_dur) = time(|| drop(chain_c)); - println!("drop output chain: {:.2?}", drop_dur); +fn run_worker( + _tid: usize, + chunks_per_chain: usize, + prefetch_depth: usize, + barrier: &Barrier, +) -> WorkerTimings { + barrier.wait(); + let (chain_a, build_a) = time(|| build_chain(chunks_per_chain)); + let (chain_b, build_b) = time(|| build_chain(chunks_per_chain)); + let (chain_c, merge) = time(|| merge_pass(chain_a, chain_b, prefetch_depth)); + drop(chain_c); + WorkerTimings { + build_a, + build_b, + merge, + } } fn build_chain(n_chunks: usize) -> Vec { From 2bbe256e7b467d0ee9b46275667b44303f12f2da Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 12:00:14 +0200 Subject: [PATCH 22/34] 
=?UTF-8?q?doc:=20pager=20design=20=E2=80=94=20add?= =?UTF-8?q?=20operational=20characteristics=20with=20measured=20throughput?= =?UTF-8?q?=20and=20perf=20data?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a section that captures the swap-vs-file trade-off as actually observed: file saturates the disk (1.47 GiB/s on encrypted NVMe), swap floors at ~0.36 GiB/s regardless of cap or parallelism. perf stat plus /proc/vmstat deltas show swap loses ~7x sys-time vs file because every 4 KiB readback page-faults synchronously on the user thread (5.2M minor-faults vs 4K, 2.1M pswpin vs 2.2K). Operational guidance: swap when resident, file when spilling. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/design/20260504_pager.md | 363 +++++++++++++++++++++++++ 1 file changed, 363 insertions(+) create mode 100644 doc/developer/design/20260504_pager.md diff --git a/doc/developer/design/20260504_pager.md b/doc/developer/design/20260504_pager.md new file mode 100644 index 0000000000000..cdc7f7e2f91c4 --- /dev/null +++ b/doc/developer/design/20260504_pager.md @@ -0,0 +1,363 @@ +# Pager + +* Associated: [CLU-65](https://linear.app/materializeinc/issue/CLU-65/pager), depends on [CLU-64](https://linear.app/materializeinc/issue/CLU-64/remove-lgalloc-from-columnar) (`Column::Aligned` becomes `Vec`). + +## The problem + +Materialize spills working sets to Linux swap. +The kernel decides which anonymous pages to evict, conflating application state with arbitrary heap allocations and forcing user threads into direct reclaim when memory pressure rises. +Direct reclaim shows up as user-visible latency in dataflow operators and as elevated `pgscan_direct` in `/proc/vmstat` during hydration. +We need an explicit pager so the application can mark cold data, ask for it back on demand, and choose whether the cold storage lives in anonymous memory (with kernel hints) or on a dedicated scratch volume. 
+This unblocks the columnar end-to-end project: once `Column::Aligned` is a `Vec<u64>` (CLU-64), the pager is the natural seam between in-memory columnar buffers and out-of-core storage. + +## Success criteria + +The design succeeds when: + +* Cold columnar data can be paged out and back in via a single API regardless of backend. +* The swap backend introduces no copies on the page-out path; the page-in path costs at most a `madvise` plus the unavoidable read. +* The file backend uses one syscall per logical operation where the OS supports it (`writev`, `pread`/coalesced `pread`). +* Switching backends is a runtime configuration flip, not a recompile or a restart. +* Existing handles remain usable across backend flips. +* The API supports both write-once/read-once 2 MiB-scale spill blocks and write-once/read-many large blobs with random offset and length access. +* Caller-side allocation is reusable: the buffers handed to `pageout` and the buffers passed to `pagein`-style reads return to the caller in a state that preserves their capacity where the backend permits. +* Out-of-core scenarios (working set 2x system RAM) actually offload pages: peak resident set size drops compared to leaving the data resident. + +## Out of scope + +* Eviction policy. + The pager is a mechanism; callers decide what and when to page out. +* Generic element types. + The API is `Vec<u64>` only. + We will revisit if a non-`u64` consumer appears. +* Compression and encryption. + Both can wrap the pager externally. +* Async or `io_uring`. + Sync syscalls only for v1. + An async wrapper is a follow-up if profiles motivate it. +* Cross-handle file pooling (one shared scratch file with a free-list). + Each handle owns one named file in the scratch directory; pooling is a follow-up if inode pressure shows up. +* Non-Linux production support. + Both backends compile on macOS and others as in-memory no-ops, but the production target is Linux.
+ +## Solution proposal + +A single `mz_ore::pager` module exposes a backend-agnostic API around a `Handle` type. +A global atomic selects the backend at `pageout` time; the chosen backend is baked into each handle so live flips do not invalidate existing data. +Two backends ship: `Swap` (keep allocations resident, hint the kernel via `MADV_COLD`) and `File` (write to a named file in a per-process scratch subdirectory). +The API is sync; the file backend uses `writev` and `pread` to keep syscall counts low. +The file backend never holds a file descriptor in the handle: each operation opens, runs syscalls, and closes the fd, so per-handle state is a tiny `(scratch_id, length)` tuple regardless of how many handles a process has alive. + +### Architecture + +```mermaid +flowchart LR + Caller -- "pageout(&mut [Vec])" --> Pager + Pager -- "Backend::current()" --> Sel{Atomic backend} + Sel -- Swap --> SwapBackend["Swap: hold Vec(s), MADV_COLD"] + Sel -- File --> FileBackend["File: writev to scratch/{pid}/{id}.bin, close fd"] + Caller -- "read_at_many(&Handle, ranges, &mut dst)" --> Pager + Caller -- "take(Handle, &mut dst)" --> Pager +``` + +The handle's variant captures which backend produced it, so reads dispatch on the handle alone. +A backend flip changes only future `pageout` calls; a handle taken under `Swap` continues to read from memory after a flip to `File`. + +### API + +```rust +//! src/ore/src/pager.rs + +pub enum Backend { Swap, File } + +/// Sets the active backend for future pageouts. Existing handles are unaffected. +pub fn set_backend(b: Backend); + +pub fn backend() -> Backend; + +/// Configures the scratch directory for the file backend. +/// Must be called before the first file-backend pageout. +pub fn set_scratch_dir(path: PathBuf); + +pub struct Handle { /* private: SwapInner | FileInner */ } + +impl Handle { + /// Logical length in u64s. 
+ pub fn len(&self) -> usize; + pub fn len_bytes(&self) -> usize { self.len() * 8 } + pub fn is_empty(&self) -> bool { self.len() == 0 } +} + +impl Drop for Handle { /* swap: drops Vec(s). file: unlinks the scratch file. */ } + +/// Scatter pageout. Logical layout = chunks concatenated in input order. +/// After return: each Vec in `chunks` is empty. +/// File backend preserves capacity; swap backend moves the alloc into the handle. +/// Empty input returns a `len == 0` handle and performs no I/O. +pub fn pageout(chunks: &mut [Vec<u64>]) -> Handle; + +/// Reads multiple ranges. Output appended to `dst` in request order (concat). +/// Panics if any range is out of bounds. +pub fn read_at_many(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec<u64>); + +/// Single-range convenience. +pub fn read_at(handle: &Handle, offset: usize, len: usize, dst: &mut Vec<u64>); + +/// Consumes handle, writing the entire payload into `dst` (cleared first), then reclaims storage. +/// Swap fast path: single-chunk handle into empty `dst` swaps in place, no copy. +pub fn take(handle: Handle, dst: &mut Vec<u64>); +``` + +### Swap backend + +Storage is `Vec<Vec<u64>>` plus a prefix-sum `Vec<usize>` of cumulative lengths in u64s. + +`pageout`. +Move each input Vec via `mem::take`. +For each chunk, compute the page-aligned subrange of its byte buffer and call `madvise(ptr, len, MADV_COLD)`. +`MADV_COLD` deactivates the pages without freeing them; the kernel reclaims under pressure without a synchronous swap-out. +Skip the syscall when the page-aligned region is empty (sub-page chunks). + +`read_at_many`. +For each range, binary-search the prefix-sum to find the starting chunk, then `extend_from_slice` across chunk boundaries. +Optionally call `MADV_WILLNEED` on the touched pages before the copy. +Output appends in request order to `dst`. + +`take`. +A single-chunk handle paired with an empty `dst` triggers `mem::swap` for a zero-copy take. +Otherwise concat all chunks into `dst`. +The handle drops; the `Vec<Vec<u64>>` reclaims.
+ +### File backend + +Storage per handle is `(scratch_id: u64, len_u64s: usize)`. +File descriptors are never retained: the handle is 16 bytes regardless of file state. +The pager owns a per-process subdirectory `{scratch_dir}/mz-pager-{pid}-{boot_nonce}/` and writes each handle to `{subdir}/{scratch_id}.bin` where `scratch_id` is allocated from a process-wide `AtomicU64`. + +`pageout`. +Allocate `scratch_id`. +`File::create_new(path)` to open exclusively, build an iovec from each chunk's byte slice, issue one `writev` covering all chunks, then close the fd. +Clear each input Vec after the write so the caller keeps capacity. +On I/O failure, log a warning, unlink the path if it was created, and fall back by constructing a `SwapInner` from the same chunks; the contract on input Vecs and the returned handle is unchanged. + +`read_at_many`. +`File::open(path)`, then for each range compute byte offset and byte length and call `pread`. +Coalesce adjacent ranges (`offset_i + len_i == offset_{i+1}`) into a single `pread`. +Append into `dst`, then close the fd. +A future optimization is `preadv2` once a profile shows it matters. + +`take`. +`File::open(path)`, one `pread` for the whole length into `dst`, close the fd, `unlink(path)`, drop the handle. + +`Drop` (without `take`). +`unlink(path)`. The kernel reclaims the inode. + +### Scratch directory lifecycle + +`set_scratch_dir(root)` creates `{root}/mz-pager-{pid}-{boot_nonce}/`. +`boot_nonce` is a random 64-bit value sampled at config time so two processes that briefly share the same pid cannot collide. +On `set_scratch_dir`, the pager runs a reaper that walks `{root}` and removes any sibling `mz-pager-*` subdirectory whose owning pid is no longer alive (`/proc/{pid}` missing on Linux); this reclaims storage from crashed predecessors. +On clean process exit, a `Drop` on the global pager state removes the per-process subdirectory. +The reaper is best-effort and logs failures rather than panicking. 
+ +### Configuration + +Two pieces of global state, both behind atomics or `OnceLock`: + +* `BACKEND: AtomicU8`, set by `set_backend`. +* `SCRATCH_DIR: OnceLock<PathBuf>`, set by `set_scratch_dir` before the first file-backend pageout. Subsequent calls log a warning and become no-ops to avoid mid-run path changes. + +A LaunchDarkly-style param wires `set_backend` from cluster configuration, mirroring `mz_ore::region::ENABLE_LGALLOC_REGION`. + +### Concurrency + +`Handle: Send`. +`pageout` creates a fresh handle and `take` consumes its handle by value, so both are single-threaded with respect to a handle by construction. +`read_at` and `read_at_many` take `&Handle` and are concurrent-safe: the file path uses `pread` (thread-safe positional read); the swap path reads through an immutable `&Vec<u64>`. +`set_backend` racing with `pageout` is benign: `pageout` reads the atomic once at entry, and existing handles keep their backend. + +### Errors + +File I/O failure on `pageout`: log at `warn`, fall back to a swap-backed handle. +File I/O failure on read: panic. +The data lives only on the spill path; partial reads indicate corruption or device loss, both unrecoverable. +Out-of-bounds range in `read_at*`: panic. +Empty input to `pageout`: returns a `len == 0` handle with no syscalls on either backend. + +### File layout + +``` +src/ore/src/pager.rs # Public API, dispatch, Handle type, config +src/ore/src/pager/swap.rs # Swap backend +src/ore/src/pager/file.rs # File backend (per-process subdir, writev, pread) +src/ore/benches/pager.rs # Criterion benches +src/ore/Cargo.toml # Feature `pager`: deps bytemuck, libc, rand, tracing +``` + +The `pager` feature gates the module so non-Linux builds compile. +On non-Linux, both backends degrade to no-op variants that hold data in memory and skip syscalls. + +## Minimal viable prototype + +A working in-tree prototype is the first implementation step: both backends, end-to-end tests, and the benchmark harness described below. +The prototype validates three risks early.
+First, that `MADV_COLD` actually offloads pages under pressure; we measure this by allocating 2x system RAM in handles and watching peak RSS via `/proc/self/status`. +Second, that the file backend's vectored I/O is competitive with swap on sequential workloads; the bench compares 1 x 2 MiB and 64 x 32 KiB layouts on both backends. +Third, that the API survives integration with `Column` post-CLU-64; a follow-up branch wires `Column::Aligned` to `pageout`/`take` end-to-end and runs an existing columnar bench. + +The bench harness lives at `src/ore/benches/pager.rs` and uses Criterion. +Knobs: backend (Swap, File), payload size (4 KiB, 64 KiB, 1 MiB, 16 MiB), chunk count (1, 2, 64), scratch dir (`MZ_PAGER_SCRATCH` env var, default `$TMPDIR`). +Cases: + +* `pageout` wall time, single chunk, varying size. +* `pageout` scatter, fixed total size, varying chunk count, both backends; isolates `writev` benefit. +* `read_at` whole-block after a configurable idle delay so the kernel actually reclaims swap pages. +* `read_at_many` random ranges, 1, 8, 64 ranges per call, sorted vs unsorted to exercise coalescing. +* Sustained round-trip `pageout` -> `read_at` -> drop, measuring ops/s. +* Working-set scenario: 2x system RAM in handles, page out half, read random handles; gated behind `cargo bench --features pager-stress` since CI cannot run it. + +Unit tests in `src/ore/src/pager.rs` and per-backend modules cover round-trip on both backends, scatter/gather correctness (random ranges including overlapping and adjacent), drop-without-read reclaim, the swap fast path for `take` (assert pointer identity), backend flip mid-run (handle taken under one backend reads through after flip), and capacity-preservation rules. +miri runs the swap backend; the file backend skips on miri due to syscalls. 
+ +## Operational characteristics + +A merge-batcher-style example (`src/ore/examples/pager_merge.rs`) builds two chains of 2 MiB chunks, then merges them while reading every cache line of the input. +Run under `systemd-run --user --scope -p MemoryMax=...` to constrain memory and force real eviction. +Numbers below were collected on a Linux box with an encrypted NVMe (~1.4 GB/s sustained R+W ceiling) running the example with `--chain-gib 16` (32 GiB total working set) and `--prefetch-depth 4`. + +### Throughput sweep + +`through` is total bytes pumped through the merge divided by wall time, summed across threads. + +| RAM cap | threads | swap GiB/s | file GiB/s | file/swap | +|--------:|--------:|-----------:|-----------:|----------:| +| 16 G | 1 | 0.15 | 0.50 | 3.4× | +| 16 G | 16 | 0.36 | 1.47 | 4.0× | +| 8 G | 1 | 0.12 | 0.44 | 3.5× | +| 8 G | 16 | 0.37 | 0.79 | 2.1× | +| 4 G | 1 | 0.12 | 0.36 | 3.0× | +| 4 G | 16 | 0.36 | 1.21 | 3.4× | + +Headlines: + +* The file backend can saturate the disk: 1.47 GiB/s at 16 G cap with 16 threads is ~100 % of the encrypted-NVMe ceiling. +* The swap backend caps at ~0.36 GiB/s regardless of cap or parallelism, leaving 70 %+ of the disk capacity unused. +* File scales with parallelism (3× from 1 to 16 threads); swap scales sublinearly and floors near 90 s wall. + +### Why swap stalls + +Single-threaded merge with a 4 GiB chain (8 GiB working set) under a 2 G cap, instrumented via `perf stat` plus `/proc/vmstat` deltas: + +| metric | swap | file | ratio | +|---|---:|---:|---:| +| wall time | 65.5 s | 24.0 s | 2.7× | +| sys time | 63.8 s | 9.0 s | 7.1× | +| user time | 1.5 s | 1.5 s | 1× | +| major-faults | 65,516 | 1,913 | 34× | +| minor-faults | 5,187,868 | 3,986 | 1300× | +| dTLB-load-misses | 42 M | 12 M | 3.4× | +| pswpin (4 KiB pages) | 2.12 M | 2.2 K | 970× | +| pswpout | 3.64 M | 3.1 K | 1180× | + +Of swap's 65 s wall, 64 s is sys time — the kernel runs the user thread's fault handler. 
+2.1 M page-ins through the swap path means every 4 KiB granule of the 8 GiB working set page-faults synchronously on the user thread. +The disk does not become the bottleneck: 8 GiB / 65 s = 130 MB/s, less than 10 % of NVMe capacity. +Page-table churn from `MADV_COLD` reclaim and subsequent re-faulting also drives 3.4× more dTLB misses on the swap path; each unmap broadcasts a TLB-shootdown IPI to every CPU running the task. + +The file backend issues one `writev` per chunk on pageout and one `pread` per coalesced range on read, lets kernel readahead overlap I/O with the user thread's compute, and never pays the per-page fault tax. +9 s of sys time covers all of its kernel work for the same 8 GiB scan. + +### Operational guidance + +* Pick the swap backend when the working set is comfortably resident. + `MADV_COLD` is essentially free in that regime and operations run at memory bandwidth. +* Pick the file backend whenever the working set may exceed RAM. + The kernel I/O pipeline scales; swap-in does not. +* The runtime atomic switch is the right place for an operator-level policy: a controller can flip the global at startup based on cluster size or under a pressure signal. +* Prefetch hints (`prefetch` / `prefetch_at`) help the file backend by ~5 % at depth 16 on this workload; they do not help the swap backend because under pressure the kernel is reclaim-bound, not stall-bound. + +## Alternatives + +### Generic over `T: Pod` + +The columnar use case is `Vec` and the spec is explicit on `&[u64]`. +A generic over `T: bytemuck::Pod` would let callers spill `Vec` or `Vec` without manual casts. +The cost is API-wide: the handle must track element size, the file backend must validate alignment on read, and `take` round-tripping a `Vec` cannot return a `Vec` without reallocation. +The simpler `u64`-only design wins until a concrete non-`u64` consumer appears; an additive `pageout_pod` later would not break existing callers. 
+ +### Per-pager configuration instead of a global atomic + +A `Pager` struct constructed with its backend would compose better than a global, especially in tests. +But the project intent is "the cluster runs on swap or on file, not both at once", and a global atomic encodes that operational reality directly. +A per-pager design would either duplicate the global flag at the struct level or invite confusion about which configuration wins. +We can add a per-instance constructor later for tests if the global proves awkward; the global stays as the production path. + +### One pager per use case (transient spill vs long-lived blob) + +The two named use cases differ only in handle lifetime and access pattern, not in storage. +Both want `pageout` once, both want random-offset reads, both want `Drop` to reclaim. +Two pagers would duplicate the global flag, the scratch directory, and the syscall code paths. +One pager covers both with the same API; UC1 calls `pageout` then `take`; UC2 calls `pageout` once and `read_at` many times. + +### Async API + +The dataflow callers run on synchronous timely worker threads, and bridging async out of an operator costs context switches and complicates lifetimes. +Sync `pread`/`writev` on a thread that already exists is the simplest correct choice for v1. +An async wrapper that offloads to a blocking pool can be added later without breaking the sync core. + +### `MADV_PAGEOUT` / `MADV_SWAPOUT` + +`MADV_PAGEOUT` (Linux 5.4+) actively reclaims pages synchronously, which is the closest to the file backend's eager-write semantics. +The cost is a synchronous, expensive syscall that we would issue from operator threads; under pressure this re-creates the direct-reclaim problem we are trying to escape. +`MADV_COLD` deactivates pages and lets the kernel reclaim asynchronously when it actually wants to, which matches our goal of moving work off the user thread. 
+We pick `MADV_COLD` for v1; if profiles show pages are not reclaimed quickly enough, switching to `MADV_PAGEOUT` is a one-line change behind a feature gate. + +### `O_TMPFILE` with the fd held in the handle + +`O_TMPFILE` creates an unnamed inode that auto-deletes when the last fd closes, so it would skip the explicit `unlink` step on reclaim. +The cost is one fd per handle: 100k live handles would exhaust the process fd ulimit and require an OS configuration change to operate at scale. +The chosen design instead opens a named file, writes, and closes the fd within `pageout`, so the handle holds 16 bytes and no fd, and reads reopen as needed. + +### Single shared scratch file with an offset table + +One file with handle-owned offsets reduces inode pressure and enables better physical layout. +It also requires a free-list, fragmentation handling, and a different reclaim story (truncation does not free arbitrary middles). +The complexity is unjustified at expected handle counts; revisit if the total inode count (one per handle) becomes a measurable bottleneck. + +## Open questions + +* Should `pageout` accept `impl IntoIterator<Item = Vec<u64>>` for ergonomics? + Iterators lose caller capacity reuse on the file path because we have no `&mut` access to put the cleared Vec back. + Recommend sticking with `&mut [Vec<u64>]`; revisit if the slice form proves awkward at call sites. + +* Should `set_backend` be allowed to flip multiple times during a process lifetime? + Yes for v1, since live LaunchDarkly flips are an explicit goal. + The per-handle stability rule (existing handles keep their backend) keeps this safe; document it in the rustdoc. + +* Should we add `read_at_into(&Handle, offset: usize, dst: &mut [u64])` for callers that have a sized buffer and do not want `Vec` semantics? + The columnar consumer is `Vec<u64>` and `read_at` already reuses the caller's allocation via `&mut Vec<u64>`. + Defer; add additively if a caller needs slice-only reads.
+ +* Should we add `pageout_each(&mut [Vec]) -> Vec` for timely-spill-style integration? + Timely's `BytesSpill` needs one handle per chunk; with the fd-less file backend, a "shared file under N handles" design becomes "N handles each pointing at independent files" and the `writev`-batching benefit disappears. + An alternative is `pageout_each` that writes all chunks into one shared scratch file and returns N handles each carrying `(scratch_id, byte_offset, byte_len)` plus a refcount so the file is unlinked once all handles drop. + Defer until the timely-spill integration concretely wants it; the existing `pageout` already covers the columnar primary use case. + +## Interaction with timely-dataflow spill + +Timely's `MergeQueue` exposes `BytesSpill`/`BytesFetch` traits ([PR 791](https://github.com/TimelyDataflow/timely-dataflow/pull/791) demonstrates the file-backed strategy). +The shapes differ: timely's `spill(&mut Vec, &mut Vec>)` takes N chunks and returns N independent fetch handles; the pager takes N chunks and returns one composite handle. +A `mz_timely_util::spill` adapter is the integration point, not the pager itself. +The adapter can be implemented in two ways once both pieces exist. + +The simple form calls `pager::pageout` once per chunk and stores each `pager::Handle` inside a `BytesFetch` impl. +This costs one scratch file per chunk; for 256 KiB chunks at 50 GiB total, that's roughly 200k inodes, which is workable on tmpfs but stressful on disk filesystems. +The richer form is the deferred `pageout_each` API above, which lets one writev produce one file with N handles and matches timely's design exactly. + +The element-type boundary needs care. +Timely passes `bytes::arc::Bytes` (byte-aligned); the pager wants `Vec` (8-byte aligned). +Materialize's columnar serialization already produces 8-byte-aligned bytes, so the adapter can cast where alignment is statically guaranteed and copy where it is not. 
+A future enhancement is a parallel byte-oriented pager API; deferred until the adapter exists and motivates it. + +The pager's swap backend is novel relative to timely's example: timely's "no-spill" baseline relies on the OS to manage memory, while the pager actively hints `MADV_COLD`. +This makes the swap backend an additional spill strategy that timely does not currently offer, suitable for cases where eager file write is too expensive but kernel-driven reclaim alone is too slow. From f6878e32c6f8d1d9b8d48815e3f031d8af879867 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 12:01:17 +0200 Subject: [PATCH 23/34] ore: gate pager_merge example on the pager feature Without `required-features` cargo tries to build the example with the default feature set, where `#![cfg(feature = "pager")]` strips the entire file and leaves no `main`. Declare the feature requirement so the example is skipped when the feature is off. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ore/Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ore/Cargo.toml b/src/ore/Cargo.toml index d446f694bc2d4..583b117f655f1 100644 --- a/src/ore/Cargo.toml +++ b/src/ore/Cargo.toml @@ -174,6 +174,10 @@ name = "pager" harness = false required-features = ["pager"] +[[example]] +name = "pager_merge" +required-features = ["pager"] + [package.metadata.cargo-udeps.ignore] # Only used in doc-tests. development = ["tokio-test"] From ea3a51e0ea691472997829c94c127a4cc62b18f7 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 12:19:10 +0200 Subject: [PATCH 24/34] ore: drop pager prefetch API and example usage Measured ~5% improvement on the file path at depth 16, within run-to-run variance, and zero on the swap path. Not worth the API surface for v1. Kernel readahead handles the file path adequately; swap is reclaim-bound under pressure and prefetch can't help. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/design/20260504_pager.md | 1 - src/ore/examples/pager_merge.rs | 42 +++------------- src/ore/src/pager.rs | 19 ------- src/ore/src/pager/file.rs | 58 --------------------- src/ore/src/pager/swap.rs | 70 ++------------------------ 5 files changed, 11 insertions(+), 179 deletions(-) diff --git a/doc/developer/design/20260504_pager.md b/doc/developer/design/20260504_pager.md index cdc7f7e2f91c4..41a4e5952f2c5 100644 --- a/doc/developer/design/20260504_pager.md +++ b/doc/developer/design/20260504_pager.md @@ -274,7 +274,6 @@ The file backend issues one `writev` per chunk on pageout and one `pread` per co * Pick the file backend whenever the working set may exceed RAM. The kernel I/O pipeline scales; swap-in does not. * The runtime atomic switch is the right place for an operator-level policy: a controller can flip the global at startup based on cluster size or under a pressure signal. -* Prefetch hints (`prefetch` / `prefetch_at`) help the file backend by ~5 % at depth 16 on this workload; they do not help the swap backend because under pressure the kernel is reclaim-bound, not stall-bound. ## Alternatives diff --git a/src/ore/examples/pager_merge.rs b/src/ore/examples/pager_merge.rs index 33d330aeec9bc..3903b37e1357d 100644 --- a/src/ore/examples/pager_merge.rs +++ b/src/ore/examples/pager_merge.rs @@ -25,7 +25,7 @@ //! cargo build --release --features pager --example pager_merge //! systemd-run --user --scope -p MemoryMax=16G -p MemorySwapMax=64G --quiet \ //! --setenv=MZ_PAGER_SCRATCH=/path/to/scratch \ -//! -- target/release/examples/pager_merge --chain-gib 16 --backend swap +//! -- target/release/examples/pager_merge --chain-gib 16 --backend swap --threads 1 //! 
``` #![cfg(feature = "pager")] @@ -47,7 +47,6 @@ const CACHE_LINE_U64: usize = CACHE_LINE_BYTES / 8; fn main() { let args: Vec = env::args().collect(); let chain_gib: usize = parse_arg(&args, "--chain-gib", 16); - let prefetch_depth: usize = parse_arg(&args, "--prefetch-depth", 1); let threads: usize = parse_arg(&args, "--threads", 1).max(1); let backend = parse_backend(&args); let scratch: PathBuf = env::var_os("MZ_PAGER_SCRATCH") @@ -62,7 +61,7 @@ fn main() { let chunks_per_chain = per_thread_chain_bytes / CHUNK_BYTES; println!( - "backend={backend:?} threads={threads} per_thread_chain_chunks={chunks_per_chain} chunk={CHUNK_BYTES}B total_chain={chain_gib}GiB prefetch_depth={prefetch_depth}" + "backend={backend:?} threads={threads} per_thread_chain_chunks={chunks_per_chain} chunk={CHUNK_BYTES}B total_chain={chain_gib}GiB" ); let barrier = Arc::new(Barrier::new(threads)); @@ -71,7 +70,7 @@ fn main() { for tid in 0..threads { let barrier = Arc::clone(&barrier); handles.push(thread::spawn(move || { - run_worker(tid, chunks_per_chain, prefetch_depth, &barrier) + run_worker(tid, chunks_per_chain, &barrier) })); } let mut per_thread = Vec::with_capacity(threads); @@ -102,16 +101,11 @@ struct WorkerTimings { merge: Duration, } -fn run_worker( - _tid: usize, - chunks_per_chain: usize, - prefetch_depth: usize, - barrier: &Barrier, -) -> WorkerTimings { +fn run_worker(_tid: usize, chunks_per_chain: usize, barrier: &Barrier) -> WorkerTimings { barrier.wait(); let (chain_a, build_a) = time(|| build_chain(chunks_per_chain)); let (chain_b, build_b) = time(|| build_chain(chunks_per_chain)); - let (chain_c, merge) = time(|| merge_pass(chain_a, chain_b, prefetch_depth)); + let (chain_c, merge) = time(|| merge_pass(chain_a, chain_b)); drop(chain_c); WorkerTimings { build_a, @@ -138,7 +132,7 @@ fn build_chain(n_chunks: usize) -> Vec { chain } -fn merge_pass(a: Vec, b: Vec, prefetch_depth: usize) -> Vec { +fn merge_pass(a: Vec, b: Vec) -> Vec { let n = a.len().min(b.len()); let mut 
a: Vec> = a.into_iter().map(Some).collect(); let mut b: Vec> = b.into_iter().map(Some).collect(); @@ -146,31 +140,7 @@ fn merge_pass(a: Vec, b: Vec, prefetch_depth: usize) -> Vec = Vec::with_capacity(CHUNK_U64); let mut tmp_b: Vec = Vec::with_capacity(CHUNK_U64); let mut sink: u64 = 0; - // Maintain a rolling window of `prefetch_depth` outstanding prefetches. - // Issue the initial wave for indices [0, prefetch_depth). - let initial = prefetch_depth.min(n); - for j in 0..initial { - if let Some(h) = a[j].as_ref() { - pager::prefetch(h); - } - if let Some(h) = b[j].as_ref() { - pager::prefetch(h); - } - } for i in 0..n { - // Each iteration extends the window by one: prefetch index `i + - // prefetch_depth` so that by the time we consume it the kernel has - // had `prefetch_depth` chunks worth of compute time to make pages - // available. - let pf = i + prefetch_depth; - if pf < n { - if let Some(h) = a[pf].as_ref() { - pager::prefetch(h); - } - if let Some(h) = b[pf].as_ref() { - pager::prefetch(h); - } - } let ha = a[i].take().expect("handle a present"); let hb = b[i].take().expect("handle b present"); pager::take(ha, &mut tmp_a); diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 09100c1c2d3d9..15cbf8bb20024 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -158,25 +158,6 @@ pub fn read_at(handle: &Handle, offset: usize, len: usize, dst: &mut Vec) { read_at_many(handle, &[(offset, len)], dst); } -/// Hints that the entire payload will be read soon. Best-effort. -/// Swap backend issues `madvise(MADV_WILLNEED)`; file backend issues -/// `posix_fadvise(POSIX_FADV_WILLNEED)`. Both are async — the call returns -/// promptly and the kernel populates pages or page cache in the background. -/// Useful for overlapping I/O with computation in pipelines that know which -/// handles they will read next. 
-pub fn prefetch(handle: &Handle) { - prefetch_at(handle, 0, handle.len()); -} - -/// Hints that the range `[offset, offset+len)` of the handle will be read soon. -/// Panics if the range is out of bounds. -pub fn prefetch_at(handle: &Handle, offset: usize, len: usize) { - match &handle.inner { - HandleInner::Swap(_) => swap::prefetch_at_swap(handle, offset, len), - HandleInner::File(_) => file::prefetch_at_file(handle, offset, len), - } -} - /// Consumes handle, writing the entire payload into `dst` (cleared first), then reclaims storage. /// Swap fast path: single-chunk handle into empty `dst` swaps in place, no copy. pub fn take(handle: Handle, dst: &mut Vec) { diff --git a/src/ore/src/pager/file.rs b/src/ore/src/pager/file.rs index faf6088e5bcb3..1e468d03f5478 100644 --- a/src/ore/src/pager/file.rs +++ b/src/ore/src/pager/file.rs @@ -180,42 +180,6 @@ fn write_all_vectored(mut file: &File, mut slices: &mut [IoSlice<'_>]) -> std::i Ok(()) } -pub(crate) fn prefetch_at_file(handle: &Handle, offset: usize, len: usize) { - let inner = handle - .file_inner() - .expect("prefetch_at_file called on non-file handle"); - let total = inner.len_u64s; - let end = offset.checked_add(len).expect("offset+len overflow"); - assert!( - end <= total, - "prefetch range out of bounds: {offset}+{len} > {total}" - ); - if len == 0 { - return; - } - let path = scratch_path(inner.id); - let Ok(file) = File::open(&path) else { - // Best-effort hint; if the file is gone the next read will surface the error. - return; - }; - posix_fadvise_willneed(&file, u64::cast_from(offset * 8), u64::cast_from(len * 8)); -} - -#[cfg(unix)] -fn posix_fadvise_willneed(file: &File, byte_off: u64, byte_len: u64) { - use std::os::unix::io::AsRawFd; - let off = i64::try_from(byte_off).expect("scratch file offset fits i64"); - let len = i64::try_from(byte_len).expect("scratch file length fits i64"); - // SAFETY: fd is valid for the life of `file`; `posix_fadvise` is a hint and - // does not mutate user memory. 
The return value (errno) is intentionally ignored. - unsafe { - libc::posix_fadvise(file.as_raw_fd(), off, len, libc::POSIX_FADV_WILLNEED); - } -} - -#[cfg(not(unix))] -fn posix_fadvise_willneed(_file: &File, _byte_off: u64, _byte_len: u64) {} - pub(crate) fn read_at_file(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec) { use std::os::unix::fs::FileExt; @@ -383,28 +347,6 @@ mod backend_tests { drop(h); assert!(!path.exists(), "scratch file should be unlinked on drop"); } - - #[mz_ore::test] - fn file_prefetch_does_not_corrupt_data() { - setup_dir(); - let payload: Vec = (0..1024).collect(); - let mut chunks = [payload.clone()]; - let h = pageout_file(&mut chunks); - prefetch_at_file(&h, 100, 200); - prefetch_at_file(&h, 0, 1024); - let mut dst = Vec::new(); - read_at_file(&h, &[(0, 1024)], &mut dst); - assert_eq!(dst, payload); - } - - #[mz_ore::test] - #[should_panic(expected = "out of bounds")] - fn file_prefetch_panics_on_oob() { - setup_dir(); - let mut chunks = [vec![1u64, 2]]; - let h = pageout_file(&mut chunks); - prefetch_at_file(&h, 0, 99); - } } #[cfg(test)] diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index b22d27843bf81..9c87cb24c20fa 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -50,23 +50,13 @@ pub(crate) fn pageout_swap(chunks: &mut [Vec]) -> Handle { taken.push(std::mem::take(c)); } for c in &taken { - madvise_range(c, MADV_COLD); + madvise_cold(c); } Handle::from_swap(SwapInner::new(taken)) } #[cfg(target_os = "linux")] -const MADV_COLD: libc::c_int = libc::MADV_COLD; -#[cfg(target_os = "linux")] -const MADV_WILLNEED: libc::c_int = libc::MADV_WILLNEED; - -#[cfg(not(target_os = "linux"))] -const MADV_COLD: i32 = 0; -#[cfg(not(target_os = "linux"))] -const MADV_WILLNEED: i32 = 0; - -#[cfg(target_os = "linux")] -fn madvise_range(chunk: &[u64], advice: libc::c_int) { +fn madvise_cold(chunk: &[u64]) { if chunk.is_empty() { return; } @@ -87,15 +77,14 @@ fn madvise_range(chunk: &[u64], advice: 
libc::c_int) { .cast::() .cast_mut(); // SAFETY: pointer/length describe a fully page-aligned subrange contained - // within the live `&[u64]`. `madvise` with `MADV_COLD` / `MADV_WILLNEED` - // does not mutate the contents. + // within the live `&[u64]`. `MADV_COLD` does not mutate the contents. unsafe { - libc::madvise(aligned_ptr, aligned_len, advice); + libc::madvise(aligned_ptr, aligned_len, libc::MADV_COLD); } } #[cfg(not(target_os = "linux"))] -fn madvise_range(_chunk: &[u64], _advice: i32) {} +fn madvise_cold(_chunk: &[u64]) {} #[cfg(target_os = "linux")] fn page_size() -> usize { @@ -148,35 +137,6 @@ fn copy_range(inner: &SwapInner, off: usize, len: usize, dst: &mut Vec) { } } -pub(crate) fn prefetch_at_swap(handle: &Handle, offset: usize, len: usize) { - let inner = handle - .swap_inner() - .expect("prefetch_at_swap called on non-swap handle"); - let total = inner.total_len(); - let end = offset.checked_add(len).expect("offset+len overflow"); - assert!( - end <= total, - "prefetch range out of bounds: {offset}+{len} > {total}" - ); - if len == 0 { - return; - } - let mut cur = offset; - let mut idx = match inner.prefix.binary_search(&cur) { - Ok(i) => i, - Err(i) => i.saturating_sub(1), - }; - while cur < end { - let chunk_start = inner.prefix[idx]; - let chunk = &inner.chunks[idx]; - let local = cur - chunk_start; - let take = std::cmp::min(end - cur, chunk.len() - local); - madvise_range(&chunk[local..local + take], MADV_WILLNEED); - cur += take; - idx += 1; - } -} - pub(crate) fn take_swap(handle: Handle, dst: &mut Vec) { let inner = match handle.into_swap_inner() { Some(s) => s, @@ -272,24 +232,4 @@ mod tests { take_swap(h, &mut dst); assert_eq!(dst, vec![1, 2, 3, 4, 5]); } - - #[mz_ore::test] - fn prefetch_does_not_corrupt_data() { - let payload: Vec = (0..1024).collect(); - let mut chunks = [payload.clone()]; - let h = pageout_swap(&mut chunks); - prefetch_at_swap(&h, 100, 200); - prefetch_at_swap(&h, 0, 1024); - let mut dst = Vec::new(); - 
read_at_swap(&h, &[(0, 1024)], &mut dst); - assert_eq!(dst, payload); - } - - #[mz_ore::test] - #[should_panic(expected = "out of bounds")] - fn prefetch_panics_on_oob() { - let mut chunks = [vec![1u64, 2]]; - let h = pageout_swap(&mut chunks); - prefetch_at_swap(&h, 0, 99); - } } From 0f6602f84a2d8cb659de556b139cd772b0f65691 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 5 May 2026 15:20:29 +0000 Subject: [PATCH 25/34] =?UTF-8?q?doc:=20pager=20design=20=E2=80=94=20add?= =?UTF-8?q?=20r8gd.16xlarge=20bench,=20retract=20swap-caps-regardless-of-p?= =?UTF-8?q?arallelism=20claim?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The earlier "swap caps at ~0.36 GiB/s regardless of cap or parallelism" headline holds only at low thread counts. On a 64 vCPU box with two striped local NVMes, swap-backend merge scales 13× from 1 → 64 threads and reaches ~75% of file-backend throughput, because enough independent direct-reclaim contexts run in parallel to keep the swap stripe nearly busy. Reorganize the operational characteristics section into two benches — encrypted NVMe (1.4 GB/s ceiling) and r8gd.16xlarge with striped instance NVMe (~7 GB/s ceiling) — and add file-backend (1 TiB / cap 256G) and swap-backend (128 GiB / cap 32G) thread-scaling tables for the second. Operational guidance now distinguishes low-thread (file wins ~3–5×) from high-thread (within ~25%) regimes and calls out the multi-tenant RSS argument as a separate reason to prefer file regardless of throughput. Drop the dead --prefetch-depth 4 reference; that flag was removed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/design/20260504_pager.md | 69 ++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/doc/developer/design/20260504_pager.md b/doc/developer/design/20260504_pager.md index 41a4e5952f2c5..ba327504dbaa9 100644 --- a/doc/developer/design/20260504_pager.md +++ b/doc/developer/design/20260504_pager.md @@ -223,12 +223,16 @@ miri runs the swap backend; the file backend skips on miri due to syscalls. A merge-batcher-style example (`src/ore/examples/pager_merge.rs`) builds two chains of 2 MiB chunks, then merges them while reading every cache line of the input. Run under `systemd-run --user --scope -p MemoryMax=...` to constrain memory and force real eviction. -Numbers below were collected on a Linux box with an encrypted NVMe (~1.4 GB/s sustained R+W ceiling) running the example with `--chain-gib 16` (32 GiB total working set) and `--prefetch-depth 4`. - -### Throughput sweep +Pager behavior depends sharply on disk topology. +Slow-storage boxes hit a kernel reclaim ceiling well below the disk and the swap backend looks catastrophic; fast-storage boxes expose new bottlenecks and the swap-vs-file gap collapses at high thread counts. +Two benches below: a single encrypted NVMe (~1.4 GB/s ceiling, akin to typical EBS-backed clusters) and an r8gd.16xlarge with two striped local instance NVMes (~7 GB/s ceiling). `through` is total bytes pumped through the merge divided by wall time, summed across threads. +### Bench A: encrypted NVMe, 32 GiB working set + +Single encrypted NVMe, ~1.4 GB/s sustained R+W ceiling. `--chain-gib 16` (32 GiB working set). 
+ | RAM cap | threads | swap GiB/s | file GiB/s | file/swap | |--------:|--------:|-----------:|-----------:|----------:| | 16 G | 1 | 0.15 | 0.50 | 3.4× | @@ -238,13 +242,53 @@ Numbers below were collected on a Linux box with an encrypted NVMe (~1.4 GB/s su | 4 G | 1 | 0.12 | 0.36 | 3.0× | | 4 G | 16 | 0.36 | 1.21 | 3.4× | -Headlines: +* File saturates the disk: 1.47 GiB/s at 16 G cap, 16 threads ≈ NVMe ceiling. +* Swap caps at ~0.36 GiB/s, leaving 70 %+ of disk capacity unused. +* File scales with parallelism (3× from 1 → 16 threads); swap floors near 90 s wall. + +### Bench B: r8gd.16xlarge, striped instance NVMe + +64 vCPU, 512 GiB RAM, 2× 1.7 TiB local instance NVMe partitioned 50/50: half striped via mdadm RAID0 for ext4 scratch (~7 GB/s combined sequential), other halves attached as swap with equal priority (kernel-side striping). +Pressure ratio fixed at 4× (workload : RAM cap). + +#### File backend, 1 TiB working set (`--chain-gib 512`, cap = 256 G) + +| threads | wall | overall GiB/s | merge phase | +|--------:|-------:|--------------:|------------:| +| 1 | 1195 s | 0.86 | 888 s | +| 4 | 598 s | 1.71 | 301 s | +| 16 | 593 s | 1.73 | 310 s | +| 32 | 591 s | 1.73 | 310 s | +| 64 | 617 s | 1.66 | 336 s | -* The file backend can saturate the disk: 1.47 GiB/s at 16 G cap with 16 threads is ~100 % of the encrypted-NVMe ceiling. -* The swap backend caps at ~0.36 GiB/s regardless of cap or parallelism, leaving 70 %+ of the disk capacity unused. -* File scales with parallelism (3× from 1 to 16 threads); swap scales sublinearly and floors near 90 s wall. +File backend saturates at 4 threads; beyond that, scheduling overhead slightly hurts. +Combined disk during merge ≈ 7 GB/s (~3.5 read + ~3.5 write), at the two-NVMe stripe ceiling. +Build-phase write rate hits the same ceiling even at 1 thread, so build is disk-bound regardless of parallelism — only merge benefits from concurrency. 
-### Why swap stalls +#### Swap backend, 128 GiB working set (`--chain-gib 64`, cap = 32 G) + +| threads | wall | overall GiB/s | merge GiB/s | +|--------:|------:|--------------:|------------:| +| 1 | 773 s | 0.17 | 0.19 | +| 4 | 229 s | 0.56 | 0.69 | +| 16 | 106 s | 1.21 | 1.87 | +| 32 | 88 s | 1.45 | 2.45 | +| 64 | 85 s | 1.51 | 2.51 | + +Swap-backend merge scales 13× from 1 → 64 threads, plateauing at ~2.5 GiB/s through (~5 GB/s combined disk). +That is ~75 % of the file backend's saturation throughput on the same hardware. +Below ~16 threads the kernel serializes on per-memcg reclaim and we see Bench-A-like floors; above 32 threads, enough independent direct-reclaim contexts run in parallel to keep the swap stripe nearly busy. + +### Headlines (revised) + +* The "swap caps at 0.36 GiB/s regardless of parallelism" claim from Bench A is true only at low thread counts. + With 64 threads each entering direct reclaim independently, swap-backend merge reaches ~2.5 GiB/s on fast storage. +* File backend still wins three ways: higher peak throughput on fast disk (swap trails by ~25 %), far lower RSS (376 MB at 64 threads vs the cgroup cap pinned by swap), and consistent performance below 4 threads where swap collapses. +* Hardware floor matters more than expected. + On 1.4 GB/s encrypted NVMe, kernel reclaim is 4× below disk and swap is dominated. + On 7 GB/s striped local NVMe, reclaim is no longer the bottleneck at high parallelism. + +### Why swap stalls (single-thread regime) Single-threaded merge with a 4 GiB chain (8 GiB working set) under a 2 G cap, instrumented via `perf stat` plus `/proc/vmstat` deltas: @@ -267,12 +311,17 @@ Page-table churn from `MADV_COLD` reclaim and subsequent re-faulting also drives The file backend issues one `writev` per chunk on pageout and one `pread` per coalesced range on read, lets kernel readahead overlap I/O with the user thread's compute, and never pays the per-page fault tax.
9 s of sys time covers all of its kernel work for the same 8 GiB scan. +The single-thread analysis generalizes: per-thread fault cost bounds swap throughput, and the only escape is enough threads in flight that the kernel can run multiple reclaim contexts concurrently. +Bench B confirms this — at 64 threads on fast disks, swap reaches ~75 % of file throughput rather than the ~10 % implied by single-thread numbers. + ### Operational guidance * Pick the swap backend when the working set is comfortably resident. `MADV_COLD` is essentially free in that regime and operations run at memory bandwidth. -* Pick the file backend whenever the working set may exceed RAM. - The kernel I/O pipeline scales; swap-in does not. +* Pick the file backend whenever the working set may exceed RAM **and** the workload runs with few enough threads that swap-merge would serialize on reclaim. + Below ~16 threads, the file backend wins by ~3–5×. +* For highly parallel workloads (vCPU-count threads) on fast local storage, swap and file are within ~25 % of each other on throughput; the choice can shift toward swap if the operator prefers not to provision separate scratch — at the cost of pinning the cap's worth of RAM. +* Pick the file backend if multi-tenant memory pressure matters: file-backend RSS stays at the working window (hundreds of MB), while swap-backend RSS pins to the cgroup cap (hundreds of GB). * The runtime atomic switch is the right place for an operator-level policy: a controller can flip the global at startup based on cluster size or under a pressure signal. ## Alternatives From 8844e0fac1601edee68d486dc69b6af06b50fc9f Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Thu, 14 May 2026 16:49:48 +0200 Subject: [PATCH 26/34] ore: pageout_with helper for explicit-backend dispatch Adds `pageout_with(backend, chunks)` alongside `pageout`. 
Lets callers select the backend per call instead of going through the global atomic, so layered consumers (next commit's column-pager) can route swap and file pageouts independently without racing other writers. Co-Authored-By: Claude Opus 4.7 --- src/ore/src/pager.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ore/src/pager.rs b/src/ore/src/pager.rs index 15cbf8bb20024..75b4bc1f38dd1 100644 --- a/src/ore/src/pager.rs +++ b/src/ore/src/pager.rs @@ -135,10 +135,17 @@ pub fn set_backend(b: Backend) { /// File backend preserves capacity; swap backend moves the alloc into the handle. /// Empty input returns a `len == 0` handle and performs no I/O. pub fn pageout(chunks: &mut [Vec]) -> Handle { + pageout_with(backend(), chunks) +} + +/// Same as [`pageout`], but selects the backend explicitly. Bypasses the global +/// atomic so callers (such as the column-pager layer) can dispatch per call +/// without racing other writers. +pub fn pageout_with(b: Backend, chunks: &mut [Vec]) -> Handle { if total_len(chunks) == 0 { return Handle::from_swap(SwapInner::new(Vec::new())); } - match backend() { + match b { Backend::Swap => swap::pageout_swap(chunks), Backend::File => file::pageout_file(chunks), } From 1274f631f29148c8c7ee2b8c1bc5fa0f34483b9c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Thu, 14 May 2026 16:50:01 +0200 Subject: [PATCH 27/34] timely-util: column_pager with policy + lz4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bridges `mz_ore::pager` to typed `Column` via `ContainerBytes`. Callers drain a column into a `PagedColumn` and rehydrate on demand; backend and compression are decided per call by an injected `PagingPolicy`, not the pager's global atomic. Three resting variants cover the matrix: * `Resident(Column)` — policy returned `Skip`. * `Paged { handle, meta }` — raw u64-aligned bytes via `pager::Handle`. 
* `Compressed { inner, meta }` — lz4 frame; bytes live either in memory or in a `pager::Handle` (padded to u64). Fast paths: * `Column::Align(Vec)` uncompressed — moves the body Vec into the handle, no copy on the swap backend. * Compressed — `FrameEncoder` wraps the target so `into_bytes` streams serialized bytes straight through lz4 with no uncompressed staging. * Compressed file — the frame trailer self-delimits, so no `compressed_len` field and no unpad on read. Tests cover skip, swap/file × uncompressed/lz4 round trips, and the align-variant fast path. Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 2 + src/timely-util/Cargo.toml | 4 +- src/timely-util/src/column_pager.rs | 512 ++++++++++++++++++++++++++++ src/timely-util/src/lib.rs | 1 + 4 files changed, 518 insertions(+), 1 deletion(-) create mode 100644 src/timely-util/src/column_pager.rs diff --git a/Cargo.lock b/Cargo.lock index 58b881b8f2a9f..bbd2d0115d562 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8351,11 +8351,13 @@ dependencies = [ "either", "futures-util", "lgalloc", + "lz4_flex", "mz-ore", "num-traits", "proptest", "rand 0.9.4", "serde", + "tempfile", "timely", "tokio", "tracing", diff --git a/src/timely-util/Cargo.toml b/src/timely-util/Cargo.toml index 2d3b74ecdcb53..24f775de78ef0 100644 --- a/src/timely-util/Cargo.toml +++ b/src/timely-util/Cargo.toml @@ -31,7 +31,8 @@ differential-dataflow.workspace = true either.workspace = true futures-util.workspace = true lgalloc.workspace = true -mz-ore = { path = "../ore", default-features = false, features = ["async", "process", "tracing", "test", "num-traits", "region", "differential-dataflow", "overflowing"] } +lz4_flex.workspace = true +mz-ore = { path = "../ore", default-features = false, features = ["async", "process", "tracing", "test", "num-traits", "region", "differential-dataflow", "overflowing", "pager"] } num-traits.workspace = true serde.workspace = true timely.workspace = true @@ -48,6 +49,7 @@ allocation-counter = { workspace = true, 
optional = true } criterion.workspace = true proptest.workspace = true rand.workspace = true +tempfile.workspace = true [features] default = ["mz-ore/default"] diff --git a/src/timely-util/src/column_pager.rs b/src/timely-util/src/column_pager.rs new file mode 100644 index 0000000000000..1e1ca4076a796 --- /dev/null +++ b/src/timely-util/src/column_pager.rs @@ -0,0 +1,512 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Column-aware pager. Pages [`Column`] instances out via [`mz_ore::pager`], +//! optionally compressing with lz4. +//! +//! The pager (`mz_ore::pager`) deals in `Vec` blobs and two backends. This +//! module adds: +//! +//! 1. A [`PagingPolicy`] trait that decides _whether_ to page out, _which +//! backend_, and _whether to compress_. Decisions live in the policy +//! implementation, not in the global atomic the pager exposes. +//! 2. A [`ColumnPager`] that drains a `Column` into a [`PagedColumn`] and +//! rehydrates it on demand. +//! 3. Lz4 frame-format compression as an optional codec. +//! +//! The serialization uses the existing [`ContainerBytes`] protocol on +//! `Column`, so we get a single byte layout that both raw and compressed +//! paths share. See `doc/developer/design/20260504_pager.md` for background. 
+ +#![deny(missing_docs)] + +use std::io::{self, Read}; +use std::sync::Arc; + +use columnar::Columnar; +use lz4_flex::frame::{FrameDecoder, FrameEncoder}; +use mz_ore::pager::{self, Backend, Handle}; +use timely::bytes::arc::BytesMut; +use timely::dataflow::channels::ContainerBytes; + +use crate::columnar::Column; + +// --------------------------------------------------------------------------- +// Codec +// --------------------------------------------------------------------------- + +/// Compression codec applied to a paged-out column. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum Codec { + /// lz4 frame format (`lz4_flex::frame`). Self-delimiting, streams via + /// `io::Read`/`io::Write`, no random access. + Lz4, +} + +// --------------------------------------------------------------------------- +// Policy +// --------------------------------------------------------------------------- + +/// Inputs to a pageout decision. +#[derive(Copy, Clone, Debug)] +pub struct PageHint { + /// Uncompressed body size in bytes (matches [`ContainerBytes::length_in_bytes`]). + pub len_bytes: usize, +} + +/// Outcome of a policy decision. +#[derive(Copy, Clone, Debug)] +pub enum PageDecision { + /// Keep the column resident; no I/O, no compression. + Skip, + /// Page out using the given backend and (optionally) codec. + Page { + /// Pager backend to use. + backend: Backend, + /// Compression codec, or `None` for raw bytes. + codec: Option, + }, +} + +/// Notifications the column-pager sends back to the policy. Implementations +/// typically forward to metrics counters. +#[derive(Debug)] +pub enum PageEvent { + /// A successful pageout. `bytes_in` is the uncompressed body size, + /// `bytes_out` is the on-storage payload size (after compression). + PagedOut { + /// Uncompressed body size handed to the pager. + bytes_in: usize, + /// On-storage payload size after compression, excluding `u64` alignment padding. + bytes_out: usize, + /// Backend selected by the policy.
+ backend: Backend, + /// Codec selected by the policy. + codec: Option, + }, + /// A successful page-in. `bytes` is the uncompressed body size delivered to + /// the caller. + PagedIn { + /// Uncompressed body size delivered to the caller. + bytes: usize, + }, + /// A pageout failure surfaced via the underlying pager. + Failed { + /// Backend that produced the error. + backend: Backend, + /// Underlying I/O error. + err: io::Error, + }, +} + +/// Decides whether/how to page a column out, and records page events. +/// +/// Implementations carry their own state (counters, atomics, configuration) +/// via interior mutability. Methods take `&self` so a single policy can be +/// shared across operator threads. +pub trait PagingPolicy: Send + Sync { + /// Returns the action to take for a column with the given hint. + fn decide(&self, hint: PageHint) -> PageDecision; + /// Records a pageout/pagein/failure event for metrics or adaptive decisions. + fn record(&self, event: PageEvent); +} + +// --------------------------------------------------------------------------- +// Meta + PagedColumn +// --------------------------------------------------------------------------- + +/// Sizing metadata captured at pageout time. Stored alongside the payload so +/// `take` can size buffers. +#[derive(Clone, Debug)] +pub struct Meta { + /// Uncompressed body size in bytes. + pub len_bytes: usize, +} + +/// A column whose body may be resident, paged out, or paged out and compressed. +/// +/// Each variant corresponds to one of the [`PageDecision`] outcomes. +pub enum PagedColumn { + /// Body kept resident. Returned when the policy answered [`PageDecision::Skip`]. + Resident(Column), + /// Raw `ContainerBytes` payload stored via [`pager::Handle`]. The backend + /// (Swap or File) is baked into the handle. + Paged { + /// Pager handle owning the raw payload. + handle: Handle, + /// Sizing metadata. + meta: Meta, + }, + /// Lz4-framed serialized form. 
The framed bytes themselves may live in + /// memory or in the pager (see [`CompressedInner`]). + Compressed { + /// Where the framed bytes live. + inner: CompressedInner, + /// Sizing metadata. + meta: Meta, + }, +} + +/// Storage location for the lz4-framed bytes inside a compressed paged column. +pub enum CompressedInner { + /// Owned `Vec` held resident in the caller's address space. + Memory(Vec), + /// Framed bytes padded to a `u64` boundary and handed to the pager. The + /// frame trailer self-delimits, so the trailing pad is ignored on read. + Paged(Handle), +} + +// --------------------------------------------------------------------------- +// ColumnPager +// --------------------------------------------------------------------------- + +/// Pages typed [`Column`]s out and back in, driven by a [`PagingPolicy`]. +/// +/// Cheap to clone (it's an `Arc`). Hold one per operator if you want per-site +/// policy state, or share globally if you want one policy. +#[derive(Clone)] +pub struct ColumnPager { + policy: Arc, +} + +impl ColumnPager { + /// Constructs a column pager driven by `policy`. + pub fn new(policy: Arc) -> Self { + Self { policy } + } + + /// Drains `col` into a [`PagedColumn`]. After return `col` is left as a + /// fresh `Column::default()` (typed, empty), ready to be refilled by the + /// caller on the next loop iteration. + /// + /// Backend / codec semantics: + /// + /// * Uncompressed, [`Column::Align`]: the inner `Vec` is moved into + /// the pager handle with no copies. Swap backend keeps the allocation + /// resident; file backend writes it out and drops it. + /// * Uncompressed, other variants: the column is serialized via + /// [`ContainerBytes::into_bytes`] into a `Vec`, copied into a + /// u64-aligned `Vec`, then handed to the pager. + /// * Compressed: the column is serialized through a [`FrameEncoder`] + /// directly into the output buffer. No intermediate uncompressed + /// `Vec` is materialized.
+ pub fn page(&self, col: &mut Column) -> PagedColumn { + let len_bytes = col.length_in_bytes(); + let hint = PageHint { len_bytes }; + + let (backend, codec) = match self.policy.decide(hint) { + PageDecision::Skip => return PagedColumn::Resident(std::mem::take(col)), + PageDecision::Page { backend, codec } => (backend, codec), + }; + let meta = Meta { len_bytes }; + + match codec { + None => { + // Raw path: the body must end up as u64-aligned bytes for the + // pager. `Column::Align` already is; other variants are + // serialized and copied. + debug_assert_eq!(len_bytes % 8, 0); + let body: Vec = match std::mem::take(col) { + Column::Align(v) => v, + other => { + let mut buf = Vec::with_capacity(len_bytes); + other.into_bytes(&mut buf); + debug_assert_eq!(buf.len() % 8, 0); + bytemuck::allocation::pod_collect_to_vec::(&buf) + } + }; + let handle = pager::pageout_with(backend, &mut [body]); + self.policy.record(PageEvent::PagedOut { + bytes_in: len_bytes, + bytes_out: handle.len_bytes(), + backend, + codec: None, + }); + PagedColumn::Paged { handle, meta } + } + Some(Codec::Lz4) => { + // Stream serialized bytes straight into lz4 — no intermediate + // uncompressed `Vec`. + let mut out = Vec::with_capacity(len_bytes / 4); + { + let mut enc = FrameEncoder::new(&mut out); + col.into_bytes(&mut enc); + enc.finish().expect("lz4 finish into Vec is infallible"); + } + *col = Column::default(); + self.policy.record(PageEvent::PagedOut { + bytes_in: len_bytes, + bytes_out: out.len(), + backend, + codec: Some(Codec::Lz4), + }); + let inner = match backend { + Backend::Swap => CompressedInner::Memory(out), + Backend::File => { + let padded = pad_u8_to_u64(out); + let handle = pager::pageout_with(Backend::File, &mut [padded]); + CompressedInner::Paged(handle) + } + }; + PagedColumn::Compressed { inner, meta } + } + } + } + + /// Rehydrates `paged` into a [`Column`]. Consumes the handle and + /// reclaims its storage (file backend unlinks; swap backend drops the + /// `Vec`). 
+ pub fn take(&self, paged: PagedColumn) -> Column { + match paged { + PagedColumn::Resident(c) => c, + PagedColumn::Paged { handle, meta } => { + let mut body: Vec = Vec::with_capacity(handle.len()); + pager::take(handle, &mut body); + debug_assert_eq!(body.len() * 8, meta.len_bytes); + self.policy.record(PageEvent::PagedIn { + bytes: meta.len_bytes, + }); + Column::Align(body) + } + PagedColumn::Compressed { inner, meta } => { + let mut decoded = Vec::with_capacity(meta.len_bytes); + match inner { + CompressedInner::Memory(v) => { + FrameDecoder::new(&v[..]) + .read_to_end(&mut decoded) + .expect("lz4 decode from memory"); + } + CompressedInner::Paged(h) => { + let mut padded = Vec::with_capacity(h.len()); + pager::take(h, &mut padded); + let src: &[u8] = bytemuck::cast_slice(&padded); + FrameDecoder::new(src) + .read_to_end(&mut decoded) + .expect("lz4 decode from pager"); + } + } + debug_assert_eq!(decoded.len(), meta.len_bytes); + self.policy.record(PageEvent::PagedIn { + bytes: decoded.len(), + }); + // `BytesMut::from` wraps the `Vec` without copying; `freeze` + // produces the refcounted `Bytes` that `ContainerBytes` expects. + Column::from_bytes(BytesMut::from(decoded).freeze()) + } + } + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Reinterprets `bytes` as a `Vec` by trailing-zero padding to a multiple +/// of 8 and copying. The lz4 frame trailer self-delimits so the trailing pad is +/// invisible to [`FrameDecoder`]. +fn pad_u8_to_u64(mut bytes: Vec) -> Vec { + let pad = bytes.len().next_multiple_of(8) - bytes.len(); + if pad != 0 { + bytes.resize(bytes.len() + pad, 0); + } + debug_assert_eq!(bytes.len() % 8, 0); + // `Vec` and `Vec` have different layouts (size + align), so we + // can't transmute the allocation. Copy into a fresh, properly aligned + // `Vec`. The cost is one `len_bytes/8`-word memcpy per pageout. 
+ let len_u64s = bytes.len() / 8; + let mut out = vec![0u64; len_u64s]; + let dst: &mut [u8] = bytemuck::cast_slice_mut(&mut out); + dst.copy_from_slice(&bytes); + out +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +#[allow(clippy::clone_on_ref_ptr)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + + use columnar::Index; + use timely::container::PushInto; + + use super::*; + + /// Promotes a typed policy `Arc` to `Arc`. Hides the + /// unsize coercion behind a `clone()` so the trait object is constructed + /// without the now-discouraged `as` cast. + fn as_dyn(p: &Arc) -> Arc { + p.clone() + } + + /// Recording policy: configurable decision, counts events. + struct TestPolicy { + decision: PageDecision, + out: AtomicUsize, + r#in: AtomicUsize, + } + + impl TestPolicy { + fn new(decision: PageDecision) -> Arc { + Arc::new(Self { + decision, + out: AtomicUsize::new(0), + r#in: AtomicUsize::new(0), + }) + } + } + + impl PagingPolicy for TestPolicy { + fn decide(&self, _hint: PageHint) -> PageDecision { + self.decision + } + fn record(&self, event: PageEvent) { + match event { + PageEvent::PagedOut { .. } => { + self.out.fetch_add(1, Ordering::Relaxed); + } + PageEvent::PagedIn { .. } => { + self.r#in.fetch_add(1, Ordering::Relaxed); + } + PageEvent::Failed { .. } => {} + } + } + } + + /// Builds a sample typed column of `i64`s. + fn sample_typed() -> Column { + let mut col: Column = Default::default(); + for v in 0i64..1024 { + col.push_into(v); + } + col + } + + /// Drains a column into a `Vec` for comparison via `borrow`. 
+ fn collect_i64(col: &Column) -> Vec { + col.borrow().into_index_iter().copied().collect() + } + + #[mz_ore::test] + fn skip_policy_keeps_resident() { + let policy = TestPolicy::new(PageDecision::Skip); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample_typed(); + let paged = cp.page(&mut col); + assert!(matches!(paged, PagedColumn::Resident(_))); + let rt = cp.take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + assert_eq!(policy.out.load(Ordering::Relaxed), 0); + assert_eq!(policy.r#in.load(Ordering::Relaxed), 0); + } + + #[mz_ore::test] + fn round_trip_swap_uncompressed() { + let policy = TestPolicy::new(PageDecision::Page { + backend: Backend::Swap, + codec: None, + }); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample_typed(); + let paged = cp.page(&mut col); + assert!(matches!(paged, PagedColumn::Paged { .. })); + let rt = cp.take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + assert_eq!(policy.out.load(Ordering::Relaxed), 1); + assert_eq!(policy.r#in.load(Ordering::Relaxed), 1); + } + + #[mz_ore::test] + fn round_trip_swap_lz4() { + let policy = TestPolicy::new(PageDecision::Page { + backend: Backend::Swap, + codec: Some(Codec::Lz4), + }); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample_typed(); + let paged = cp.page(&mut col); + assert!(matches!( + paged, + PagedColumn::Compressed { + inner: CompressedInner::Memory(_), + .. + } + )); + let rt = cp.take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + } + + #[mz_ore::test] + fn round_trip_file_uncompressed() { + let dir = tempfile::tempdir().unwrap(); + pager::set_scratch_dir(dir.path().to_path_buf()); + let policy = TestPolicy::new(PageDecision::Page { + backend: Backend::File, + codec: None, + }); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample_typed(); + let paged = cp.page(&mut col); + assert!(matches!(paged, PagedColumn::Paged { .. 
})); + let rt = cp.take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + } + + #[mz_ore::test] + fn round_trip_file_lz4() { + let dir = tempfile::tempdir().unwrap(); + pager::set_scratch_dir(dir.path().to_path_buf()); + let policy = TestPolicy::new(PageDecision::Page { + backend: Backend::File, + codec: Some(Codec::Lz4), + }); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample_typed(); + let paged = cp.page(&mut col); + assert!(matches!( + paged, + PagedColumn::Compressed { + inner: CompressedInner::Paged(_), + .. + } + )); + let rt = cp.take(paged); + assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); + } + + #[mz_ore::test] + fn align_variant_fast_path() { + // Construct an Align column directly to exercise the move-only raw path. + let policy = TestPolicy::new(PageDecision::Page { + backend: Backend::Swap, + codec: None, + }); + let cp = ColumnPager::new(as_dyn(&policy)); + let body: Vec = (1u64..=512).collect(); + let mut col: Column = Column::Align(body.clone()); + let paged = cp.page(&mut col); + assert!(matches!(paged, PagedColumn::Paged { .. })); + // After paging an Align variant, `col` is reset to the typed default. + assert!(matches!(col, Column::Typed(_))); + let rt = cp.take(paged); + // Round-tripped column should produce identical bytes. 
+ match rt { + Column::Align(v) => assert_eq!(v, body), + other => panic!("expected Align, got {:?}", std::mem::discriminant(&other)), + } + } +} diff --git a/src/timely-util/src/lib.rs b/src/timely-util/src/lib.rs index 57474b58cea34..61f5801e86ca7 100644 --- a/src/timely-util/src/lib.rs +++ b/src/timely-util/src/lib.rs @@ -19,6 +19,7 @@ pub mod activator; pub mod antichain; pub mod builder_async; pub mod capture; +pub mod column_pager; pub mod columnar; pub mod columnation; pub mod containers; From aef6d53d0318885c3c0bb48d9a4e639c55c15bf6 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Thu, 14 May 2026 17:00:02 +0200 Subject: [PATCH 28/34] timely-util: tiered paging policy + drop-based release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `ResidentTicket`, a drop guard carried inside `PagedColumn::Resident` that fires a new `PageEvent::ResidentReleased { bytes }` when the resident column is consumed via `ColumnPager::take` or dropped without being taken. Lets policies track outstanding resident memory without leaking budget if a caller drops a column unexpectedly. Introduces `TieredPolicy` in `column_pager::policy`. Each Timely worker draws from a fixed per-worker byte budget; once exhausted it falls back to a process-wide shared pool, and only when both are full does it page out via a configured backend and codec. Per-worker state lives in a `thread_local!` static so worker threads see independent counters. This limits the design to one `TieredPolicy` per process — sufficient for the expected configuration, and the constraint is documented. Release order returns budget to the shared pool first so other workers unblock sooner. The shared pool is a single `AtomicUsize` consumed via a CAS loop; only the cold fallback path touches it. 
Co-Authored-By: Claude Opus 4.7 --- src/timely-util/src/column_pager.rs | 53 +++- src/timely-util/src/column_pager/policy.rs | 288 +++++++++++++++++++++ 2 files changed, 335 insertions(+), 6 deletions(-) create mode 100644 src/timely-util/src/column_pager/policy.rs diff --git a/src/timely-util/src/column_pager.rs b/src/timely-util/src/column_pager.rs index 1e1ca4076a796..e66c9a31a2f81 100644 --- a/src/timely-util/src/column_pager.rs +++ b/src/timely-util/src/column_pager.rs @@ -32,6 +32,8 @@ #![deny(missing_docs)] +pub mod policy; + use std::io::{self, Read}; use std::sync::Arc; @@ -109,6 +111,15 @@ pub enum PageEvent { /// Underlying I/O error. err: io::Error, }, + /// A resident column has been dropped. Fires from [`ResidentTicket::drop`] + /// when the [`PagedColumn::Resident`] holding the ticket is consumed by + /// [`ColumnPager::take`] or dropped without being taken. Policies use this + /// to return budget allocated when [`PagingPolicy::decide`] answered + /// [`PageDecision::Skip`]. + ResidentReleased { + /// Uncompressed body size returned to the policy. + bytes: usize, + }, } /// Decides whether/how to page a column out, and records page events. @@ -139,8 +150,12 @@ pub struct Meta { /// /// Each variant corresponds to one of the [`PageDecision`] outcomes. pub enum PagedColumn { - /// Body kept resident. Returned when the policy answered [`PageDecision::Skip`]. - Resident(Column), + /// Body kept resident. Returned when the policy answered + /// [`PageDecision::Skip`]. The accompanying [`ResidentTicket`] fires a + /// [`PageEvent::ResidentReleased`] when the variant is dropped or + /// consumed by [`ColumnPager::take`], so the policy can reclaim the + /// budget it granted in [`PagingPolicy::decide`]. + Resident(Column, ResidentTicket), /// Raw `ContainerBytes` payload stored via [`pager::Handle`]. The backend /// (Swap or File) is baked into the handle. 
Paged { @@ -159,6 +174,25 @@ pub enum PagedColumn { }, } +/// Drop guard that returns budget to a [`PagingPolicy`] when a +/// [`PagedColumn::Resident`] is destroyed. +/// +/// The ticket holds an `Arc` to the policy and the byte count it was charged +/// for at [`PagingPolicy::decide`] time. On drop it fires a +/// [`PageEvent::ResidentReleased`] event; the policy implementation decides +/// what to credit and where (local pool, shared pool, both). +pub struct ResidentTicket { + bytes: usize, + policy: Arc, +} + +impl Drop for ResidentTicket { + fn drop(&mut self) { + self.policy + .record(PageEvent::ResidentReleased { bytes: self.bytes }); + } +} + /// Storage location for the lz4-framed bytes inside a compressed paged column. pub enum CompressedInner { /// Owned `Vec` held resident in the caller's address space. @@ -207,7 +241,13 @@ impl ColumnPager { let hint = PageHint { len_bytes }; let (backend, codec) = match self.policy.decide(hint) { - PageDecision::Skip => return PagedColumn::Resident(std::mem::take(col)), + PageDecision::Skip => { + let ticket = ResidentTicket { + bytes: len_bytes, + policy: Arc::clone(&self.policy), + }; + return PagedColumn::Resident(std::mem::take(col), ticket); + } PageDecision::Page { backend, codec } => (backend, codec), }; let meta = Meta { len_bytes }; @@ -270,7 +310,8 @@ impl ColumnPager { /// `Vec`). pub fn take(&self, paged: PagedColumn) -> Column { match paged { - PagedColumn::Resident(c) => c, + // `_ticket` drops here and fires `PageEvent::ResidentReleased`. + PagedColumn::Resident(c, _ticket) => c, PagedColumn::Paged { handle, meta } => { let mut body: Vec = Vec::with_capacity(handle.len()); pager::take(handle, &mut body); @@ -382,7 +423,7 @@ mod tests { PageEvent::PagedIn { .. } => { self.r#in.fetch_add(1, Ordering::Relaxed); } - PageEvent::Failed { .. } => {} + PageEvent::ResidentReleased { .. } | PageEvent::Failed { .. 
} => {} } } } @@ -407,7 +448,7 @@ mod tests { let cp = ColumnPager::new(as_dyn(&policy)); let mut col = sample_typed(); let paged = cp.page(&mut col); - assert!(matches!(paged, PagedColumn::Resident(_))); + assert!(matches!(paged, PagedColumn::Resident(_, _))); let rt = cp.take(paged); assert_eq!(collect_i64(&rt), (0i64..1024).collect::>()); assert_eq!(policy.out.load(Ordering::Relaxed), 0); diff --git a/src/timely-util/src/column_pager/policy.rs b/src/timely-util/src/column_pager/policy.rs new file mode 100644 index 0000000000000..b631fcb115a05 --- /dev/null +++ b/src/timely-util/src/column_pager/policy.rs @@ -0,0 +1,288 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Concrete [`PagingPolicy`] implementations. +//! +//! Today: [`TieredPolicy`], a two-tier byte budget where each Timely worker +//! gets a fixed local budget and falls back to a process-wide shared pool when +//! the local pool is exhausted. + +use std::cell::RefCell; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use mz_ore::pager::Backend; + +use crate::column_pager::{Codec, PageDecision, PageEvent, PageHint, PagingPolicy}; + +/// A two-tier byte budget for resident columns. +/// +/// Each Timely worker thread draws first from a fixed per-worker pool of +/// `per_worker_budget` bytes. 
When a worker's pool is exhausted, it consults +/// the shared process-wide pool of `shared_budget` bytes (set at construction +/// via [`TieredPolicy::new`]). If both are full, [`PagingPolicy::decide`] +/// returns [`PageDecision::Page`] and the column is paged out via the +/// configured `backend` + `codec`. +/// +/// ## Per-worker state via thread-local storage +/// +/// Worker state lives in a `thread_local!` static, so each OS thread (= each +/// Timely worker, in current Materialize deployments) sees its own +/// `WorkerState`. This means **at most one `TieredPolicy` instance per +/// process** — a second instance would share the same `LOCAL` static and +/// corrupt the first instance's accounting. +/// +/// ## Release order +/// +/// On [`PageEvent::ResidentReleased`], the policy returns budget to the shared +/// pool first (so other workers unblock sooner), then to the local pool. +/// +/// ## Contention +/// +/// The shared pool is a single [`AtomicUsize`], touched only on the cold path +/// (local pool exhausted). One byte-granular CAS loop per `decide` call is fine at +/// current page granularity; switch to chunk reservations if profiles show contention. +pub struct TieredPolicy { + per_worker_budget: usize, + shared: AtomicUsize, + backend: Backend, + codec: Option, +} + +thread_local! { + static LOCAL: RefCell> = const { RefCell::new(None) }; +} + +/// Per-worker state. Initialized lazily on the first `with_local` call so the +/// `thread_local!` static doesn't need to know `per_worker_budget` up front. +#[derive(Debug)] +struct WorkerState { + /// Remaining bytes in the local pool. + remaining: usize, + /// Bytes the worker currently owes back to its local pool. + locally_owed: usize, + /// Bytes the worker currently owes back to the shared pool. + shared_owed: usize, +} + +impl TieredPolicy { + /// Constructs a tiered policy. Total budget is + /// `per_worker_budget * workers + shared_budget`. The first `decide` call + /// from each worker initializes that worker's local pool.
+ /// + /// `backend` and `codec` are used for the [`PageDecision::Page`] outcome + /// when both pools are exhausted. + pub fn new( + per_worker_budget: usize, + shared_budget: usize, + backend: Backend, + codec: Option, + ) -> Self { + Self { + per_worker_budget, + shared: AtomicUsize::new(shared_budget), + backend, + codec, + } + } + + /// Returns the current shared-pool remaining size in bytes. Useful for + /// metrics or tests. + pub fn shared_remaining(&self) -> usize { + self.shared.load(Ordering::Relaxed) + } + + fn with_local(&self, f: impl FnOnce(&mut WorkerState) -> R) -> R { + LOCAL.with(|cell| { + let mut borrow = cell.borrow_mut(); + let state = borrow.get_or_insert_with(|| WorkerState { + remaining: self.per_worker_budget, + locally_owed: 0, + shared_owed: 0, + }); + f(state) + }) + } +} + +impl PagingPolicy for TieredPolicy { + fn decide(&self, hint: PageHint) -> PageDecision { + self.with_local(|s| { + // Local pool first. + if s.remaining >= hint.len_bytes { + s.remaining -= hint.len_bytes; + s.locally_owed += hint.len_bytes; + return PageDecision::Skip; + } + // Shared pool fallback. + if try_consume(&self.shared, hint.len_bytes) { + s.shared_owed += hint.len_bytes; + return PageDecision::Skip; + } + // Both exhausted — page out. + PageDecision::Page { + backend: self.backend, + codec: self.codec, + } + }) + } + + fn record(&self, event: PageEvent) { + let PageEvent::ResidentReleased { bytes } = event else { + return; + }; + self.with_local(|s| { + // Return to shared first so other workers unblock sooner. 
+ let from_shared = bytes.min(s.shared_owed); + if from_shared > 0 { + s.shared_owed -= from_shared; + self.shared.fetch_add(from_shared, Ordering::Relaxed); + } + let to_local = bytes - from_shared; + if to_local > 0 { + debug_assert!( + s.locally_owed >= to_local, + "release exceeds locally_owed (releasing {to_local}, owed {})", + s.locally_owed, + ); + s.locally_owed -= to_local; + s.remaining += to_local; + } + }); + } +} + +/// Atomically subtracts `want` from `atomic` if at least `want` is available. +/// Returns `true` on success. +fn try_consume(atomic: &AtomicUsize, want: usize) -> bool { + let mut cur = atomic.load(Ordering::Relaxed); + loop { + if cur < want { + return false; + } + match atomic.compare_exchange_weak(cur, cur - want, Ordering::AcqRel, Ordering::Relaxed) { + Ok(_) => return true, + Err(actual) => cur = actual, + } + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use timely::container::PushInto; + + use crate::column_pager::{ColumnPager, PagedColumn}; + use crate::columnar::Column; + + use super::*; + + fn sample(n: i64) -> Column { + let mut c: Column = Default::default(); + for v in 0..n { + c.push_into(v); + } + c + } + + /// Promotes a typed policy `Arc` to `Arc` without + /// triggering `clippy::clone_on_ref_ptr` or `clippy::as_conversions`. + fn as_dyn(p: &Arc) -> Arc { + #[allow(clippy::clone_on_ref_ptr)] + p.clone() + } + + /// All allocations fit in the per-worker pool. + #[mz_ore::test] + fn fits_in_local() { + let policy = Arc::new(TieredPolicy::new(64 * 1024, 0, Backend::Swap, None)); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample(256); + let p = cp.page(&mut col); + assert!(matches!(p, PagedColumn::Resident(_, _))); + drop(p); // Drop fires ResidentReleased; budget returns. 
+ } + + /// Local pool exhausted, shared pool covers the rest. + #[mz_ore::test] + fn spills_to_shared() { + let policy = Arc::new(TieredPolicy::new(0, 64 * 1024, Backend::Swap, None)); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample(256); + let before = policy.shared_remaining(); + let p = cp.page(&mut col); + assert!(matches!(p, PagedColumn::Resident(_, _))); + let after = policy.shared_remaining(); + assert!(after < before, "shared pool should be consumed"); + drop(p); + assert_eq!( + policy.shared_remaining(), + before, + "release should refund the shared pool", + ); + } + + /// Both pools exhausted: pageout is forced. + #[mz_ore::test] + fn exhausted_pages_out() { + let policy = Arc::new(TieredPolicy::new(0, 0, Backend::Swap, None)); + let cp = ColumnPager::new(as_dyn(&policy)); + let mut col = sample(256); + let p = cp.page(&mut col); + assert!(matches!(p, PagedColumn::Paged { .. })); + } + + /// Local refill: a held Resident locks budget; dropping it frees space + /// for the next allocation. + #[mz_ore::test] + fn release_refills_local() { + let policy = Arc::new(TieredPolicy::new(4 * 1024, 0, Backend::Swap, None)); + let cp = ColumnPager::new(as_dyn(&policy)); + + // First allocation fits. + let mut col = sample(256); + let p1 = cp.page(&mut col); + assert!(matches!(p1, PagedColumn::Resident(_, _))); + + // Second allocation overflows local (no shared) -> page out. + let mut col2 = sample(256); + let p2 = cp.page(&mut col2); + assert!(matches!(p2, PagedColumn::Paged { .. })); + + // Releasing the first should refill local; a third allocation now + // fits resident again. 
+ drop(p1); + drop(p2); + let mut col3 = sample(256); + let p3 = cp.page(&mut col3); + assert!(matches!(p3, PagedColumn::Resident(_, _))); + } + + #[mz_ore::test] + fn try_consume_atomicity() { + let a = AtomicUsize::new(10); + assert!(try_consume(&a, 4)); + assert_eq!(a.load(Ordering::Relaxed), 6); + assert!(!try_consume(&a, 7)); + assert_eq!(a.load(Ordering::Relaxed), 6); + assert!(try_consume(&a, 6)); + assert_eq!(a.load(Ordering::Relaxed), 0); + assert!(!try_consume(&a, 1)); + } +} From 9054eba387a821bfe7be87009b9cfd4f2d145f29 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Thu, 14 May 2026 17:06:11 +0200 Subject: [PATCH 29/34] timely-util: criterion bench for column_pager Measures round-trip (`page` + `take`) and operator-loop (`page` with column reuse) throughput across three axes: column size (4 KiB, 256 KiB, 4 MiB), pager backend (Swap, File), and codec (uncompressed, lz4). 24 cases total, throughput reported in bytes/sec via Criterion's `Throughput`. Run with: cargo bench -p mz-timely-util --bench column_pager The bench uses an `AlwaysPage` stub policy so every iteration exercises the paging path rather than the resident fast path. Smoke-tested at 4 KiB/swap/raw at ~8.6 GiB/s on a development laptop, which is close to the underlying pager's memcpy ceiling and confirms the column-pager layer adds no measurable overhead at that size. 
Co-Authored-By: Claude Opus 4.7 --- src/timely-util/Cargo.toml | 4 + src/timely-util/benches/column_pager.rs | 159 ++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 src/timely-util/benches/column_pager.rs diff --git a/src/timely-util/Cargo.toml b/src/timely-util/Cargo.toml index 24f775de78ef0..6000b887ad64d 100644 --- a/src/timely-util/Cargo.toml +++ b/src/timely-util/Cargo.toml @@ -21,6 +21,10 @@ harness = false name = "columnar_merger" harness = false +[[bench]] +name = "column_pager" +harness = false + [dependencies] ahash.workspace = true bincode.workspace = true diff --git a/src/timely-util/benches/column_pager.rs b/src/timely-util/benches/column_pager.rs new file mode 100644 index 0000000000000..01ece21ff496c --- /dev/null +++ b/src/timely-util/benches/column_pager.rs @@ -0,0 +1,159 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Throughput benchmark for [`mz_timely_util::column_pager`]. +//! +//! Two shapes: +//! +//! * `rt`: round-trip cost of `page` immediately followed by `take` on a +//! freshly built column. Measures the full encode/decode path including +//! pager I/O and (for the lz4 axis) compression and decompression. +//! * `loop_`: operator-loop cost — refill an existing column then `page` it, +//! without `take`. Mirrors how a spill operator amortizes allocations and +//! measures the cheaper write-only side of the cycle. +//! +//! Axes: +//! +//! * Column size, in target uncompressed bytes: 4 KiB, 256 KiB, 4 MiB. +//! * Pager backend: `Swap`, `File`. +//! * Codec: uncompressed, lz4. +//! +//! Run with: +//! +//! 
cargo bench -p mz-timely-util --bench column_pager + +use std::sync::Arc; + +use criterion::{BatchSize, Criterion, Throughput, criterion_group, criterion_main}; +use mz_ore::pager::{self, Backend}; +use mz_timely_util::column_pager::{ + Codec, ColumnPager, PageDecision, PageEvent, PageHint, PagedColumn, PagingPolicy, +}; +use mz_timely_util::columnar::Column; +use timely::container::PushInto; +use timely::dataflow::channels::ContainerBytes; + +/// Stub policy that always returns the configured decision. Records nothing. +struct AlwaysPage { + backend: Backend, + codec: Option, +} + +impl PagingPolicy for AlwaysPage { + fn decide(&self, _hint: PageHint) -> PageDecision { + PageDecision::Page { + backend: self.backend, + codec: self.codec, + } + } + fn record(&self, _event: PageEvent) {} +} + +/// Builds a `Column` whose serialized byte size is approximately +/// `target_bytes`. The actual size is reported by [`ContainerBytes::length_in_bytes`] +/// and used for throughput accounting. +fn build_column(target_bytes: usize) -> Column { + // i64 typed columns serialize to roughly 8 bytes per element plus header + // overhead. Aim a touch high and trust `length_in_bytes` for accounting. 
+ let n = i64::try_from((target_bytes / 8).max(1)).expect("fits in i64"); + let mut c: Column = Default::default(); + for v in 0..n { + c.push_into(v); + } + c +} + +fn label(prefix: &str, target: usize, backend: Backend, codec: Option) -> String { + let size = match target { + n if n >= 1 << 20 => format!("{}MiB", n >> 20), + n if n >= 1 << 10 => format!("{}KiB", n >> 10), + n => format!("{n}B"), + }; + let codec = match codec { + None => "raw", + Some(Codec::Lz4) => "lz4", + }; + let backend = match backend { + Backend::Swap => "swap", + Backend::File => "file", + }; + format!("{prefix}/{size}/{backend}/{codec}") +} + +fn bench_round_trip(c: &mut Criterion, target: usize, backend: Backend, codec: Option) { + let policy: Arc = Arc::new(AlwaysPage { backend, codec }); + let cp = ColumnPager::new(policy); + let prototype = build_column(target); + let actual_bytes = prototype.length_in_bytes(); + + let mut group = c.benchmark_group("column_pager"); + group.throughput(Throughput::Bytes(u64::try_from(actual_bytes).unwrap())); + group.bench_function(label("rt", target, backend, codec), |b| { + b.iter_batched( + || build_column(target), + |mut col| { + let p = cp.page(&mut col); + let _ = cp.take(p); + }, + BatchSize::LargeInput, + ); + }); + group.finish(); +} + +fn bench_loop(c: &mut Criterion, target: usize, backend: Backend, codec: Option) { + let policy: Arc = Arc::new(AlwaysPage { backend, codec }); + let cp = ColumnPager::new(policy); + let prototype = build_column(target); + let actual_bytes = prototype.length_in_bytes(); + + let mut group = c.benchmark_group("column_pager"); + group.throughput(Throughput::Bytes(u64::try_from(actual_bytes).unwrap())); + group.bench_function(label("loop", target, backend, codec), |b| { + let mut col = build_column(target); + b.iter(|| { + // Operator loop: refill the column, then page it. Drop the paged + // result without `take`, simulating a write-only spill operator. 
+ if col.length_in_bytes() == 0 { + col = build_column(target); + } + let paged: PagedColumn = cp.page(&mut col); + // Refill before next iteration so the column carries data again. + col = build_column(target); + std::mem::drop(paged); + }); + }); + group.finish(); +} + +fn benches(c: &mut Criterion) { + // The File backend writes to a scratch directory chosen at process + // startup; tests do this via `tempfile`. For the bench we register an + // explicit per-process directory under the platform temp dir, + // `{temp_dir}/column-pager-bench-{pid}`, via `pager::set_scratch_dir`. + let scratch = std::env::temp_dir().join(format!("column-pager-bench-{}", std::process::id())); + let _ = std::fs::create_dir_all(&scratch); + pager::set_scratch_dir(scratch); + + let sizes = [4 * 1024, 256 * 1024, 4 * 1024 * 1024]; + let backends = [Backend::Swap, Backend::File]; + let codecs = [None, Some(Codec::Lz4)]; + + for &size in &sizes { + for &backend in &backends { + for &codec in &codecs { + bench_round_trip(c, size, backend, codec); + bench_loop(c, size, backend, codec); + } + } + } +} + +criterion_group!(column_pager_benches, benches); +criterion_main!(column_pager_benches); From e5f246dd03c59f3fc5dfe6bd18326c72fda34e7f Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Thu, 14 May 2026 17:39:23 +0200 Subject: [PATCH 30/34] timely-util: relabel swap-backend bench as swap-warm The pager's swap backend keeps the body Vec resident and hints MADV_COLD; the kernel evicts only under memory pressure. The column_pager bench round-trips one column at a time and never builds enough working set to trigger eviction, so swap-backend numbers measure the in-memory fast path (Vec move + bookkeeping), not the cost of a page-in from disk. Relabel the axis as `swap-warm` to make the distinction visible in every measurement name, and add a module-level caveat explaining what the numbers do and don't represent.
A follow-up `column_pager_pressure` bench under `systemd-run --user --scope -p MemoryMax=...` will exercise the real eviction path. Co-Authored-By: Claude Opus 4.7 --- src/timely-util/benches/column_pager.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/timely-util/benches/column_pager.rs b/src/timely-util/benches/column_pager.rs index 01ece21ff496c..8fbf5adfd9f53 100644 --- a/src/timely-util/benches/column_pager.rs +++ b/src/timely-util/benches/column_pager.rs @@ -24,6 +24,22 @@ //! * Pager backend: `Swap`, `File`. //! * Codec: uncompressed, lz4. //! +//! ## Caveat: swap backend numbers are the warm fast path +//! +//! The pager's swap backend keeps the body `Vec` resident and hints +//! `MADV_COLD` to the kernel. This bench round-trips one column at a time +//! and never accumulates enough working set to exceed system RAM, so the +//! kernel never actually evicts. Swap-backend results therefore measure +//! `pageout = move-Vec-into-handle` and `take = move-Vec-out` plus +//! bookkeeping — essentially memcpy at the configured size — not the real +//! cost of a page-in from disk under memory pressure. +//! +//! To distinguish the cases, swap-backend results are labelled +//! `swap-warm` rather than `swap`. A separate `column_pager_pressure` +//! bench (TODO) will hold many paged handles alive under a constrained +//! cgroup (`systemd-run --user --scope -p MemoryMax=...`) so the kernel +//! is forced to evict, and time `take` on a cold handle. +//! //! Run with: //! //! cargo bench -p mz-timely-util --bench column_pager @@ -79,8 +95,12 @@ fn label(prefix: &str, target: usize, backend: Backend, codec: Option) -> None => "raw", Some(Codec::Lz4) => "lz4", }; + // `swap-warm` flags that this measures the in-memory fast path: the + // bench never builds enough working set to push the system into actual + // swap eviction, so swap-backend numbers reflect pageout/pagein as + // memcpy + bookkeeping, not kernel paging cost. 
See module docs. let backend = match backend { - Backend::Swap => "swap", + Backend::Swap => "swap-warm", Backend::File => "file", }; format!("{prefix}/{size}/{backend}/{codec}") From 898640bc5d21def8db3d4be496e10f92d7a56dbf Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 19 May 2026 11:04:03 -0400 Subject: [PATCH 31/34] cargo: add lz4_flex workspace dep Declares lz4_flex in `[workspace.dependencies]` so `mz-timely-util`'s `lz4_flex.workspace = true` resolves. Required by the column-paged merge batcher's optional lz4 codec; the dep was referenced before being declared and broke workspace loading. --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 1f71ea92e6c7b..0807e21b8a8d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -385,6 +385,7 @@ launchdarkly-server-sdk = { version = "2.6.2", default-features = false } lgalloc = "0.6.0" libc = "0.2.184" lru = "0.16.3" +lz4_flex = { version = "0.12.1", default-features = false, features = ["frame"] } maplit = "1.0.2" mappings = "0.7.2" md-5 = "0.10.6" From 199d91ad0032f332cd53c13c090f5d50ea1fa6f1 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Mon, 18 May 2026 11:54:15 -0400 Subject: [PATCH 32/34] compute: column-paged merge batcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a Materialize-private merge-batcher that routes per-chunk transient state through `ColumnPager`, bounding the resident-bytes peak under memory pressure. Behind `enable_column_paged_batcher` (default off). Three building blocks in `mz-timely-util`: * `ColumnMergeBatcher` + `merge_chains` + `extract_chain` in `columnar/merge_batcher.rs` — chains hold `PagedColumn` entries that resolve to disk on demand. Reuses the existing `Column::merge_from` / `Column::extract` building blocks. * `BuilderInput for Column<((K, V), T, R)>` so DD `OrdValBuilder` can consume the batcher's output without a container conversion. 
* `column_pager` gains a process-global pager singleton (matching the lower-level pager's global-atomic design) and a per-decision skip/page counter for diagnostics. Compute integration: * `RowRowColPagedBuilder` alias + `PartialEq<&RowRef> for DatumSeq` / `PushInto<&RowRef> for DatumContainer` so the Row-keyed arrange path type-checks. * Worker init in `apply_worker_config` reads three new dyncfgs and installs the process-global pager: `enable_column_paged_batcher` (on/off), `column_paged_batcher_backend` (`swap` | `file`), `column_paged_batcher_budget_fraction` (fraction of replica memory, default 5%). Per-worker / shared pool sizes derive from `memory_limiter::get_memory_limit` with sensible floors and caps. * Two arrange call sites switched to the paged path: `render/context.rs::arrange_collection` (central ArrangeBy) and `render/join/linear_join.rs::JoinStage`. Other arrange sites (logging) left on the legacy `ColInternalMerger` path. Also extends `Materialized` and `Clusterd` mzcompose services to accept `memory_swap` and `mem_swappiness`, so callers can configure container-level swap behavior independent of the batcher. 
--- .../mzcompose/services/clusterd.py | 13 + .../mzcompose/services/materialized.py | 11 + src/compute-types/src/dyncfgs.rs | 48 + src/compute/src/compute_state.rs | 69 ++ src/compute/src/render/context.rs | 8 +- src/compute/src/render/join/linear_join.rs | 8 +- src/compute/src/row_spine.rs | 66 +- src/timely-util/src/column_pager.rs | 106 +- src/timely-util/src/columnar.rs | 17 + src/timely-util/src/columnar/batcher.rs | 13 +- src/timely-util/src/columnar/builder_input.rs | 111 +++ src/timely-util/src/columnar/merge_batcher.rs | 939 ++++++++++++++++++ 12 files changed, 1392 insertions(+), 17 deletions(-) create mode 100644 src/timely-util/src/columnar/builder_input.rs create mode 100644 src/timely-util/src/columnar/merge_batcher.rs diff --git a/misc/python/materialize/mzcompose/services/clusterd.py b/misc/python/materialize/mzcompose/services/clusterd.py index e07ca490a5355..f05a465a46cf4 100644 --- a/misc/python/materialize/mzcompose/services/clusterd.py +++ b/misc/python/materialize/mzcompose/services/clusterd.py @@ -24,6 +24,8 @@ def __init__( environment_id: str | None = None, environment_extra: list[str] = [], memory: str | None = None, + memory_swap: str | None = None, + mem_swappiness: int | None = None, cpu: str | None = None, options: list[str] = [], restart: str = "no", @@ -93,6 +95,17 @@ def __init__( limits["cpus"] = cpu config["deploy"] = {"resources": {"limits": limits}} + # Swap controls aren't part of compose's `deploy.resources` schema; they + # live as top-level compose v2 service keys (`memswap_limit`, + # `mem_swappiness`). Setting `memswap_limit > mem_limit` enables the + # container to use host swap when RAM pressure builds, which lets the + # kernel page out anonymous memory rather than OOM-killing. Useful for + # benchmarking "OS swap" as a baseline vs application-managed spill. 
+ if memory_swap is not None: + config["memswap_limit"] = memory_swap + if mem_swappiness is not None: + config["mem_swappiness"] = mem_swappiness + config.update( { "command": options, diff --git a/misc/python/materialize/mzcompose/services/materialized.py b/misc/python/materialize/mzcompose/services/materialized.py index 0be2b747811b6..9549e0d2fcb29 100644 --- a/misc/python/materialize/mzcompose/services/materialized.py +++ b/misc/python/materialize/mzcompose/services/materialized.py @@ -71,6 +71,8 @@ def __init__( volumes_extra: list[str] = [], depends_on: list[str] = [], memory: str | None = None, + memory_swap: str | None = None, + mem_swappiness: int | None = None, cpu: str | None = None, options: list[str] = [], persist_blob_url: str | None = None, @@ -332,6 +334,15 @@ def __init__( limits["cpus"] = cpu config["deploy"] = {"resources": {"limits": limits}} + # Swap controls live as top-level compose v2 service keys, not under + # `deploy.resources`. `memswap_limit > mem_limit` lets the container use + # host swap so the kernel can page out anonymous memory rather than OOM. + # `mem_swappiness=100` biases the kernel toward swapping aggressively. 
+ if memory_swap is not None: + config["memswap_limit"] = memory_swap + if mem_swappiness is not None: + config["mem_swappiness"] = mem_swappiness + if sanity_restart: # Workaround for https://github.com/docker/compose/issues/11133 config["labels"] = {"sanity_restart": True} diff --git a/src/compute-types/src/dyncfgs.rs b/src/compute-types/src/dyncfgs.rs index 70235d9ddf2e5..68009ff048bc1 100644 --- a/src/compute-types/src/dyncfgs.rs +++ b/src/compute-types/src/dyncfgs.rs @@ -23,6 +23,51 @@ pub const ENABLE_HALF_JOIN2: Config = Config::new( "Whether compute should use `half_join2` rather than DD's `half_join` to render delta joins.", ); +/// Install the column-pageable merge batcher on each compute worker, so +/// arrangements that route through it can spill chunks under memory +/// pressure rather than holding them all resident. Disabled by default; +/// the budget/backend knobs below tune the behavior when enabled. +pub const ENABLE_COLUMN_PAGED_BATCHER: Config = Config::new( + "enable_column_paged_batcher", + false, + "Install the column-paged merge batcher on each compute worker so it can spill under memory \ + pressure.", +); + +/// Total resident-byte budget the column-paged batcher's +/// [`TieredPolicy`](mz_timely_util::column_pager::policy::TieredPolicy) +/// is allowed to hold across all workers in this process, expressed as +/// a fraction of the replica's announced memory limit. Workers split +/// this between a per-worker local pool and a process-wide shared pool; +/// values beyond either pool spill to the configured backend. +/// +/// `0.05` (5%) is a reasonable starting point: large enough that the +/// per-call ColumnBuilder ship-threshold (~2 MiB) fits multiple chunks +/// per worker, small enough that the merge-batcher's transient state +/// doesn't crowd out the spine. Set lower to spill more aggressively +/// under pressure; set `0.0` to spill on every chunk (sanity check only). +/// Ignored when `enable_column_paged_batcher` is `false`. 
+pub const COLUMN_PAGED_BATCHER_BUDGET_FRACTION: Config = Config::new( + "column_paged_batcher_budget_fraction", + 0.05, + "Fraction of replica memory the column-paged batcher's tiered policy may hold resident \ + before spilling to the backend. Total budget = mem_limit * fraction; split 1/8 per-worker \ + local (clamped 16-64 MiB) and 7/8 shared (clamped 128 MiB - 1 GiB).", +); + +/// Backend to which the column-paged batcher spills chunks once both the +/// per-worker and shared budgets are exhausted. `"swap"` keeps the bytes +/// in process memory (the OS swap subsystem may page them out under +/// pressure); `"file"` writes them to compute's scratch directory under +/// our control. The file backend requires `--scratch-directory` to be +/// configured on clusterd; if absent, the worker falls back to swap and +/// logs a warning. Ignored when `enable_column_paged_batcher` is `false`. +pub const COLUMN_PAGED_BATCHER_BACKEND: Config<&str> = Config::new( + "column_paged_batcher_backend", + "swap", + "Backend for column-paged batcher spills: \"swap\" or \"file\".", +); + /// Whether rendering should use `mz_join_core` rather than DD's `JoinCore::join_core`. pub const ENABLE_MZ_JOIN_CORE: Config = Config::new( "enable_mz_join_core", @@ -424,4 +469,7 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet { .add(&COMPUTE_PROMETHEUS_INTROSPECTION_SCRAPE_INTERVAL) .add(&SUBSCRIBE_SNAPSHOT_OPTIMIZATION) .add(&MV_SINK_ADVANCE_PERSIST_FRONTIERS) + .add(&ENABLE_COLUMN_PAGED_BATCHER) + .add(&COLUMN_PAGED_BATCHER_BUDGET_FRACTION) + .add(&COLUMN_PAGED_BATCHER_BACKEND) } diff --git a/src/compute/src/compute_state.rs b/src/compute/src/compute_state.rs index 849b40c7ba306..3d9ab30df383c 100644 --- a/src/compute/src/compute_state.rs +++ b/src/compute/src/compute_state.rs @@ -295,6 +295,75 @@ impl ComputeState { std::sync::atomic::Ordering::Relaxed, ); + // Install / replace the column-paged batcher's process-global pager. 
+ // Reads on every config apply so changing the dyncfg at runtime takes + // effect on the next merge-batcher chunk. All compute workers in + // this process share one pager; per-worker budget bookkeeping lives + // inside `TieredPolicy`'s own thread-local accounting (see policy.rs). + if ENABLE_COLUMN_PAGED_BATCHER.get(config) { + use mz_ore::pager::{self, Backend}; + use mz_timely_util::column_pager::policy::TieredPolicy; + use mz_timely_util::column_pager::{ColumnPager, set_global_pager}; + use std::sync::Arc; + + // Budget derivation: fraction × announced memory limit, split + // 1/8 per-worker (clamped 16-64 MiB) and 7/8 shared (clamped + // 128 MiB - 1 GiB). Clamps cushion against fraction misconfigs: + // floors prevent per-chunk pageout in the no-pressure case; + // ceilings prevent the batcher from hoarding RAM the spine + // could use on big-memory replicas. Falls back to a 4 GiB + // assumption if no limit was announced (e.g. dev environments). + const MIB: usize = 1024 * 1024; + const DEFAULT_MEM_LIMIT: usize = 4 * 1024 * MIB; + let mem_limit = crate::memory_limiter::get_memory_limit() + .unwrap_or(DEFAULT_MEM_LIMIT); + let fraction = COLUMN_PAGED_BATCHER_BUDGET_FRACTION.get(config).max(0.0); + let total = ((mem_limit as f64) * fraction) as usize; + let per_worker = (total / 8).clamp(16 * MIB, 64 * MIB); + let shared = total.saturating_sub(per_worker).clamp(128 * MIB, 1024 * MIB); + + let backend_str = COLUMN_PAGED_BATCHER_BACKEND.get(config); + let backend = match backend_str.as_str() { + "file" => { + if let Some(path) = &self.context.scratch_directory { + // `set_scratch_dir` is process-wide and idempotent + // (per-process subdir under the given root), so calling + // it from every worker on every config apply is safe. 
+ pager::set_scratch_dir(path.clone()); + Backend::File + } else { + warn!( + "column-paged batcher requested file backend but \ + scratch-directory is unset; falling back to swap" + ); + Backend::Swap + } + } + "swap" => Backend::Swap, + other => { + warn!( + backend = %other, + "unknown column_paged_batcher_backend; using swap" + ); + Backend::Swap + } + }; + info!( + ?backend, + fraction, + mem_limit, + per_worker_bytes = per_worker, + shared_bytes = shared, + "column-paged batcher: installing tiered pager", + ); + let policy = Arc::new(TieredPolicy::new(per_worker, shared, backend, None)); + set_global_pager(ColumnPager::new(policy)); + } else { + use mz_timely_util::column_pager::{ColumnPager, set_global_pager}; + info!("column-paged batcher: disabled, installing no-op pager"); + set_global_pager(ColumnPager::disabled()); + } + // Remember the maintenance interval locally to avoid reading it from the config set on // every server iteration. self.server_maintenance_interval = COMPUTE_SERVER_MAINTENANCE_INTERVAL.get(config); diff --git a/src/compute/src/render/context.rs b/src/compute/src/render/context.rs index 5ba42239c74b6..e59407a5f4b39 100644 --- a/src/compute/src/render/context.rs +++ b/src/compute/src/render/context.rs @@ -31,7 +31,7 @@ use mz_repr::fixed_length::ToDatumIter; use mz_repr::{DatumVec, DatumVecBorrow, Diff, GlobalId, Row, RowArena, SharedRow}; use mz_storage_types::controller::CollectionMetadata; use mz_timely_util::columnar::builder::ColumnBuilder; -use mz_timely_util::columnar::{Col2ValBatcher, columnar_exchange}; +use mz_timely_util::columnar::{Col2ValPagedBatcher, columnar_exchange}; use timely::ContainerBuilder; use timely::container::{CapacityContainerBuilder, PushInto}; use timely::dataflow::channels::pact::{ExchangeCore, Pipeline}; @@ -46,7 +46,7 @@ use crate::compute_state::ComputeState; use crate::extensions::arrange::{KeyCollection, MzArrange, MzArrangeCore}; use crate::render::errors::{DataflowErrorSer, ErrorLogger}; use 
crate::render::{LinearJoinSpec, MaybeBucketByTime, RenderTimestamp}; -use crate::row_spine::{DatumSeq, RowRowBuilder}; +use crate::row_spine::{DatumSeq, RowRowColPagedBuilder}; use crate::typedefs::{ ErrAgent, ErrBatcher, ErrBuilder, ErrEnter, ErrSpine, RowRowAgent, RowRowEnter, RowRowSpine, }; @@ -1148,8 +1148,8 @@ impl<'scope, T: RenderTimestamp> CollectionBundle<'scope, T> { let oks = ok_stream .mz_arrange_core::< _, - Col2ValBatcher<_, _, _, _>, - RowRowBuilder<_, _>, + Col2ValPagedBatcher<_, _, _, _>, + RowRowColPagedBuilder<_, _>, RowRowSpine<_, _>, >( ExchangeCore::, _>::new_core( diff --git a/src/compute/src/render/join/linear_join.rs b/src/compute/src/render/join/linear_join.rs index 285d1d90c7285..2078179dbb451 100644 --- a/src/compute/src/render/join/linear_join.rs +++ b/src/compute/src/render/join/linear_join.rs @@ -25,7 +25,7 @@ use mz_dyncfg::ConfigSet; use mz_repr::fixed_length::ToDatumIter; use mz_repr::{DatumVec, Diff, Row, RowArena, SharedRow}; use mz_timely_util::columnar::builder::ColumnBuilder; -use mz_timely_util::columnar::{Col2ValBatcher, columnar_exchange}; +use mz_timely_util::columnar::{Col2ValPagedBatcher, columnar_exchange}; use mz_timely_util::operator::{CollectionExt, StreamExt}; use timely::dataflow::Scope; use timely::dataflow::channels::pact::{ExchangeCore, Pipeline}; @@ -36,7 +36,7 @@ use crate::render::RenderTimestamp; use crate::render::context::{ArrangementFlavor, CollectionBundle, Context}; use crate::render::errors::DataflowErrorSer; use crate::render::join::mz_join_core::mz_join_core; -use crate::row_spine::{RowRowBuilder, RowRowSpine}; +use crate::row_spine::{RowRowColPagedBuilder, RowRowSpine}; use crate::typedefs::{RowRowAgent, RowRowEnter}; /// Available linear join implementations. 
@@ -383,8 +383,8 @@ where let arranged = keyed .mz_arrange_core::< _, - Col2ValBatcher<_, _, _, _>, - RowRowBuilder<_, _>, + Col2ValPagedBatcher<_, _, _, _>, + RowRowColPagedBuilder<_, _>, RowRowSpine<_, _>, >( ExchangeCore::, _>::new_core( diff --git a/src/compute/src/row_spine.rs b/src/compute/src/row_spine.rs index 90ada7a7386de..c731f39aabc95 100644 --- a/src/compute/src/row_spine.rs +++ b/src/compute/src/row_spine.rs @@ -11,8 +11,8 @@ pub use self::container::DatumContainer; pub use self::container::DatumSeq; pub use self::offset_opt::OffsetOptimized; pub use self::spines::{ - RowBatcher, RowBuilder, RowRowBatcher, RowRowBuilder, RowRowSpine, RowSpine, RowValBatcher, - RowValBuilder, RowValSpine, + RowBatcher, RowBuilder, RowRowBatcher, RowRowBuilder, RowRowColPagedBuilder, RowRowSpine, + RowSpine, RowValBatcher, RowValBuilder, RowValSpine, }; use differential_dataflow::trace::implementations::OffsetList; @@ -28,6 +28,7 @@ mod spines { use differential_dataflow::trace::implementations::spine_fueled::Spine; use differential_dataflow::trace::rc_blanket_impls::RcBuilder; use mz_repr::Row; + use mz_timely_util::columnar::Column; use mz_timely_util::columnation::ColumnationStack; use crate::row_spine::{DatumContainer, OffsetOptimized}; @@ -39,6 +40,14 @@ mod spines { OrdValBuilder, ColumnationStack<((Row, Row), T, R)>>, >; + /// `RowRowBuilder` variant that consumes [`Column`] chunks. Pairs with + /// [`Col2ValPagedBatcher`] for the spillable arrange path. + /// + /// [`Col2ValPagedBatcher`]: mz_timely_util::columnar::Col2ValPagedBatcher + pub type RowRowColPagedBuilder = RcBuilder< + OrdValBuilder, Column<((Row, Row), T, R)>>, + >; + pub type RowValSpine = Spine>>>; pub type RowValBatcher = KeyValBatcher; pub type RowValBuilder = RcBuilder< @@ -97,6 +106,45 @@ mod spines { } } +#[cfg(test)] +mod bounds_checks { + //! Compile-time bound checks: ensure the new paged-batcher/builder pair + //! actually resolves the `Builder` / `Batcher` trait bounds together. + //! 
These functions are never called; if the bounds drift, the build + //! fails here instead of at the consumer call site. + + use differential_dataflow::trace::{Batcher, Builder}; + use mz_repr::Diff; + use mz_timely_util::columnar::Col2ValPagedBatcher; + + use super::RowRowColPagedBuilder; + + type T = mz_repr::Timestamp; + + #[allow(dead_code)] + fn assert_builder_resolves() { + fn require_builder() {} + require_builder::>(); + } + + #[allow(dead_code)] + fn assert_batcher_output_matches_builder_input() { + // `Batcher::Output` and `Builder::Input` must agree for + // `mz_arrange_core` to type-check. Both should be + // `Column<((Row, Row), T, R)>`. + fn require_match() + where + Ba: Batcher, + Bu: Builder, + { + } + require_match::< + Col2ValPagedBatcher, + RowRowColPagedBuilder, + >(); + } +} + /// A `Row`-specialized container using dictionary compression. mod container { @@ -105,7 +153,7 @@ mod container { use differential_dataflow::trace::implementations::BatchContainer; use timely::container::PushInto; - use mz_repr::{Datum, Row, RowPacker, read_datum}; + use mz_repr::{Datum, Row, RowPacker, RowRef, read_datum}; use super::bytes_container::BytesContainer; @@ -205,6 +253,12 @@ mod container { } } + impl PushInto<&RowRef> for DatumContainer { + fn push_into(&mut self, item: &RowRef) { + self.bytes.push_into(item.data()) + } + } + #[derive(Debug)] pub struct DatumSeq<'a> { bytes: &'a [u8], @@ -247,6 +301,12 @@ mod container { self.bytes.eq(other.data()) } } + impl<'a> PartialEq<&RowRef> for DatumSeq<'a> { + #[inline] + fn eq(&self, other: &&RowRef) -> bool { + self.bytes.eq(other.data()) + } + } impl<'a> Eq for DatumSeq<'a> {} impl<'a, 'b> PartialOrd> for DatumSeq<'b> { #[inline] diff --git a/src/timely-util/src/column_pager.rs b/src/timely-util/src/column_pager.rs index e66c9a31a2f81..935778d5434cc 100644 --- a/src/timely-util/src/column_pager.rs +++ b/src/timely-util/src/column_pager.rs @@ -35,7 +35,7 @@ pub mod policy; use std::io::{self, Read}; -use 
std::sync::Arc; +use std::sync::{Arc, LazyLock, RwLock}; use columnar::Columnar; use lz4_flex::frame::{FrameDecoder, FrameEncoder}; @@ -221,6 +221,104 @@ impl ColumnPager { Self { policy } } + /// Constructs a pager that never pages out: every [`page`] returns a + /// [`PagedColumn::Resident`] whose ticket discards release events. Useful + /// as a default when callers want a placeholder pager before injecting a + /// real policy. + /// + /// [`page`]: ColumnPager::page + pub fn disabled() -> Self { + Self::new(Arc::new(AlwaysResidentPolicy)) + } +} + +/// Policy that keeps every column resident and discards events. Backs +/// [`ColumnPager::disabled`]. +struct AlwaysResidentPolicy; + +impl PagingPolicy for AlwaysResidentPolicy { + fn decide(&self, _hint: PageHint) -> PageDecision { + PageDecision::Skip + } + fn record(&self, _event: PageEvent) {} +} + +// --------------------------------------------------------------------------- +// Process-global pager +// --------------------------------------------------------------------------- +// +// Following the pager design doc's spirit (`doc/developer/design/20260504_pager.md`): +// "the cluster runs on swap or file, not both at once; a global atomic +// encodes that operational reality directly. A per-pager design would +// either duplicate the global flag at the struct level or invite confusion +// about which configuration wins." +// +// The lower-level `mz_ore::pager` already uses a global atomic for backend +// selection. This module's policy/budget layer mirrors that shape: one +// `ColumnPager` per process, swapped atomically when the controller changes +// the configuration. Merge batchers clone the `Arc` inside on use; live +// reinstalls take effect on the next call without per-thread coordination. + +/// Process-global active pager. Defaults to [`ColumnPager::disabled`] +/// until worker init calls [`set_global_pager`]. 
+static GLOBAL_PAGER: LazyLock> = + LazyLock::new(|| RwLock::new(ColumnPager::disabled())); + +/// Install `pager` as the process-wide active pager. Subsequent +/// [`global_pager`] calls return a clone of this value across all threads. +/// +/// Worker init calls this on every config apply; each clusterd worker +/// thread in the same process sees the same `worker_config`, so the +/// repeated installs are idempotent (same `Arc`-equivalent state). Any +/// [`PagedColumn`]s already in flight keep their own `Arc` clone, so a reinstall doesn't invalidate handles. +pub fn set_global_pager(pager: ColumnPager) { + *GLOBAL_PAGER.write().expect("global pager poisoned") = pager; +} + +/// Process-wide decision counters. Diagnostic only — log a summary every +/// `DECISION_LOG_INTERVAL` Page decisions so we can tell whether the +/// pager is actually engaging without per-call log spam. +static SKIP_COUNT: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); +static PAGE_COUNT: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); +static SKIP_BYTES: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); +static PAGE_BYTES: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); +const DECISION_LOG_INTERVAL: usize = 1024; + +fn record_decision(paged: bool, bytes: usize) { + use std::sync::atomic::Ordering; + if paged { + let n = PAGE_COUNT.fetch_add(1, Ordering::Relaxed) + 1; + PAGE_BYTES.fetch_add(bytes, Ordering::Relaxed); + if n.is_multiple_of(DECISION_LOG_INTERVAL) { + let s = SKIP_COUNT.load(Ordering::Relaxed); + let sb = SKIP_BYTES.load(Ordering::Relaxed); + let pb = PAGE_BYTES.load(Ordering::Relaxed); + tracing::info!( + skip_calls = s, + skip_bytes = sb, + page_calls = n, + page_bytes = pb, + "column-pager: decision rate" + ); + } + } else { + SKIP_COUNT.fetch_add(1, Ordering::Relaxed); + SKIP_BYTES.fetch_add(bytes, Ordering::Relaxed); + } +} + +/// Returns the current global pager. 
Cheap: clones the inner `Arc`. +pub fn global_pager() -> ColumnPager { + GLOBAL_PAGER + .read() + .expect("global pager poisoned") + .clone() +} + +impl ColumnPager { + /// Drains `col` into a [`PagedColumn`]. After return `col` is left as a /// fresh `Column::default()` (typed, empty), ready to be refilled by the /// caller on the next loop iteration. @@ -242,13 +340,17 @@ impl ColumnPager { let (backend, codec) = match self.policy.decide(hint) { PageDecision::Skip => { + record_decision(false, len_bytes); let ticket = ResidentTicket { bytes: len_bytes, policy: Arc::clone(&self.policy), }; return PagedColumn::Resident(std::mem::take(col), ticket); } - PageDecision::Page { backend, codec } => (backend, codec), + PageDecision::Page { backend, codec } => { + record_decision(true, len_bytes); + (backend, codec) + } }; let meta = Meta { len_bytes }; diff --git a/src/timely-util/src/columnar.rs b/src/timely-util/src/columnar.rs index 2b632006e8a84..53e12ba489581 100644 --- a/src/timely-util/src/columnar.rs +++ b/src/timely-util/src/columnar.rs @@ -19,7 +19,9 @@ pub mod batcher; pub mod builder; +pub mod builder_input; pub mod consolidate; +pub mod merge_batcher; use std::hash::Hash; @@ -46,6 +48,21 @@ pub type Col2ValBatcher = MergeBatcher< /// A batcher for columnar storage with unit values. pub type Col2KeyBatcher = Col2ValBatcher; +/// Pageable counterpart to [`Col2ValBatcher`]. Routes every chunk produced +/// by chunking, merging, or extract through a [`ColumnPager`], so memory +/// pressure can spill chains to a backing store without touching the merge / +/// extract bodies. +/// +/// Drop-in shape at the type level: both aliases take `(K, V, T, R)` and +/// produce a `Batcher, Output = Column<((K, +/// V), T, R)>>`. Call sites can swap with `cargo fix`–style renaming once +/// downstream `Trace`/`Builder` impls have been wired up. 
The pager itself +/// defaults to [`ColumnPager::disabled`]; inject a real one via +/// [`merge_batcher::ColumnMergeBatcher::set_pager`]. +pub type Col2ValPagedBatcher = merge_batcher::ColumnMergeBatcher<(K, V), T, R>; +/// Pageable counterpart to [`Col2KeyBatcher`]. +pub type Col2KeyPagedBatcher = Col2ValPagedBatcher; + /// A container based on a columnar store, encoded in aligned bytes. /// /// The type can represent typed data, bytes from Timely, or an aligned allocation. The name diff --git a/src/timely-util/src/columnar/batcher.rs b/src/timely-util/src/columnar/batcher.rs index 6bad47e20681f..c76b131f98e04 100644 --- a/src/timely-util/src/columnar/batcher.rs +++ b/src/timely-util/src/columnar/batcher.rs @@ -314,9 +314,9 @@ impl Default for ColumnMerger { /// so the merger can call them without going through any wrapper indirection. impl Column<(D, T, R)> where - D: Columnar + Default, + D: Columnar, for<'a> columnar::Ref<'a, D>: Copy + Ord, - T: Columnar + Default + Clone + PartialOrder, + T: Columnar + Clone + PartialOrder, for<'a> columnar::Ref<'a, T>: Copy + Ord, R: Columnar + Default + Semigroup + for<'a> Semigroup>, for<'a> <(D, T, R) as Columnar>::Container: columnar::Push<&'a (D, T, R)>, @@ -572,7 +572,6 @@ where let self_view = self.borrow(); let len = self_view.len(); - let mut owned_t = T::default(); // Yield to the framework when either output buffer reaches the // ship threshold, so it can ship a full chunk and hand back a // fresh one. Required by the merger's extract contract: the @@ -585,7 +584,13 @@ where && !crate::columnar::at_serialized_capacity(&ship_c.borrow()) { let (_, time, _) = self_view.get(*position); - T::copy_from(&mut owned_t, time); + // `into_owned` rather than `default() + copy_from(time)` so we + // don't require `T: Default` on the impl bound — render + // timestamps don't have it. 
For variable-length `T` we + // allocate fresh per record instead of reusing a slot; for the + // primitive-shaped timestamps this path typically sees, the + // difference is unmeasurable. + let owned_t = T::into_owned(time); if upper.less_equal(&owned_t) { // `insert_with` only clones when the time isn't already // present in the antichain. diff --git a/src/timely-util/src/columnar/builder_input.rs b/src/timely-util/src/columnar/builder_input.rs new file mode 100644 index 0000000000000..8dbd0e8347339 --- /dev/null +++ b/src/timely-util/src/columnar/builder_input.rs @@ -0,0 +1,111 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! `BuilderInput` impl for [`Column`] so DD `Builder`s can drain our paged +//! batcher's output without an extra container conversion. +//! +//! Mirrors the impl on [`ColumnationStack`](crate::columnation::ColumnationStack) +//! at `columnation.rs`, but the `Item<'a>` here is a columnar `Ref` tuple +//! rather than a borrowed owned tuple, so: +//! +//! - `Key<'a>` / `Val<'a>` are `Ref<'a, K>` / `Ref<'a, V>` — no `Owned` +//! round-trip on the read side. +//! - `Time` / `Diff` materialize as owned on `into_parts` (the trait +//! contract requires owned for these). +//! +//! Distinct-counts (`key_val_upd_counts`) tally per chunk and sum, accepting +//! 
at most `chain.len()` over-counts at chunk boundaries. The downstream +//! consumer uses these as capacity hints, so a small over-estimate is +//! cheaper than the alternative (snapshotting `K::Owned` / `V::Owned` +//! across chunk boundaries). + +use columnar::{Columnar, Index, Len}; +use differential_dataflow::difference::Semigroup; +use differential_dataflow::lattice::Lattice; +use differential_dataflow::trace::implementations::{BatchContainer, BuilderInput}; +use timely::progress::Timestamp; + +use crate::columnar::Column; + +impl BuilderInput for Column<((K, V), T, R)> +where + K: Columnar, + V: Columnar, + T: Columnar + Timestamp + Lattice + Clone, + R: Columnar + Ord + Semigroup + Clone, + for<'a> columnar::Ref<'a, K>: Copy + Ord, + for<'a> columnar::Ref<'a, V>: Copy + Ord, + KBC: BatchContainer, + VBC: BatchContainer, + for<'a, 'b> KBC::ReadItem<'a>: PartialEq>, + for<'a, 'b> VBC::ReadItem<'a>: PartialEq>, +{ + type Key<'a> = columnar::Ref<'a, K>; + type Val<'a> = columnar::Ref<'a, V>; + type Time = T; + type Diff = R; + + fn into_parts<'a>( + item: Self::Item<'a>, + ) -> (Self::Key<'a>, Self::Val<'a>, Self::Time, Self::Diff) { + let ((key, val), time, diff) = item; + (key, val, T::into_owned(time), R::into_owned(diff)) + } + + fn key_eq(this: &columnar::Ref<'_, K>, other: KBC::ReadItem<'_>) -> bool { + KBC::reborrow(other) == *this + } + + fn val_eq(this: &columnar::Ref<'_, V>, other: VBC::ReadItem<'_>) -> bool { + VBC::reborrow(other) == *this + } + + fn key_val_upd_counts(chain: &[Self]) -> (usize, usize, usize) { + // Per-chunk dedup, summed. Skips cross-chunk equality checks; the + // counts may over-count by up to `chain.len()` (one boundary per + // chunk). Capacity-hint consumers tolerate over-estimates. 
+ let mut keys = 0; + let mut vals = 0; + let mut upds = 0; + for col in chain.iter() { + let view = col.borrow(); + let len = view.len(); + if len == 0 { + continue; + } + let mut prev: Option<(columnar::Ref<'_, K>, columnar::Ref<'_, V>)> = None; + for i in 0..len { + let ((k, v), _, _) = view.get(i); + match prev { + None => { + keys += 1; + vals += 1; + } + Some((pk, pv)) => { + if pk != k { + keys += 1; + vals += 1; + } else if pv != v { + vals += 1; + } + } + } + upds += 1; + prev = Some((k, v)); + } + } + (keys, vals, upds) + } +} diff --git a/src/timely-util/src/columnar/merge_batcher.rs b/src/timely-util/src/columnar/merge_batcher.rs new file mode 100644 index 0000000000000..9c32481207b03 --- /dev/null +++ b/src/timely-util/src/columnar/merge_batcher.rs @@ -0,0 +1,939 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file at the +// root of this repository, or online at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Merge-batcher for [`Column`] chunks with per-chunk paging. +//! +//! Forks the [`differential_dataflow`] merge-batcher framework so chains can +//! hold [`PagedColumn`] entries — letting the [`ColumnPager`] page chunks +//! out as they're produced and fetch them back lazily during merge / extract. +//! +//! Reuses the resident building blocks from [`super::batcher`]: +//! [`ColumnChunker`] (input consolidation) and the inherent +//! 
`Column::merge_from` / `Column::extract` methods (per-chunk merge / split). +//! +//! [`differential_dataflow`]: differential_dataflow::trace::implementations::merge_batcher + +use std::collections::VecDeque; + +use columnar::{Columnar, Len}; +use differential_dataflow::difference::Semigroup; +use differential_dataflow::logging::{BatcherEvent, Logger}; +use differential_dataflow::trace::{Batcher, Builder, Description}; +use timely::Accountable; +use timely::PartialOrder; +use timely::container::{ContainerBuilder, PushInto, SizableContainer}; +use timely::dataflow::channels::ContainerBytes; +use timely::progress::Timestamp; +use timely::progress::frontier::{Antichain, AntichainRef}; + +use crate::column_pager::{self, ColumnPager, PagedColumn}; +use crate::columnar::Column; +use crate::columnar::batcher::ColumnChunker; + +// --------------------------------------------------------------------------- +// Batcher +// --------------------------------------------------------------------------- + +/// Drives the merge-batcher over [`Column`] chunks routed through a +/// [`ColumnPager`]. +/// +/// Chains hold [`PagedColumn`] entries rather than resident [`Column`]s, so +/// each insert / merge / extract step can hand its output to the pager and +/// store whatever the policy returns (resident, paged, or compressed). Reads +/// during merge materialize lazily via [`FetchIter`]. +/// +/// Resolves its pager lazily per call via [`column_pager::global_pager`], so +/// late-arriving dyncfg updates (e.g. `enable_column_paged_batcher` flipping +/// on after the batcher was constructed) take effect without rebuilding the +/// operator. Tests may override that lookup via [`Self::set_pager`]. +pub struct ColumnMergeBatcher +where + D: Columnar, + T: Columnar, + R: Columnar, +{ + chunker: ColumnChunker<(D, T, R)>, + chains: Vec>>, + lower: Antichain, + frontier: Antichain, + /// Optional override. 
`None` means "read [`column_pager::global_pager`] + /// fresh on every use" — the production path, so worker_config dyncfg + /// changes that re-install the process-global pager take effect on the + /// very next chunk this batcher processes. + pager_override: Option, + logger: Option, + operator_id: usize, +} + +impl ColumnMergeBatcher +where + D: Columnar, + T: Columnar, + R: Columnar, +{ + /// Pin the pager this batcher uses, overriding the process-global lookup. + /// Mainly for tests; production should leave the override unset so + /// dyncfg-driven re-installs take effect immediately. + pub fn set_pager(&mut self, pager: ColumnPager) { + self.pager_override = Some(pager); + } + + /// Current pager — override if set, else the process-global pager + /// installed by `apply_worker_config`. `ColumnPager` is cheaply + /// cloneable (Arc inside). + fn pager(&self) -> ColumnPager { + self.pager_override + .clone() + .unwrap_or_else(column_pager::global_pager) + } + + /// Push a chain into `self.chains`, emitting a positive `BatcherEvent` + /// covering its resident entries. + fn chain_push(&mut self, chain: VecDeque>) { + self.emit_account(&chain, 1); + self.chains.push(chain); + } + + /// Pop a chain from `self.chains`, emitting a negative `BatcherEvent` + /// retracting its resident entries. + fn chain_pop(&mut self) -> Option>> { + let chain = self.chains.pop()?; + self.emit_account(&chain, -1); + Some(chain) + } + + /// Emit a single `BatcherEvent` summing resident accounting across + /// `chain` with the given sign. No-op when no logger is attached.
+ fn emit_account(&self, chain: &VecDeque>, diff: isize) { + let Some(logger) = &self.logger else { + return; + }; + let (mut records, mut size, mut capacity, mut allocations) = + (0isize, 0isize, 0isize, 0isize); + for entry in chain { + let (r, s, c, a) = account_chunk(entry); + records = records.saturating_add_unsigned(r); + size = size.saturating_add_unsigned(s); + capacity = capacity.saturating_add_unsigned(c); + allocations = allocations.saturating_add_unsigned(a); + } + logger.log(BatcherEvent { + operator: self.operator_id, + records_diff: records.saturating_mul(diff), + size_diff: size.saturating_mul(diff), + capacity_diff: capacity.saturating_mul(diff), + allocations_diff: allocations.saturating_mul(diff), + }); + } +} + +impl Drop for ColumnMergeBatcher +where + D: Columnar, + T: Columnar, + R: Columnar, +{ + fn drop(&mut self) { + // Retract accounting for any chains still resident at drop time so + // the BatcherEvent counters end at zero per-operator. + while self.chain_pop().is_some() {} + } +} + +/// Resident-only accounting. Returns `(records, size_bytes, capacity_bytes, +/// allocations)` for a single chain entry; paged-out entries contribute 0 +/// across the board. +/// +/// `BatcherEvent` feeds the `mz_arrangement_batcher_*_raw` introspection +/// tables, which downstream surface as memory-resource dashboards. Bytes +/// living on swap or in a pager file aren't part of RSS and shouldn't be +/// reported there. +fn account_chunk(entry: &PagedColumn) -> (usize, usize, usize, usize) { + match entry { + PagedColumn::Resident(col, _) => { + let records = usize::try_from(col.record_count()).expect("non-negative"); + let bytes = col.length_in_bytes(); + (records, bytes, bytes, 1) + } + PagedColumn::Paged { .. } | PagedColumn::Compressed { .. 
} => (0, 0, 0, 0), + } +} + +impl Batcher for ColumnMergeBatcher +where + D: Columnar + 'static, + for<'a> columnar::Ref<'a, D>: Copy + Ord, + T: Columnar + Timestamp + Clone + Ord + PartialOrder + 'static, + for<'a> columnar::Ref<'a, T>: Copy + Ord, + R: Columnar + Default + Semigroup + for<'a> Semigroup> + 'static, + for<'a> columnar::Ref<'a, R>: Ord, + for<'a> <(D, T, R) as Columnar>::Container: columnar::Push<&'a (D, T, R)>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a D>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a T>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a R>, +{ + type Input = Column<(D, T, R)>; + type Output = Column<(D, T, R)>; + type Time = T; + + fn new(logger: Option, operator_id: usize) -> Self { + // No pager snapshot taken here — `self.pager()` reads + // `column_pager::global_pager` per call, so dyncfg-driven re-installs + // take effect on the next chunk. + Self { + chunker: ColumnChunker::default(), + chains: Vec::new(), + lower: Antichain::from_elem(T::minimum()), + frontier: Antichain::new(), + pager_override: None, + logger, + operator_id, + } + } + + fn push_container(&mut self, container: &mut Self::Input) { + let pager = self.pager(); + self.chunker.push_into(container); + while let Some(chunk) = self.chunker.extract() { + let mut chunk = std::mem::take(chunk); + let paged = pager.page(&mut chunk); + self.insert_chain(VecDeque::from([paged])); + } + } + + fn seal>( + &mut self, + upper: Antichain, + ) -> B::Output { + let pager = self.pager(); + // Finish chunker, fold any tail chunks in. + while let Some(chunk) = self.chunker.finish() { + let mut chunk = std::mem::take(chunk); + let paged = pager.page(&mut chunk); + self.insert_chain(VecDeque::from([paged])); + } + + // Merge all remaining chains into one. 
+ while self.chains.len() > 1 { + let a = self.chain_pop().unwrap(); + let b = self.chain_pop().unwrap(); + let merged = self.merge_by(a, b); + self.chain_push(merged); + } + let merged = self.chain_pop().unwrap_or_default(); + + // Extract `merged` into `readied` (ship side, materialized for the + // builder) and `kept_chain` (keep side, stays paged for the next + // round). + let mut readied: Vec> = Vec::new(); + let mut kept_chain: VecDeque> = VecDeque::new(); + self.frontier.clear(); + { + let pager = &pager; + let frontier = &mut self.frontier; + extract_chain( + FetchIter::new(merged, pager), + upper.borrow(), + frontier, + |paged| readied.push(pager.take(paged)), + |paged| kept_chain.push_back(paged), + ); + } + + if !kept_chain.is_empty() { + self.chain_push(kept_chain); + } + + let description = Description::new( + self.lower.clone(), + upper.clone(), + Antichain::from_elem(T::minimum()), + ); + let seal = B::seal(&mut readied, description); + self.lower = upper; + seal + } + + fn frontier(&mut self) -> AntichainRef<'_, Self::Time> { + self.frontier.borrow() + } +} + +impl ColumnMergeBatcher +where + D: Columnar + 'static, + for<'a> columnar::Ref<'a, D>: Copy + Ord, + T: Columnar + Clone + PartialOrder + 'static, + for<'a> columnar::Ref<'a, T>: Copy + Ord, + R: Columnar + Default + Semigroup + for<'a> Semigroup> + 'static, + for<'a> <(D, T, R) as Columnar>::Container: columnar::Push<&'a (D, T, R)>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a D>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a T>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a R>, +{ + /// Insert `chain` and rebalance: while the youngest chain is at least + /// half the size of its predecessor, merge them. 
+ fn insert_chain(&mut self, chain: VecDeque>) { + if chain.is_empty() { + return; + } + self.chain_push(chain); + while self.chains.len() > 1 + && self.chains[self.chains.len() - 1].len() + >= self.chains[self.chains.len() - 2].len() / 2 + { + let a = self.chain_pop().unwrap(); + let b = self.chain_pop().unwrap(); + let merged = self.merge_by(a, b); + self.chain_push(merged); + } + } + + /// Merge two sorted chains. Outputs are routed through `self.pager().page` + /// per chunk produced, so the result chain holds `PagedColumn`s and the + /// caller never sees a fully materialized merge result. + fn merge_by( + &mut self, + a: VecDeque>, + b: VecDeque>, + ) -> VecDeque> { + let mut output: VecDeque> = VecDeque::new(); + let pager = self.pager(); + let pager = &pager; + merge_chains( + FetchIter::new(a, pager), + FetchIter::new(b, pager), + |paged| output.push_back(paged), + ); + output + } +} + +// --------------------------------------------------------------------------- +// FetchIter +// --------------------------------------------------------------------------- + +/// Streaming materializer over a chain of [`PagedColumn`] entries. +/// +/// `next` consumes one entry and calls [`ColumnPager::take`] to produce a +/// resident [`Column`]. Bounds materialized chunks to whatever the consumer +/// holds (typically one head per chain in [`merge_chains`]). +pub struct FetchIter<'a, D, T, R> +where + (D, T, R): Columnar, +{ + queue: VecDeque>, + pager: &'a ColumnPager, +} + +impl<'a, D, T, R> FetchIter<'a, D, T, R> +where + (D, T, R): Columnar, +{ + /// Wraps `queue` for streaming materialization through `pager`. + pub fn new(queue: VecDeque>, pager: &'a ColumnPager) -> Self { + Self { queue, pager } + } + + /// Borrow the pager backing this iter so drivers can route output chunks + /// back through `page()` without threading a separate `&pager`.
The + /// returned reference is tied to the outer `'a`, not to `&self`, so it + /// stays valid across subsequent `next()` calls. + pub fn pager(&self) -> &'a ColumnPager { + self.pager + } + + /// Drain remaining queued entries as `PagedColumn`s without materializing. + /// Used by `merge_chains`'s drain-tail phase: once the other side is + /// exhausted, the remaining entries on this side can pass straight to the + /// output sink. + pub fn into_paged(self) -> std::collections::vec_deque::IntoIter> { + self.queue.into_iter() + } +} + +impl Iterator for FetchIter<'_, D, T, R> +where + (D, T, R): Columnar, +{ + type Item = Column<(D, T, R)>; + + fn next(&mut self) -> Option { + self.queue.pop_front().map(|p| self.pager.take(p)) + } +} + +// --------------------------------------------------------------------------- +// Streaming drivers +// --------------------------------------------------------------------------- + +/// Two-way merge driver. Reuses today's per-chunk gallop / ship-threshold +/// logic from `Column::merge_from`, but pulls heads from [`FetchIter`] and +/// emits finished output chunks through `sink` after routing them through +/// the pager exposed by [`FetchIter::pager`]. +/// +/// Whole-chunk passthrough is omitted: peeking endpoints on a paged head +/// would force materialization with no clean way to undo it. A follow-up can +/// add it back gated on `PagedColumn::Resident` heads (where peeks are free) +/// or by carrying first/last keys in the pager's metadata. 
+pub fn merge_chains( + list1: FetchIter<'_, D, T, R>, + list2: FetchIter<'_, D, T, R>, + mut sink: Sink, +) where + D: Columnar, + for<'a> columnar::Ref<'a, D>: Copy + Ord, + T: Columnar + Clone + PartialOrder, + for<'a> columnar::Ref<'a, T>: Copy + Ord, + R: Columnar + Default + Semigroup + for<'a> Semigroup>, + for<'a> <(D, T, R) as Columnar>::Container: columnar::Push<&'a (D, T, R)>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a D>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a T>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a R>, + Sink: FnMut(PagedColumn<(D, T, R)>), +{ + let pager = list1.pager(); + let mut list1 = list1; + let mut list2 = list2; + + let mut heads = [ + list1.next().unwrap_or_default(), + list2.next().unwrap_or_default(), + ]; + let mut positions = [0usize, 0usize]; + let mut result: Column<(D, T, R)> = Column::default(); + + loop { + let upper_l = heads[0].borrow().len(); + let upper_r = heads[1].borrow().len(); + if positions[0] >= upper_l || positions[1] >= upper_r { + break; + } + + let yielded = result.merge_from(&mut heads, &mut positions); + + if positions[0] >= heads[0].borrow().len() { + heads[0] = list1.next().unwrap_or_default(); + positions[0] = 0; + } + if positions[1] >= heads[1].borrow().len() { + heads[1] = list2.next().unwrap_or_default(); + positions[1] = 0; + } + if yielded || result.at_capacity() { + sink(pager.page(&mut result)); + } + } + + // Drain remaining: copy partial head through `merge_from`'s 1-input + // dispatch, then hand the rest of the chain's `PagedColumn`s straight to + // the sink without materializing. 
+ drain_side(&mut heads[0], &mut positions[0], list1, &mut result, &mut sink, pager); + drain_side(&mut heads[1], &mut positions[1], list2, &mut result, &mut sink, pager); + + if !result.is_empty() { + sink(pager.page(&mut result)); + } +} + +/// Helper for `merge_chains`'s drain phase: copy a partially-consumed head +/// into `result` (via 1-input `merge_from`), ship `result` if non-empty, then +/// pass the remaining queued `PagedColumn`s straight through. +fn drain_side( + head: &mut Column<(D, T, R)>, + pos: &mut usize, + rest: FetchIter<'_, D, T, R>, + result: &mut Column<(D, T, R)>, + sink: &mut Sink, + pager: &ColumnPager, +) where + D: Columnar, + for<'a> columnar::Ref<'a, D>: Copy + Ord, + T: Columnar + Clone + PartialOrder, + for<'a> columnar::Ref<'a, T>: Copy + Ord, + R: Columnar + Default + Semigroup + for<'a> Semigroup>, + for<'a> <(D, T, R) as Columnar>::Container: columnar::Push<&'a (D, T, R)>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a D>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a T>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a R>, + Sink: FnMut(PagedColumn<(D, T, R)>), +{ + if *pos < head.borrow().len() { + // 1-input dispatch — bulk copy that runs to completion. + let _ = result.merge_from(std::slice::from_mut(head), std::slice::from_mut(pos)); + } + if !result.is_empty() { + sink(pager.page(result)); + } + for paged in rest.into_paged() { + sink(paged); + } +} + +/// Streaming extract: walks `merged` chunk-by-chunk via `Column::extract`, +/// routing each filled keep/ship chunk through its sink after pageing. +/// Mirrors the per-chunk ship-threshold yield already inside +/// `Column::extract`. 
+pub fn extract_chain( + merged: FetchIter<'_, D, T, R>, + upper: AntichainRef, + frontier: &mut Antichain, + mut ship: SinkShip, + mut keep: SinkKeep, +) where + D: Columnar, + for<'a> columnar::Ref<'a, D>: Copy + Ord, + T: Columnar + Clone + PartialOrder, + for<'a> columnar::Ref<'a, T>: Copy + Ord, + R: Columnar + Default + Semigroup + for<'a> Semigroup>, + for<'a> <(D, T, R) as Columnar>::Container: columnar::Push<&'a (D, T, R)>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a D>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a T>, + for<'a> ::Container: columnar::Push>, + for<'a> ::Container: columnar::Push<&'a R>, + SinkShip: FnMut(PagedColumn<(D, T, R)>), + SinkKeep: FnMut(PagedColumn<(D, T, R)>), +{ + let pager = merged.pager(); + let mut keep_buf: Column<(D, T, R)> = Column::default(); + let mut ship_buf: Column<(D, T, R)> = Column::default(); + + for mut buffer in merged { + let mut position = 0; + let len = buffer.borrow().len(); + while position < len { + buffer.extract(&mut position, upper, frontier, &mut keep_buf, &mut ship_buf); + if keep_buf.at_capacity() { + keep(pager.page(&mut keep_buf)); + } + if ship_buf.at_capacity() { + ship(pager.page(&mut ship_buf)); + } + } + } + if !keep_buf.is_empty() { + keep(pager.page(&mut keep_buf)); + } + if !ship_buf.is_empty() { + ship(pager.page(&mut ship_buf)); + } +} + +#[cfg(test)] +#[allow(clippy::clone_on_ref_ptr)] +mod tests { + use std::sync::Arc; + + use columnar::Index; + use timely::container::PushInto as _; + + use super::*; + use crate::column_pager::{PageDecision, PageEvent, PageHint, PagingPolicy}; + + // ----- helpers ----------------------------------------------------------- + + type KvUpdate = ((u64, u64), u64, i64); + + fn col(rows: &[KvUpdate]) -> Column { + let mut c: Column = Default::default(); + for &t in rows { + c.push_into(t); + } + c + } + + fn collect_pc(chunks: &[PagedColumn], pager: &ColumnPager) -> Vec { + // 
`collect_pc` peeks via materialization on a side path so the test's + // assertions don't consume the chain. + chunks + .iter() + .flat_map(|p| { + let view: Column = match p { + PagedColumn::Resident(c, _) => clone_column(c), + _ => pager.take(clone_paged(p)), + }; + collect_column(&view).into_iter() + }) + .collect() + } + + fn collect_column(c: &Column) -> Vec { + c.borrow() + .into_index_iter() + .map(|((k, v), t, r)| { + ( + (u64::into_owned(k), u64::into_owned(v)), + u64::into_owned(t), + i64::into_owned(r), + ) + }) + .collect() + } + + fn clone_column(c: &Column) -> Column { + // `Column` is `Clone` when `C::Container: Clone`, which is true for + // tuple-of-primitive containers. Used so test helpers can peek at a + // chain without consuming it. + c.clone() + } + + /// Helper that bypasses `pager.take` for non-`Resident` variants by + /// taking and re-pageing. Only used in test inspection paths where the + /// extra round-trip is acceptable. + fn clone_paged(p: &PagedColumn) -> PagedColumn { + match p { + PagedColumn::Resident(c, _) => { + // Wrap via a disabled pager so the ticket is fresh. + let mut c = c.clone(); + ColumnPager::disabled().page(&mut c) + } + // For paged/compressed variants we can't clone without + // re-reading; the tests below only inspect Resident chains. + _ => panic!("clone_paged only supports Resident"), + } + } + + /// Always-page policy: bypasses any resident shortcut so we can assert + /// the chains remain in `Paged` form regardless of memory pressure. 
+    struct ForcePagePolicy {
+        // Number of `PagedOut` events recorded.
+        out: std::sync::atomic::AtomicUsize,
+        // Number of `PagedIn` events recorded.
+        r#in: std::sync::atomic::AtomicUsize,
+    }
+    impl ForcePagePolicy {
+        fn new() -> Arc<Self> {
+            Arc::new(Self {
+                out: std::sync::atomic::AtomicUsize::new(0),
+                r#in: std::sync::atomic::AtomicUsize::new(0),
+            })
+        }
+    }
+    impl PagingPolicy for ForcePagePolicy {
+        // Unconditionally request paging to the Swap backend, uncompressed.
+        fn decide(&self, _hint: PageHint) -> PageDecision {
+            PageDecision::Page {
+                backend: mz_ore::pager::Backend::Swap,
+                codec: None,
+            }
+        }
+        // Count page-out / page-in events; all other events are ignored.
+        fn record(&self, event: PageEvent) {
+            use std::sync::atomic::Ordering;
+            match event {
+                PageEvent::PagedOut { .. } => {
+                    self.out.fetch_add(1, Ordering::Relaxed);
+                }
+                PageEvent::PagedIn { .. } => {
+                    self.r#in.fetch_add(1, Ordering::Relaxed);
+                }
+                _ => {}
+            }
+        }
+    }
+
+    /// Wrap a `Vec<Column<KvUpdate>>` as a paged chain for `FetchIter`.
+    fn to_chain(
+        cols: Vec<Column<KvUpdate>>,
+        pager: &ColumnPager,
+    ) -> VecDeque<PagedColumn<KvUpdate>> {
+        cols.into_iter()
+            .map(|mut c| pager.page(&mut c))
+            .collect()
+    }
+
+    /// Drive `merge_chains` with a disabled pager and return owned tuples.
+    fn drive_merge(
+        chain1: Vec<Column<KvUpdate>>,
+        chain2: Vec<Column<KvUpdate>>,
+    ) -> Vec<KvUpdate> {
+        let pager = ColumnPager::disabled();
+        let q1 = to_chain(chain1, &pager);
+        let q2 = to_chain(chain2, &pager);
+        let mut output: Vec<PagedColumn<KvUpdate>> = Vec::new();
+        merge_chains(
+            FetchIter::new(q1, &pager),
+            FetchIter::new(q2, &pager),
+            |paged| output.push(paged),
+        );
+        collect_pc(&output, &pager)
+    }
+
+    // ----- merge_chains correctness -----------------------------------------
+
+    /// Disjoint chains: same data as the legacy passthrough test. Without
+    /// passthrough, the merger runs per-record but should still produce the
+    /// fully ordered output.
+ #[mz_ore::test] + fn merge_chains_disjoint_ranges() { + let out = drive_merge( + vec![ + col(&[((0, 0), 0, 1), ((1, 0), 0, 1)]), + col(&[((2, 0), 0, 1), ((3, 0), 0, 1)]), + ], + vec![ + col(&[((10, 0), 0, 1), ((11, 0), 0, 1)]), + col(&[((12, 0), 0, 1), ((13, 0), 0, 1)]), + ], + ); + let expected: Vec<_> = (0..4u64) + .map(|d| ((d, 0u64), 0u64, 1i64)) + .chain((10..14u64).map(|d| ((d, 0u64), 0u64, 1i64))) + .collect(); + assert_eq!(out, expected); + } + + /// Interleaved chains: every record alternates between the two chains. + #[mz_ore::test] + fn merge_chains_interleaved() { + let out = drive_merge( + vec![ + col(&[((0, 0), 0, 1), ((2, 0), 0, 1)]), + col(&[((4, 0), 0, 1), ((6, 0), 0, 1)]), + ], + vec![ + col(&[((1, 0), 0, 1), ((3, 0), 0, 1)]), + col(&[((5, 0), 0, 1), ((7, 0), 0, 1)]), + ], + ); + let expected: Vec<_> = (0..8u64).map(|d| ((d, 0u64), 0u64, 1i64)).collect(); + assert_eq!(out, expected); + } + + /// Equal-key consolidation across chunk boundaries: chain1's last record + /// shares `(d, t)` with chain2's first; sum of diffs should land on a + /// single output record. + #[mz_ore::test] + fn merge_chains_equal_boundary() { + let out = drive_merge( + vec![col(&[((0, 0), 0, 1), ((5, 0), 0, 1)])], + vec![col(&[((5, 0), 0, 1), ((10, 0), 0, 1)])], + ); + assert_eq!( + out, + vec![((0, 0), 0, 1), ((5, 0), 0, 2), ((10, 0), 0, 1)] + ); + } + + /// Same merge, force-paged: chains stay in `Paged` form throughout, and + /// the consolidated result still matches. + #[mz_ore::test] + fn merge_chains_force_paged_round_trip() { + let policy = ForcePagePolicy::new(); + let pager = ColumnPager::new(policy.clone()); + let q1 = to_chain( + vec![col(&[((0, 0), 0, 1), ((2, 0), 0, 1)])], + &pager, + ); + let q2 = to_chain( + vec![col(&[((1, 0), 0, 1), ((3, 0), 0, 1)])], + &pager, + ); + + // Confirm the chains started paged-out (not Resident). + assert!(matches!(q1.front().unwrap(), PagedColumn::Paged { .. 
})); + assert!(matches!(q2.front().unwrap(), PagedColumn::Paged { .. })); + + let mut output: Vec> = Vec::new(); + merge_chains( + FetchIter::new(q1, &pager), + FetchIter::new(q2, &pager), + |paged| output.push(paged), + ); + + // Output entries should also have been routed through the pager. + for p in &output { + assert!(matches!(p, PagedColumn::Paged { .. })); + } + + // Materialize the output and check correctness. + let mut collected = Vec::new(); + for p in output { + let c = pager.take(p); + collected.extend(collect_column(&c)); + } + let expected: Vec<_> = (0..4u64).map(|d| ((d, 0u64), 0u64, 1i64)).collect(); + assert_eq!(collected, expected); + } + + // ----- extract_chain correctness ---------------------------------------- + + #[mz_ore::test] + fn extract_chain_partitions_by_frontier() { + let pager = ColumnPager::disabled(); + let data = vec![ + ((0, 0), 0u64, 1i64), + ((1, 0), 1, 1), + ((2, 0), 2, 1), + ((3, 0), 3, 1), + ]; + let chain = to_chain(vec![col(&data)], &pager); + let upper = Antichain::from_elem(2u64); + let mut frontier: Antichain = Antichain::new(); + let mut ship: Vec> = Vec::new(); + let mut keep: Vec> = Vec::new(); + + extract_chain( + FetchIter::new(chain, &pager), + upper.borrow(), + &mut frontier, + |p| ship.push(p), + |p| keep.push(p), + ); + + let shipped = collect_pc(&ship, &pager); + let kept = collect_pc(&keep, &pager); + for (_, t, _) in &shipped { + assert!(*t < 2, "shipped time {t} should be < upper"); + } + for (_, t, _) in &kept { + assert!(*t >= 2, "kept time {t} should be >= upper"); + } + assert_eq!(shipped.len() + kept.len(), data.len()); + } + + // ----- ColumnMergeBatcher end-to-end ------------------------------------ + + /// Trivial Builder used by `seal`: collects inputs into a Vec for the + /// test to inspect. 
+ #[derive(Default)] + struct VecBuilder; + impl differential_dataflow::trace::Builder for VecBuilder { + type Input = Column; + type Time = u64; + type Output = Vec; + fn with_capacity(_keys: usize, _vals: usize, _upds: usize) -> Self { + Self + } + fn push(&mut self, _chunk: &mut Self::Input) {} + fn done(self, _description: differential_dataflow::trace::Description) -> Self::Output { + Vec::new() + } + fn seal( + chain: &mut Vec, + _description: differential_dataflow::trace::Description, + ) -> Self::Output { + let mut out = Vec::new(); + for c in chain.drain(..) { + out.extend(collect_column(&c)); + } + out + } + } + + #[mz_ore::test] + fn batcher_seal_round_trip() { + let mut b: ColumnMergeBatcher<(u64, u64), u64, i64> = + differential_dataflow::trace::Batcher::new(None, 0); + // Two pushes; second has an equal-key collision with the first. + let mut input1 = col(&[((1, 1), 0, 1), ((2, 0), 0, 1), ((3, 0), 0, 1)]); + let mut input2 = col(&[((2, 0), 0, 2), ((4, 0), 0, 1)]); + differential_dataflow::trace::Batcher::push_container(&mut b, &mut input1); + differential_dataflow::trace::Batcher::push_container(&mut b, &mut input2); + + // Seal everything (upper = ∞-ish, here just past any time we used). + let upper = Antichain::from_elem(u64::MAX); + let out: Vec = + differential_dataflow::trace::Batcher::seal::(&mut b, upper); + + // (2, 0)@0 was pushed with +1 then +2; sums to +3 after consolidation. 
+ let mut expected = vec![ + ((1u64, 1u64), 0u64, 1i64), + ((2, 0), 0, 3), + ((3, 0), 0, 1), + ((4, 0), 0, 1), + ]; + expected.sort(); + let mut out_sorted = out.clone(); + out_sorted.sort(); + assert_eq!(out_sorted, expected); + } + + #[mz_ore::test] + fn account_chunk_resident_vs_paged() { + let policy = ForcePagePolicy::new(); + let pager_paged = ColumnPager::new(policy.clone()); + let pager_res = ColumnPager::disabled(); + + let mut c1 = col(&[((1, 1), 0, 1), ((2, 0), 0, 1), ((3, 0), 0, 1)]); + let resident = pager_res.page(&mut c1); + let (records, size, capacity, allocations) = account_chunk(&resident); + assert_eq!(records, 3); + assert!(size > 0); + assert_eq!(size, capacity); + assert_eq!(allocations, 1); + + let mut c2 = col(&[((1, 1), 0, 1), ((2, 0), 0, 1)]); + let paged = pager_paged.page(&mut c2); + assert!(matches!(paged, PagedColumn::Paged { .. })); + // Paged variants contribute zero to memory accounting. + assert_eq!(account_chunk(&paged), (0, 0, 0, 0)); + } + + #[mz_ore::test] + fn batcher_seal_keeps_kept_chain_paged() { + // Force-page policy; verify that after seal, the kept chain in + // self.chains contains only Paged entries (no Resident). + let policy = ForcePagePolicy::new(); + let pager = ColumnPager::new(policy.clone()); + + let mut b: ColumnMergeBatcher<(u64, u64), u64, i64> = + differential_dataflow::trace::Batcher::new(None, 0); + b.set_pager(pager); + + // Push records straddling an upper of 5 — half should be kept, half + // shipped. Use enough records to fill at least one chunk. + let n: u64 = 200; + for i in 0..n { + let mut input = col(&[((i, 0), i % 10, 1)]); + differential_dataflow::trace::Batcher::push_container(&mut b, &mut input); + } + let upper = Antichain::from_elem(5u64); + let _ = differential_dataflow::trace::Batcher::seal::(&mut b, upper); + + // Anything kept (times >= 5) should be sitting in b.chains as paged. 
+ let kept_records: usize = b + .chains + .iter() + .flat_map(|c| c.iter()) + .map(|p| match p { + PagedColumn::Paged { meta, .. } => { + // Records aren't directly available here; sanity-check + // that no Resident snuck in. + let _ = meta; + 1 + } + PagedColumn::Compressed { meta, .. } => { + let _ = meta; + 1 + } + PagedColumn::Resident(_, _) => { + panic!("kept chain entry was Resident under ForcePagePolicy"); + } + }) + .sum(); + // We expect *some* kept entries (times in [5..10) loop slot). + assert!(kept_records > 0, "expected at least one kept paged entry"); + assert!(policy.out.load(std::sync::atomic::Ordering::Relaxed) > 0); + let _ = n; + } +} From 161d66a795ec20724ee8621bfdd26e646e28a5be Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Tue, 19 May 2026 11:04:56 -0400 Subject: [PATCH 33/34] timely-util,feature-benchmark: benches for the column-paged batcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three pieces of validation tooling for the column-paged merge batcher: a criterion microbench, an end-to-end timely example, and feature-benchmark scenarios. Criterion bench (`src/timely-util/benches/columnar_merge_batcher.rs`): compares the legacy `ColumnMerger` against the new path with disabled / swap / lz4 pagers across three input regimes (mixed, collisions, disjoint) and four cache-tier sizes. Prints a throughput summary table when the group finishes. Good for per-chunk-merge perf comparisons; doesn't exercise the dataflow operator graph. End-to-end example (`src/timely-util/examples/column_paged_spill.rs`): drives `arrange_core` over a cancellation workload (positives + negatives at the same time, so the spine stays empty and all pressure lives in the merge-batcher). Configurable workers / records / budget; back-to-back baseline + spill modes; optional RSS sampler thread via `ps`. 
Modeled on `differential-dataflow/examples/columnar_spill.rs` but uses our `Col2ValPagedBatcher` + `ColumnPager` + `TieredPolicy` directly instead of DD's `SpillBatcher`/`Threshold`/`FileSpill` plumbing. `cargo run --release --example column_paged_spill` for a smoke test; see `--help` for sweep options. Feature-benchmark scenarios (`misc/python/.../scenarios/benchmark_main.py`): * `DifferentialJoinColumnPaged` — same query shape as `DifferentialJoin`, paged batcher enabled. Measures steady-state overhead vs the legacy path. * `DifferentialJoinHydrationBaseline` / `DifferentialJoinHydrationFile` — sister leaves of a non-runnable `DifferentialJoinHydration` parent. Each measures the time to re-hydrate a linear-join arrangement after `REPLICATION FACTOR 0 -> 1` toggling. Baseline has the paged batcher off; File enables it with the file backend and `budget_fraction = 0.01` so chunks spill rather than competing with the spine for RAM. Compare under `--this-memory` + `--this-memory-swap` to evaluate user-space spill vs OS swap. Feature-benchmark CLI plumbing (`test/feature-benchmark/mzcompose.py`): adds `--this-memory`, `--this-memory-swap`, `--this-mem-swappiness` (and `--other-*` companions) so memory caps and swap behavior are configurable per side, plus `--skip-other` for iterating on `this` without the comparison round trip. The benchmark-result evaluator tolerates the single-side case by returning `None` ratios instead of indexing past the end of `_points`. 
--- .../feature_benchmark/benchmark_result.py | 5 + .../benchmark_result_evaluator.py | 8 +- .../scenarios/benchmark_main.py | 134 +++++ src/timely-util/Cargo.toml | 4 + .../benches/columnar_merge_batcher.rs | 456 ++++++++++++++++++ .../examples/column_paged_spill.rs | 327 +++++++++++++ test/feature-benchmark/mzcompose.py | 119 ++++- 7 files changed, 1046 insertions(+), 7 deletions(-) create mode 100644 src/timely-util/benches/columnar_merge_batcher.rs create mode 100644 src/timely-util/examples/column_paged_spill.rs diff --git a/misc/python/materialize/feature_benchmark/benchmark_result.py b/misc/python/materialize/feature_benchmark/benchmark_result.py index 31c5f56f0c7ef..dd272a6688f2e 100644 --- a/misc/python/materialize/feature_benchmark/benchmark_result.py +++ b/misc/python/materialize/feature_benchmark/benchmark_result.py @@ -93,6 +93,11 @@ def this_as_str(self) -> str: return f"{self.this():>11.3f}" def other(self) -> T: + # `_points` has length 1 when the runner ran only the THIS side + # (e.g. `--skip-other`); treat the absent baseline as `None` so + # report rendering falls through to its `None` formatting. + if len(self._points) < 2: + return None # type: ignore[return-value] return self._points[1] def other_as_str(self) -> str: diff --git a/misc/python/materialize/feature_benchmark/benchmark_result_evaluator.py b/misc/python/materialize/feature_benchmark/benchmark_result_evaluator.py index 6900b3f57ce90..87fcd997a07ae 100644 --- a/misc/python/materialize/feature_benchmark/benchmark_result_evaluator.py +++ b/misc/python/materialize/feature_benchmark/benchmark_result_evaluator.py @@ -51,7 +51,13 @@ def get_threshold(self, metric: BenchmarkScenarioMetric) -> float: return self.threshold_by_measurement_type[metric.measurement_type] def ratio(self, metric: BenchmarkScenarioMetric) -> float | None: - if metric._points[0] is None or metric._points[1] is None: + # `_points` has length 1 when the runner ran only the THIS side + # (e.g. 
`--skip-other`); there's no baseline to compare against. + if ( + len(metric._points) < 2 + or metric._points[0] is None + or metric._points[1] is None + ): return None else: return metric._points[0] / metric._points[1] diff --git a/misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py b/misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py index 4f1d577fb6e0c..fee6c354f0c80 100644 --- a/misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py +++ b/misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py @@ -947,6 +947,140 @@ def benchmark(self) -> MeasurementSource: """) +class DifferentialJoinColumnPaged(Dataflow): + """Same shape as `DifferentialJoin`, but with the column-paged merge + batcher enabled for the linear-join arrange stage. + + Compare against `DifferentialJoin` to gauge the steady-state overhead of + the paged path (resident chunks plus byte-budget bookkeeping) when no + pressure forces spill. To measure spill cost, see + `DifferentialJoinHydrationFile`. + """ + + def init(self) -> list[Action]: + return [ + self.view_ten(), + TdAction(f""" +$ postgres-connect name=mz_system url=postgres://mz_system:materialize@${{testdrive.materialize-internal-sql-addr}} +$ postgres-execute connection=mz_system +ALTER SYSTEM SET enable_column_paged_batcher = true; + +> CREATE MATERIALIZED VIEW v1 AS SELECT {self.unique_values()} AS f1, {self.unique_values()} AS f2 FROM {self.join()} +"""), + ] + + def benchmark(self) -> MeasurementSource: + return Td(f""" +> SELECT 1; + /* A */ +1 + + +> SELECT COUNT(*) FROM v1 AS a1 JOIN v1 AS a2 USING (f1) + /* B */ +{self.n()} +""") + + +class DifferentialJoinHydration(Dataflow): + """Non-leaf parent for the linear-join hydration benchmark family. + + Holds the shared `init` / `benchmark` (replica-toggle hydration loop) so + Baseline and File variants only need to override `shared()` with the + dyncfgs they want set. 
Has subclasses, so the feature-benchmark runner + treats it as non-leaf and never executes it directly — pick one of the + leaf classes via `--root-scenario`. + + Run both leaves under a memory-capped Materialized (`--this-memory=2g`) + so the baseline has to swap and the paged-file variant has somewhere + predictable to spill. + """ + + # SCALE=8 → 100M rows per side, ~1.6 GiB per side input. Two sides plus + # the join arrangement (typically 2–4× input) reliably exceeds a few + # GiB total; a 2g container cap forces real swap pressure on the + # baseline. File variant's 16 MiB per-worker + 128 MiB shared budget + # means almost everything spills under that cap. + SCALE = 8 + + def init(self) -> list[Action]: + # `v1` lives on the default cluster, not `join_cluster`, so the + # replication-factor toggle in `benchmark` only tears down `v2`'s + # dataflow. Keeps the measurement scoped to the join-arrangement + # rebuild we're trying to measure. + return [ + self.view_ten(), + TdAction(f""" +> CREATE MATERIALIZED VIEW v1 + AS SELECT {self.unique_values()} AS f1, {self.unique_values()} AS f2 FROM {self.join()} +> SELECT COUNT(*) FROM v1 +{self.n()} + +> CREATE CLUSTER join_cluster SIZE 'scale=1,workers=16', REPLICATION FACTOR 1 +"""), + ] + + def benchmark(self) -> MeasurementSource: + # Match HydrateIndex's pattern: take the cluster offline *before* + # defining the object so the dataflow doesn't pre-hydrate. The + # `REPLICATION FACTOR 1` flip after `/* A */` is the actual + # hydration trigger we want to time. 
+ return Td(f""" +> DROP MATERIALIZED VIEW IF EXISTS v2 + +> ALTER CLUSTER join_cluster SET (REPLICATION FACTOR 0) + +> CREATE MATERIALIZED VIEW v2 + IN CLUSTER join_cluster + AS SELECT COUNT(*) FROM v1 AS a1 JOIN v1 AS a2 USING (f1) + +> SELECT 1 + /* A */ +1 +> ALTER CLUSTER join_cluster SET (REPLICATION FACTOR 1) +> SET CLUSTER = join_cluster +> SELECT * FROM v2 + /* B */ +{self.n()} +> SET CLUSTER = default +""") + + +class DifferentialJoinHydrationBaseline(DifferentialJoinHydration): + """Hydration measurement with the paged batcher disabled (current + production path). Compare against `DifferentialJoinHydrationFile` to + see if user-space file-backed spill beats OS swap under pressure. + """ + + def shared(self) -> Action: + return TdAction(""" +$ postgres-connect name=mz_system url=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr} +$ postgres-execute connection=mz_system +ALTER SYSTEM SET enable_column_paged_batcher = false; +""") + + +class DifferentialJoinHydrationFile(DifferentialJoinHydration): + """Hydration time with the column-paged batcher on, file backend, and + a tight budget fraction so the merge-batcher transient spills rather + than competing with the spine for RAM. + + `budget_fraction = 0.01` (1% of announced memory limit) lands in the + clamp floors of the worker-init derivation (per-worker 16 MiB, + shared 128 MiB), giving us the same effective sizing we benchmarked + before the fraction-knob refactor. 
+ """ + + def shared(self) -> Action: + return TdAction(""" +$ postgres-connect name=mz_system url=postgres://mz_system:materialize@${testdrive.materialize-internal-sql-addr} +$ postgres-execute connection=mz_system +ALTER SYSTEM SET enable_column_paged_batcher = true; +ALTER SYSTEM SET column_paged_batcher_backend = 'file'; +ALTER SYSTEM SET column_paged_batcher_budget_fraction = 0.01; +""") + + class FullOuterJoin(Dataflow): def benchmark(self) -> BenchmarkingSequence: columns_select = ", ".join( diff --git a/src/timely-util/Cargo.toml b/src/timely-util/Cargo.toml index 6000b887ad64d..cb4d4a1d67801 100644 --- a/src/timely-util/Cargo.toml +++ b/src/timely-util/Cargo.toml @@ -21,6 +21,10 @@ harness = false name = "columnar_merger" harness = false +[[bench]] +name = "columnar_merge_batcher" +harness = false + [[bench]] name = "column_pager" harness = false diff --git a/src/timely-util/benches/columnar_merge_batcher.rs b/src/timely-util/benches/columnar_merge_batcher.rs new file mode 100644 index 0000000000000..78a6cae60a76f --- /dev/null +++ b/src/timely-util/benches/columnar_merge_batcher.rs @@ -0,0 +1,456 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Microbenchmark comparing the legacy column-backed `ColumnMerger` against +//! the new pageable `ColumnMergeBatcher` driver on the merge-batcher's hot +//! path. +//! +//! Each iteration drives a single 2-input merge — either via `Merger::merge` +//! (legacy) or via `merge_chains` (new, driven through a `ColumnPager`). +//! +//! Three pager configurations sweep the cost of the new path: +//! +//! - **`paged-disabled`** — `ColumnPager::disabled`; chunks stay `Resident` +//! throughout. 
Compared to `column`, this isolates the pager-wrapping +//! overhead (extra `Resident(_, ticket)` enum dispatch and the +//! `FetchIter`-shaped driver). +//! - **`paged-swap`** — every chunk routes through the Swap backend +//! uncompressed. Measures the cost of byte-level serialization + buffered +//! allocation moves with no codec work. +//! - **`paged-lz4`** — same as `paged-swap` but with lz4 frame compression. +//! Adds codec CPU cost to the swap baseline. +//! +//! Two axes match the sister bench `columnar_merger.rs`: +//! regime × size. See that file for axis rationale. + +use std::collections::VecDeque; +use std::mem::size_of; +use std::sync::Arc; + +use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use differential_dataflow::trace::implementations::merge_batcher::Merger; +use mz_ore::cast::{CastFrom, CastLossy}; +use mz_ore::pager::Backend; +use mz_timely_util::column_pager::{ + Codec, ColumnPager, PageDecision, PageEvent, PageHint, PagedColumn, PagingPolicy, +}; +use mz_timely_util::columnar::Column; +use mz_timely_util::columnar::batcher::ColumnMerger; +use mz_timely_util::columnar::merge_batcher::{FetchIter, merge_chains}; +use rand::{Rng, SeedableRng, rngs::StdRng}; +use timely::container::PushInto; + +type Data = (u64, u64); +type Time = u64; +type Diff = i64; +type Tuple = (Data, Time, Diff); + +/// Per-side heap footprints to sweep across. Same tiers as +/// `columnar_merger.rs`. +const SIZES: &[(&str, usize)] = &[ + ("32K", 32 * 1024), + ("512K", 512 * 1024), + ("8M", 8 * 1024 * 1024), + ("128M", 128 * 1024 * 1024), +]; + +// --------------------------------------------------------------------------- +// Pager driver helpers +// --------------------------------------------------------------------------- + +/// Always-page policy parameterized over backend + codec. Used to force the +/// merger through the byte-shaped path even when chunks are tiny. 
+struct ForcePage { + backend: Backend, + codec: Option, +} +impl PagingPolicy for ForcePage { + fn decide(&self, _hint: PageHint) -> PageDecision { + PageDecision::Page { + backend: self.backend, + codec: self.codec, + } + } + fn record(&self, _event: PageEvent) {} +} + +fn pager_disabled() -> ColumnPager { + ColumnPager::disabled() +} + +fn pager_force(backend: Backend, codec: Option) -> ColumnPager { + ColumnPager::new(Arc::new(ForcePage { backend, codec })) +} + +/// Wrap a single resident `Column` as a one-entry chain. +fn one_chain(mut c: Column, pager: &ColumnPager) -> VecDeque> { + let paged = pager.page(&mut c); + VecDeque::from([paged]) +} + +// --------------------------------------------------------------------------- +// Data generation (mirrors `columnar_merger.rs`) +// --------------------------------------------------------------------------- + +fn make(seed: u64, n: usize, key_range: u64, time_range: u64) -> Vec { + let mut rng = StdRng::seed_from_u64(seed); + let mut raw: Vec = (0..n) + .map(|_| { + ( + ( + rng.random_range(0..key_range), + rng.random_range(0..key_range), + ), + rng.random_range(0..time_range), + rng.random_range(-3i64..=3), + ) + }) + .collect(); + raw.sort(); + let mut out: Vec = Vec::new(); + for (d, t, r) in raw { + if let Some(last) = out.last_mut() { + if last.0 == d && last.1 == t { + last.2 += r; + continue; + } + } + out.push((d, t, r)); + } + out.retain(|x| x.2 != 0); + out +} + +fn build_column(data: &[Tuple]) -> Column { + let mut col: Column = Default::default(); + for &tup in data { + col.push_into(tup); + } + col +} + +fn configs(n: usize) -> [(&'static str, Vec, Vec); 3] { + let n_u64 = u64::cast_from(n); + [ + ("mixed", make(1, n, 2 * n_u64, 4), make(2, n, 2 * n_u64, 4)), + ( + "collisions", + make(3, n, u64::cast_from(n / 4), 2), + make(4, n, u64::cast_from(n / 4), 2), + ), + ( + "disjoint", + make(5, n, n_u64, 4), + make(6, n, n_u64, 4) + .into_iter() + .map(|((k1, k2), t, r)| ((k1 + n_u64, k2 + n_u64), t, r)) + 
.collect(), + ), + ] +} + +// --------------------------------------------------------------------------- +// Benchmark +// --------------------------------------------------------------------------- + +/// One row of the throughput summary — bytes-per-iter, plus the four variant +/// labels we'll look up in `target/criterion////...`. +const VARIANTS: &[&str] = &["column", "paged-disabled", "paged-swap", "paged-lz4"]; + +fn bench_merge_batcher(c: &mut Criterion) { + let mut group = c.benchmark_group("merge_batcher_two_sorted"); + + let bytes_per_record = size_of::(); + let mut summary: Vec<(String, u64)> = Vec::new(); + + for (size_label, bytes_per_side) in SIZES { + let n = bytes_per_side / bytes_per_record; + let cfgs = configs(n); + + for (regime, a, b) in &cfgs { + let bytes = u64::try_from((a.len() + b.len()) * bytes_per_record).unwrap(); + group.throughput(Throughput::Bytes(bytes)); + + let id = format!("{regime}/{size_label}"); + summary.push((id.clone(), bytes)); + + // Variant 1: legacy `ColumnMerger::merge`. Baseline. + group.bench_with_input(BenchmarkId::new("column", &id), &(), |bencher, _| { + bencher.iter_batched( + || (build_column(a), build_column(b)), + |(ca, cb)| { + let mut merger: ColumnMerger = Default::default(); + let mut output = Vec::new(); + let mut stash = Vec::new(); + merger.merge(vec![ca], vec![cb], &mut output, &mut stash); + output + }, + BatchSize::LargeInput, + ); + }); + + // Variant 2: new path, disabled pager. Isolates wrapping cost. + group.bench_with_input( + BenchmarkId::new("paged-disabled", &id), + &(), + |bencher, _| { + let pager = pager_disabled(); + bencher.iter_batched( + || { + ( + one_chain(build_column(a), &pager), + one_chain(build_column(b), &pager), + ) + }, + |(q1, q2)| { + let mut output: Vec> = Vec::new(); + merge_chains( + FetchIter::new(q1, &pager), + FetchIter::new(q2, &pager), + |p| output.push(p), + ); + output + }, + BatchSize::LargeInput, + ); + }, + ); + + // Variant 3: force-page to Swap, no codec. 
+ group.bench_with_input(BenchmarkId::new("paged-swap", &id), &(), |bencher, _| { + let pager = pager_force(Backend::Swap, None); + bencher.iter_batched( + || { + ( + one_chain(build_column(a), &pager), + one_chain(build_column(b), &pager), + ) + }, + |(q1, q2)| { + let mut output: Vec> = Vec::new(); + merge_chains( + FetchIter::new(q1, &pager), + FetchIter::new(q2, &pager), + |p| output.push(p), + ); + output + }, + BatchSize::LargeInput, + ); + }); + + // Variant 4: force-page to Swap with lz4. Codec cost. + group.bench_with_input(BenchmarkId::new("paged-lz4", &id), &(), |bencher, _| { + let pager = pager_force(Backend::Swap, Some(Codec::Lz4)); + bencher.iter_batched( + || { + ( + one_chain(build_column(a), &pager), + one_chain(build_column(b), &pager), + ) + }, + |(q1, q2)| { + let mut output: Vec> = Vec::new(); + merge_chains( + FetchIter::new(q1, &pager), + FetchIter::new(q2, &pager), + |p| output.push(p), + ); + output + }, + BatchSize::LargeInput, + ); + }); + } + } + + group.finish(); + + print_throughput_table( + "Throughput summary — primitive ((u64, u64), u64, i64):", + "merge_batcher_two_sorted", + &summary, + ); +} + +// =========================================================================== +// Throughput summary helpers +// +// Same shape as `columnar_merger.rs` but widened for our four variants. The +// helpers are duplicated rather than shared because bench files don't have +// an easy way to import each other. 
+// =========================================================================== + +fn criterion_dir() -> std::path::PathBuf { + let mut cur = std::env::current_dir().unwrap_or_default(); + loop { + let candidate = cur.join("target").join("criterion"); + if candidate.is_dir() { + return candidate; + } + if !cur.pop() { + return std::path::PathBuf::from("target/criterion"); + } + } +} + +fn read_criterion_median_ns(group: &str, bench_id: &str) -> Option { + let path = criterion_dir() + .join(group) + .join(bench_id) + .join("new") + .join("estimates.json"); + let json = std::fs::read_to_string(&path).ok()?; + let median_idx = json.find("\"median\"")?; + let after = &json[median_idx..]; + let pe_marker = "\"point_estimate\""; + let pe_idx = after.find(pe_marker)?; + let rest = after[pe_idx + pe_marker.len()..].trim_start(); + let rest = rest.strip_prefix(':')?.trim_start(); + let end = rest.find(|c: char| c == ',' || c == '}')?; + rest[..end].trim().parse::().ok() +} + +fn fmt_throughput(bytes: u64, ns: f64) -> String { + if !ns.is_finite() || ns <= 0.0 { + return "—".to_string(); + } + let bytes_per_sec = f64::cast_lossy(bytes) * 1e9 / ns; + let gibs = bytes_per_sec / f64::cast_lossy(1u64 << 30); + if gibs >= 1.0 { + format!("{gibs:.2} GiB/s") + } else { + let mibs = bytes_per_sec / f64::cast_lossy(1u64 << 20); + format!("{mibs:.0} MiB/s") + } +} + +fn fmt_time(ns: f64) -> String { + if !ns.is_finite() { + "—".to_string() + } else if ns < 1e3 { + format!("{:.0} ns", ns) + } else if ns < 1e6 { + format!("{:.1} µs", ns / 1e3) + } else if ns < 1e9 { + format!("{:.2} ms", ns / 1e6) + } else { + format!("{:.2} s", ns / 1e9) + } +} + +fn fmt_ratio(num_ns: f64, den_ns: f64) -> String { + if !(num_ns.is_finite() && den_ns.is_finite()) || den_ns <= 0.0 { + return "—".to_string(); + } + let r = num_ns / den_ns; + if (r - 1.0).abs() < 0.01 { + "≈ 1.00×".to_string() + } else if r < 1.0 { + format!("{:.2}× faster", 1.0 / r) + } else { + format!("{:.2}× slower", r) + } +} + +fn 
print_throughput_table(title: &str, group: &str, rows: &[(String, u64)]) { + // Columns: Config | column | paged-disabled | paged-swap | paged-lz4 | + // disabled vs column. + let mut cells: Vec> = Vec::with_capacity(rows.len()); + for (label, bytes) in rows { + let ns: Vec = VARIANTS + .iter() + .map(|v| { + let bench_id = format!("{}/{}", v, label.replace('/', "_")); + read_criterion_median_ns(group, &bench_id).unwrap_or(f64::NAN) + }) + .collect(); + let column_ns = ns[0]; + let disabled_ns = ns[1]; + + let mut row = vec![label.clone()]; + for (variant_ns, _variant) in ns.iter().zip(VARIANTS.iter()) { + row.push(format!( + "{} ({})", + fmt_throughput(*bytes, *variant_ns), + fmt_time(*variant_ns) + )); + } + row.push(fmt_ratio(disabled_ns, column_ns)); + cells.push(row); + } + + let headers = [ + "Config", + "column", + "paged-disabled", + "paged-swap", + "paged-lz4", + "disabled vs column", + ]; + let max_chars = |i: usize| -> usize { + cells + .iter() + .map(|c| c[i].chars().count()) + .max() + .unwrap_or(0) + .max(headers[i].chars().count()) + }; + let widths: Vec = (0..headers.len()).map(max_chars).collect(); + + let bar = |l: char, m: char, r: char| -> String { + let mut s = String::new(); + s.push(l); + for (i, &w) in widths.iter().enumerate() { + for _ in 0..w + 2 { + s.push('─'); + } + s.push(if i + 1 < widths.len() { m } else { r }); + } + s + }; + + println!(); + println!("{title}"); + println!(); + println!("{}", bar('┌', '┬', '┐')); + let header_row = headers + .iter() + .zip(widths.iter()) + .map(|(h, w)| format!(" {:^w$} ", h, w = w)) + .collect::>() + .join("│"); + println!("│{header_row}│"); + println!("{}", bar('├', '┼', '┤')); + for (i, row) in cells.iter().enumerate() { + if i > 0 { + println!("{}", bar('├', '┼', '┤')); + } + let line = row + .iter() + .zip(widths.iter()) + .enumerate() + .map(|(idx, (cell, w))| { + if idx == 0 { + format!(" {:w$} ", cell, w = w) + } else { + format!(" {:>() + .join("│"); + println!("│{line}│"); + } + 
println!("{}", bar('└', '┴', '┘')); +} + +criterion_group!(benches, bench_merge_batcher); +criterion_main!(benches); diff --git a/src/timely-util/examples/column_paged_spill.rs b/src/timely-util/examples/column_paged_spill.rs new file mode 100644 index 0000000000000..c755795dd2218 --- /dev/null +++ b/src/timely-util/examples/column_paged_spill.rs @@ -0,0 +1,327 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! End-to-end spill demo for the column-paged merge batcher. +//! +//! Drives a real timely dataflow (`arrange_core` over multiple workers) +//! against a cancellation workload: each `(k, v, t, +d)` is followed by +//! `(k, v, t, -d)` at the same logical time, so the *spine* stays empty +//! and all memory pressure lives in the merge-batcher's transient state. +//! That's the regime where the paged batcher should obviously win over +//! the no-spill baseline. +//! +//! Cancellation pattern, RSS sampler thread, and key-scrambling +//! (`mix()` so post-sort columnar bytes look incompressible) all +//! mirror `differential-dataflow/examples/columnar_spill.rs`. The +//! pager indirection swaps DD's `Spill`/`SpillPolicy`/`Fetch`/`Threshold` +//! plumbing for our existing `ColumnPager` + `TieredPolicy`. +//! +//! ```text +//! cargo run --release --example column_paged_spill -- --help +//! cargo run --release --example column_paged_spill -- --mode both --workers 4 \ +//! --times 64 --keys 24000000 --per-worker 33554432 --shared 536870912 \ +//! --sample-secs 30 +//! 
``` + +use std::rc::Rc; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use differential_dataflow::operators::arrange::arrangement::arrange_core; +use differential_dataflow::trace::implementations::Vector; +use differential_dataflow::trace::implementations::ord_neu::{OrdValBatch, OrdValBuilder}; +use differential_dataflow::trace::implementations::spine_fueled::Spine; +use differential_dataflow::trace::rc_blanket_impls::RcBuilder; +use mz_ore::pager::{self, Backend}; +use mz_timely_util::column_pager::policy::TieredPolicy; +use mz_timely_util::column_pager::{ColumnPager, set_global_pager}; +use mz_timely_util::columnar::Column; +use mz_timely_util::columnar::Col2ValPagedBatcher; +use mz_timely_util::columnar::builder::ColumnBuilder; +use timely::dataflow::InputHandle; +use timely::dataflow::channels::pact::Pipeline; +use timely::dataflow::operators::Input; +use timely::dataflow::operators::probe::{Handle as ProbeHandle, Probe}; + +type Update = ((u64, u64), u64, i64); + +type MyBatcher = Col2ValPagedBatcher; +type MyBuilder = RcBuilder, Column>>; +type MySpine = Spine>>>; + +#[derive(Debug, Clone, Copy, PartialEq)] +enum Mode { + Both, + Spill, + Baseline, +} + +struct Config { + times: u64, + keys_per_time: u64, + per_worker_bytes: usize, + shared_bytes: usize, + workers: usize, + sample_secs: u64, + mode: Mode, +} + +fn install_pager(spill: bool, per_worker: usize, shared: usize) { + if spill { + // Each process keeps a single `mz-pager-{pid}-{nonce}` subdir under + // this root; reused across `set_global_pager` reinstalls. + pager::set_scratch_dir(std::env::temp_dir()); + let policy = Arc::new(TieredPolicy::new(per_worker, shared, Backend::File, None)); + set_global_pager(ColumnPager::new(policy)); + } else { + set_global_pager(ColumnPager::disabled()); + } +} + +fn run_dataflow(cfg: &Config, label: &str) -> Duration { + let stop = Arc::new(AtomicBool::new(false)); + + // RSS sampler thread. 
`ps -o rss=` is portable across Linux + macOS + // and doesn't add a dep just to read /proc/self/status. + let sampler = if cfg.sample_secs > 0 { + let stop = stop.clone(); + let label_owned = label.to_string(); + let interval = Duration::from_secs(cfg.sample_secs); + let start = Instant::now(); + Some(std::thread::spawn(move || { + while !stop.load(Ordering::Relaxed) { + if let Some(rss) = rss_kb() { + println!( + " [{}] +{:>5.0}s RSS {:>9} kB", + label_owned, + start.elapsed().as_secs_f64(), + rss, + ); + } + std::thread::sleep(interval); + } + })) + } else { + None + }; + + let times = cfg.times; + let keys_per_time = cfg.keys_per_time; + let timer = Instant::now(); + + timely::execute(timely::Config::process(cfg.workers), move |worker| { + let index = worker.index(); + let peers = worker.peers(); + + let mut input = + >>::new_with_builder(); + let mut probe: ProbeHandle = ProbeHandle::new(); + + worker.dataflow::(|scope| { + let stream = scope.input_from(&mut input); + let arranged = arrange_core::<_, MyBatcher, MyBuilder, MySpine>( + stream, + Pipeline, + "ColumnPagedSpillArrange", + ); + arranged.stream.probe_with(&mut probe); + }); + + // Push positives then negatives at the same logical time so they + // cancel inside the merger rather than producing two giant sealed + // batches that cancel only at the spine. `mix` scrambles the keys + // so the post-sort columnar bytes look incompressible — without + // this macOS' page compressor crushes the sequential-u64 pattern + // and skews the comparison toward baseline. 
+ const STEP_EVERY: usize = 1 << 16; + let mut sent_since_step = 0usize; + for sign in [1i64, -1] { + for t in 0..times { + let mut k = index as u64; + while k < keys_per_time { + let kh = mix(k); + let d = ((kh as i64) >> 1) | 1; + input.send(((kh, kh & 0x3), t, sign * d)); + k += peers as u64; + sent_since_step += 1; + if sent_since_step >= STEP_EVERY { + worker.step(); + sent_since_step = 0; + } + } + } + } + input.advance_to(1); + input.flush(); + + while probe.less_than(input.time()) { + worker.step(); + } + }) + .expect("timely::execute failed"); + + let elapsed = timer.elapsed(); + stop.store(true, Ordering::Relaxed); + if let Some(s) = sampler { + let _ = s.join(); + } + elapsed +} + +/// Reversible bijection that destroys spatial locality of sequential keys. +/// `xorshift*` mixing — output is determined by `k` so cancellation still +/// pairs the same `(k, v, t, +d)` with `(k, v, t, -d)`. +fn mix(k: u64) -> u64 { + let x = k.wrapping_mul(0x9E37_79B9_7F4A_7C15); + x ^ (x >> 32) +} + +fn rss_kb() -> Option { + let pid = std::process::id(); + let output = std::process::Command::new("ps") + .args(["-o", "rss=", "-p", &pid.to_string()]) + .output() + .ok()?; + let s = std::str::from_utf8(&output.stdout).ok()?; + s.trim().parse::().ok() +} + +fn main() { + let cfg = match parse_args() { + Some(cfg) => cfg, + None => return, + }; + + let total_records = (cfg.times * cfg.keys_per_time) as usize * 2; + let bytes_per_record = std::mem::size_of::(); + let raw_gb = (total_records * bytes_per_record) as f64 / (1u64 << 30) as f64; + println!( + "config: times={} keys={} workers={} per_worker={} shared={} mode={:?} sample_secs={}", + cfg.times, + cfg.keys_per_time, + cfg.workers, + cfg.per_worker_bytes, + cfg.shared_bytes, + cfg.mode, + cfg.sample_secs, + ); + println!( + "workload: {} records ({:.2} GB raw, {} bytes/record) — cancellation, spine stays empty", + total_records, raw_gb, bytes_per_record, + ); + + if cfg.mode != Mode::Baseline { + install_pager(true, 
cfg.per_worker_bytes, cfg.shared_bytes); + let elapsed = run_dataflow(&cfg, "spill"); + println!( + "spill: {:.2}s | {:.2} M records/s | {:.2} GB/s", + elapsed.as_secs_f64(), + total_records as f64 / elapsed.as_secs_f64() / 1e6, + raw_gb / elapsed.as_secs_f64(), + ); + } + + if cfg.mode != Mode::Spill { + install_pager(false, 0, 0); + let elapsed = run_dataflow(&cfg, "baseline"); + println!( + "baseline: {:.2}s | {:.2} M records/s | {:.2} GB/s", + elapsed.as_secs_f64(), + total_records as f64 / elapsed.as_secs_f64() / 1e6, + raw_gb / elapsed.as_secs_f64(), + ); + } +} + +fn parse_args() -> Option { + let mut cfg = Config { + times: 8, + keys_per_time: 500_000, + per_worker_bytes: 32 * 1024 * 1024, + shared_bytes: 512 * 1024 * 1024, + workers: 1, + sample_secs: 0, + mode: Mode::Both, + }; + let mut it = std::env::args().skip(1); + while let Some(a) = it.next() { + let take = |it: &mut dyn Iterator, name: &str| -> String { + it.next().unwrap_or_else(|| { + print_usage(); + panic!("--{} requires a value", name) + }) + }; + match a.as_str() { + "-h" | "--help" => { + print_usage(); + return None; + } + "--times" => cfg.times = take(&mut it, "times").parse().expect("times: u64"), + "--keys" => { + cfg.keys_per_time = take(&mut it, "keys").parse().expect("keys: u64"); + } + "--per-worker" => { + cfg.per_worker_bytes = take(&mut it, "per-worker") + .parse() + .expect("per-worker: usize"); + } + "--shared" => { + cfg.shared_bytes = take(&mut it, "shared").parse().expect("shared: usize"); + } + "--workers" => { + cfg.workers = take(&mut it, "workers").parse().expect("workers: usize"); + } + "--sample-secs" => { + cfg.sample_secs = take(&mut it, "sample-secs") + .parse() + .expect("sample-secs: u64"); + } + "--mode" => { + cfg.mode = match take(&mut it, "mode").as_str() { + "both" => Mode::Both, + "spill" => Mode::Spill, + "baseline" => Mode::Baseline, + other => { + print_usage(); + panic!("unknown mode: {other}"); + } + }; + } + other => { + print_usage(); + 
panic!("unknown arg: {other}"); + } + } + } + Some(cfg) +} + +fn print_usage() { + eprintln!("Usage: column_paged_spill [OPTIONS]"); + eprintln!(); + eprintln!(" --times N distinct data timestamps (default 8)"); + eprintln!(" --keys N keys per timestamp (default 500000)"); + eprintln!(" --per-worker BYTES TieredPolicy per-worker budget (default 32 MiB)"); + eprintln!(" --shared BYTES TieredPolicy shared budget (default 512 MiB)"); + eprintln!(" --workers N timely worker threads (default 1)"); + eprintln!(" --sample-secs N print RSS every N seconds (default 0 = off)"); + eprintln!(" --mode MODE spill | baseline | both (default both)"); + eprintln!(); + eprintln!("Total records pushed = 2 * times * keys (positives + negatives that cancel)."); + eprintln!("Records partitioned across workers by `k % workers` after `mix()` scramble."); + eprintln!(); + eprintln!("Examples:"); + eprintln!(" # quick smoke — 8M records, both modes, 1 worker"); + eprintln!(" column_paged_spill"); + eprintln!(); + eprintln!(" # 100 GB workload on 4 workers, RSS every 30s, spill-only"); + eprintln!(" column_paged_spill --mode spill --workers 4 \\"); + eprintln!(" --times 64 --keys 24000000 --sample-secs 30"); +} diff --git a/test/feature-benchmark/mzcompose.py b/test/feature-benchmark/mzcompose.py index 3ad49b02704fb..e1013d9c8e046 100644 --- a/test/feature-benchmark/mzcompose.py +++ b/test/feature-benchmark/mzcompose.py @@ -189,15 +189,27 @@ def run_one_scenario( early_abort = False - for mz_id, instance in enumerate(["this", "other"]): - balancerd, tag, size, params = ( - (args.this_balancerd, args.this_tag, args.this_size, args.this_params) + instances = ["this"] if args.skip_other else ["this", "other"] + for mz_id, instance in enumerate(instances): + balancerd, tag, size, params, memory, memory_swap, mem_swappiness = ( + ( + args.this_balancerd, + args.this_tag, + args.this_size, + args.this_params, + args.this_memory, + args.this_memory_swap, + args.this_mem_swappiness, + ) if instance 
== "this" else ( args.other_balancerd, args.other_tag, args.other_size, args.other_params, + args.other_memory, + args.other_memory_swap, + args.other_mem_swappiness, ) ) @@ -228,10 +240,18 @@ def run_one_scenario( size, additional_system_parameter_defaults, args.azurite and instance == "this", + memory=memory, + memory_swap=memory_swap, + mem_swappiness=mem_swappiness, ) clusterd_image = f"materialize/clusterd:{tag}" if tag else None clusterd = create_clusterd_service( - clusterd_image, size, additional_system_parameter_defaults + clusterd_image, + size, + additional_system_parameter_defaults, + memory=memory, + memory_swap=memory_swap, + mem_swappiness=mem_swappiness, ) if tag is not None and not c.try_pull_service_image(mz): @@ -245,10 +265,18 @@ def run_one_scenario( size, additional_system_parameter_defaults, args.azurite and instance == "this", + memory=memory, + memory_swap=memory_swap, + mem_swappiness=mem_swappiness, ) clusterd_image = f"materialize/clusterd:{tag}" if tag else None clusterd = create_clusterd_service( - clusterd_image, size, additional_system_parameter_defaults + clusterd_image, + size, + additional_system_parameter_defaults, + memory=memory, + memory_swap=memory_swap, + mem_swappiness=mem_swappiness, ) start_overridden_mz_clusterd_and_cockroach( @@ -360,6 +388,9 @@ def create_mz_service( default_size: int, additional_system_parameter_defaults: dict[str, str] | None, azurite: bool, + memory: str | None = None, + memory_swap: str | None = None, + mem_swappiness: int | None = None, ) -> Materialized: return Materialized( image=mz_image, @@ -374,6 +405,9 @@ def create_mz_service( blob_store_is_azure=azurite, sanity_restart=False, support_external_clusterd=True, + memory=memory, + memory_swap=memory_swap, + mem_swappiness=mem_swappiness, ) @@ -381,8 +415,16 @@ def create_clusterd_service( clusterd_image: str | None, default_size: int, additional_system_parameter_defaults: dict[str, str] | None, + memory: str | None = None, + memory_swap: str | 
None = None, + mem_swappiness: int | None = None, ) -> Clusterd: - return Clusterd(image=clusterd_image) + return Clusterd( + image=clusterd_image, + memory=memory, + memory_swap=memory_swap, + mem_swappiness=mem_swappiness, + ) def start_overridden_mz_clusterd_and_cockroach( @@ -519,6 +561,71 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: help="SIZE use for 'THIS'", ) + parser.add_argument( + "--this-memory", + metavar="MEM", + type=str, + default=os.getenv("THIS_MEMORY", None), + help="Docker memory limit for the 'THIS' Materialized + Clusterd " + "containers (e.g. '2g', '512m'). Defaults to no limit. Useful for " + "exercising spill paths under realistic pressure.", + ) + + parser.add_argument( + "--other-memory", + metavar="MEM", + type=str, + default=os.getenv("OTHER_MEMORY", None), + help="Docker memory limit for the 'OTHER' Materialized + Clusterd containers.", + ) + + parser.add_argument( + "--this-memory-swap", + metavar="MEM", + type=str, + default=os.getenv("THIS_MEMORY_SWAP", None), + help="Total RAM + swap available to the 'THIS' Materialized + Clusterd " + "containers (e.g. '5g'). Must be >= --this-memory to enable swap. " + "Lets the host kernel swap pages instead of OOM-killing under " + "memory pressure — useful for benchmarking OS swap as a baseline " + "vs application-managed spill.", + ) + + parser.add_argument( + "--this-mem-swappiness", + metavar="N", + type=int, + default=None, + help="`mem_swappiness` (0-100) for the 'THIS' containers. Higher " + "values bias the kernel toward swapping anonymous pages aggressively " + "instead of dropping page cache. 
Default leaves Docker's default.", + ) + + parser.add_argument( + "--other-memory-swap", + metavar="MEM", + type=str, + default=os.getenv("OTHER_MEMORY_SWAP", None), + help="Total RAM + swap for the 'OTHER' containers.", + ) + + parser.add_argument( + "--other-mem-swappiness", + metavar="N", + type=int, + default=None, + help="`mem_swappiness` (0-100) for the 'OTHER' containers.", + ) + + parser.add_argument( + "--skip-other", + action=argparse.BooleanOptionalAction, + default=False, + help="Run only the 'THIS' side; skip the comparison against 'OTHER'. " + "Useful for iterating on a new scenario without re-running the " + "baseline tag every time.", + ) + parser.add_argument( "--ignore-other-tag-missing", action=argparse.BooleanOptionalAction, From ecd5a88e11c03a9e0ea931120ee0652f9bffc9c5 Mon Sep 17 00:00:00 2001 From: Dov Alperin Date: Wed, 20 May 2026 15:13:24 -0400 Subject: [PATCH 34/34] lint fixes --- misc/python/materialize/mzcompose/__init__.py | 3 ++ misc/python/materialize/mzcompose/service.py | 7 +++ .../materialize/parallel_workload/action.py | 3 ++ src/compute/src/compute_state.rs | 11 ++-- src/compute/src/row_spine.rs | 5 +- src/ore/src/pager/swap.rs | 5 -- .../benches/columnar_merge_batcher.rs | 4 ++ .../examples/column_paged_spill.rs | 17 +++--- src/timely-util/src/column_pager.rs | 6 +-- src/timely-util/src/columnar/merge_batcher.rs | 54 ++++++++++--------- 10 files changed, 67 insertions(+), 48 deletions(-) diff --git a/misc/python/materialize/mzcompose/__init__.py b/misc/python/materialize/mzcompose/__init__.py index 49b4c731dfe26..fb2468c755bdf 100644 --- a/misc/python/materialize/mzcompose/__init__.py +++ b/misc/python/materialize/mzcompose/__init__.py @@ -667,6 +667,9 @@ def get_default_system_parameters( "enable_mcp_developer", "mcp_max_response_size", "user_id_pool_batch_size", + "enable_column_paged_batcher", + "column_paged_batcher_budget_fraction", + "column_paged_batcher_backend", ] diff --git 
a/misc/python/materialize/mzcompose/service.py b/misc/python/materialize/mzcompose/service.py index 3d5d4e4c38b0c..c07535c2e9a08 100644 --- a/misc/python/materialize/mzcompose/service.py +++ b/misc/python/materialize/mzcompose/service.py @@ -147,6 +147,13 @@ class ServiceConfig(TypedDict, total=False): TODO(benesch): this should use a nested TypedDict. """ + memswap_limit: str | int + """Total memory limit (memory + swap). Set greater than the memory limit to + enable host swap usage under memory pressure.""" + + mem_swappiness: int + """Kernel swappiness for the container (0-100).""" + ulimits: dict[str, Any] """Override the default ulimits for a container.""" diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index a693d07fecf7e..d762198ab9e72 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -1822,6 +1822,9 @@ def __init__( "oidc_group_role_sync_strict", "console_oidc_client_id", "console_oidc_scopes", + "enable_column_paged_batcher", + "column_paged_batcher_budget_fraction", + "column_paged_batcher_backend", ] def run(self, exe: Executor) -> bool: diff --git a/src/compute/src/compute_state.rs b/src/compute/src/compute_state.rs index 3d9ab30df383c..43d3bbbca669e 100644 --- a/src/compute/src/compute_state.rs +++ b/src/compute/src/compute_state.rs @@ -36,7 +36,7 @@ use mz_compute_types::plan::render_plan::RenderPlan; use mz_dyncfg::ConfigSet; use mz_expr::row::RowCollection; use mz_expr::{RowComparator, SafeMfpPlan}; -use mz_ore::cast::CastFrom; +use mz_ore::cast::{CastFrom, CastLossy}; use mz_ore::collections::CollectionExt; use mz_ore::metrics::{MetricsRegistry, UIntGauge}; use mz_ore::now::EpochMillis; @@ -315,12 +315,13 @@ impl ComputeState { // assumption if no limit was announced (e.g. dev environments). 
const MIB: usize = 1024 * 1024; const DEFAULT_MEM_LIMIT: usize = 4 * 1024 * MIB; - let mem_limit = crate::memory_limiter::get_memory_limit() - .unwrap_or(DEFAULT_MEM_LIMIT); + let mem_limit = crate::memory_limiter::get_memory_limit().unwrap_or(DEFAULT_MEM_LIMIT); let fraction = COLUMN_PAGED_BATCHER_BUDGET_FRACTION.get(config).max(0.0); - let total = ((mem_limit as f64) * fraction) as usize; + let total = usize::cast_lossy(f64::cast_lossy(mem_limit) * fraction); let per_worker = (total / 8).clamp(16 * MIB, 64 * MIB); - let shared = total.saturating_sub(per_worker).clamp(128 * MIB, 1024 * MIB); + let shared = total + .saturating_sub(per_worker) + .clamp(128 * MIB, 1024 * MIB); let backend_str = COLUMN_PAGED_BATCHER_BACKEND.get(config); let backend = match backend_str.as_str() { diff --git a/src/compute/src/row_spine.rs b/src/compute/src/row_spine.rs index c731f39aabc95..792ff5beacbb1 100644 --- a/src/compute/src/row_spine.rs +++ b/src/compute/src/row_spine.rs @@ -44,9 +44,8 @@ mod spines { /// [`Col2ValPagedBatcher`] for the spillable arrange path. 
/// /// [`Col2ValPagedBatcher`]: mz_timely_util::columnar::Col2ValPagedBatcher - pub type RowRowColPagedBuilder = RcBuilder< - OrdValBuilder, Column<((Row, Row), T, R)>>, - >; + pub type RowRowColPagedBuilder = + RcBuilder, Column<((Row, Row), T, R)>>>; pub type RowValSpine = Spine>>>; pub type RowValBatcher = KeyValBatcher; diff --git a/src/ore/src/pager/swap.rs b/src/ore/src/pager/swap.rs index 9c87cb24c20fa..c70d6599b3c03 100644 --- a/src/ore/src/pager/swap.rs +++ b/src/ore/src/pager/swap.rs @@ -93,11 +93,6 @@ fn page_size() -> usize { usize::try_from(raw).expect("page size is positive and fits usize") } -#[cfg(not(target_os = "linux"))] -fn page_size() -> usize { - 4096 -} - pub(crate) fn read_at_swap(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec) { let inner = handle .swap_inner() diff --git a/src/timely-util/benches/columnar_merge_batcher.rs b/src/timely-util/benches/columnar_merge_batcher.rs index 78a6cae60a76f..7ab8971f9e3c8 100644 --- a/src/timely-util/benches/columnar_merge_batcher.rs +++ b/src/timely-util/benches/columnar_merge_batcher.rs @@ -7,6 +7,10 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +// Equal-length `Iterator::zip` is fine for bench-output formatting; pulling in +// itertools just for `zip_eq` isn't worth it here. +#![allow(clippy::disallowed_methods)] + //! Microbenchmark comparing the legacy column-backed `ColumnMerger` against //! the new pageable `ColumnMergeBatcher` driver on the merge-batcher's hot //! path. diff --git a/src/timely-util/examples/column_paged_spill.rs b/src/timely-util/examples/column_paged_spill.rs index c755795dd2218..d3946b69df93b 100644 --- a/src/timely-util/examples/column_paged_spill.rs +++ b/src/timely-util/examples/column_paged_spill.rs @@ -29,6 +29,12 @@ //! --sample-secs 30 //! 
``` +#![allow(clippy::as_conversions)] +// Example deliberately exercises differential's `arrange_core` directly to +// drive the column-paged batcher in isolation; the `MzArrange` wrapper in +// `mz-compute` would pull in the whole compute crate just for a benchmark. +#![allow(clippy::disallowed_methods)] + use std::rc::Rc; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; @@ -42,8 +48,8 @@ use differential_dataflow::trace::rc_blanket_impls::RcBuilder; use mz_ore::pager::{self, Backend}; use mz_timely_util::column_pager::policy::TieredPolicy; use mz_timely_util::column_pager::{ColumnPager, set_global_pager}; -use mz_timely_util::columnar::Column; use mz_timely_util::columnar::Col2ValPagedBatcher; +use mz_timely_util::columnar::Column; use mz_timely_util::columnar::builder::ColumnBuilder; use timely::dataflow::InputHandle; use timely::dataflow::channels::pact::Pipeline; @@ -91,7 +97,7 @@ fn run_dataflow(cfg: &Config, label: &str) -> Duration { // RSS sampler thread. `ps -o rss=` is portable across Linux + macOS // and doesn't add a dep just to read /proc/self/status. 
let sampler = if cfg.sample_secs > 0 { - let stop = stop.clone(); + let stop = Arc::clone(&stop); let label_owned = label.to_string(); let interval = Duration::from_secs(cfg.sample_secs); let start = Instant::now(); @@ -120,9 +126,8 @@ fn run_dataflow(cfg: &Config, label: &str) -> Duration { let index = worker.index(); let peers = worker.peers(); - let mut input = - >>::new_with_builder(); - let mut probe: ProbeHandle = ProbeHandle::new(); + let mut input = >>::new_with_builder(); + let probe: ProbeHandle = ProbeHandle::new(); worker.dataflow::(|scope| { let stream = scope.input_from(&mut input); @@ -131,7 +136,7 @@ fn run_dataflow(cfg: &Config, label: &str) -> Duration { Pipeline, "ColumnPagedSpillArrange", ); - arranged.stream.probe_with(&mut probe); + arranged.stream.probe_with(&probe); }); // Push positives then negatives at the same logical time so they diff --git a/src/timely-util/src/column_pager.rs b/src/timely-util/src/column_pager.rs index 935778d5434cc..b1b743aa10f32 100644 --- a/src/timely-util/src/column_pager.rs +++ b/src/timely-util/src/column_pager.rs @@ -311,14 +311,10 @@ fn record_decision(paged: bool, bytes: usize) { /// Returns the current global pager. Cheap: clones the inner `Arc`. pub fn global_pager() -> ColumnPager { - GLOBAL_PAGER - .read() - .expect("global pager poisoned") - .clone() + GLOBAL_PAGER.read().expect("global pager poisoned").clone() } impl ColumnPager { - /// Drains `col` into a [`PagedColumn`]. After return `col` is left as a /// fresh `Column::default()` (typed, empty), ready to be refilled by the /// caller on the next loop iteration. 
diff --git a/src/timely-util/src/columnar/merge_batcher.rs b/src/timely-util/src/columnar/merge_batcher.rs index 9c32481207b03..98c5022b79024 100644 --- a/src/timely-util/src/columnar/merge_batcher.rs +++ b/src/timely-util/src/columnar/merge_batcher.rs @@ -311,7 +311,7 @@ where /// per chunk produced, so the result chain holds `PagedColumn`s and the /// caller never sees a fully materialized merge result. fn merge_by( - &mut self, + &self, a: VecDeque>, b: VecDeque>, ) -> VecDeque> { @@ -449,8 +449,22 @@ pub fn merge_chains( // Drain remaining: copy partial head through `merge_from`'s 1-input // dispatch, then hand the rest of the chain's `PagedColumn`s straight to // the sink without materializing. - drain_side(&mut heads[0], &mut positions[0], list1, &mut result, &mut sink, pager); - drain_side(&mut heads[1], &mut positions[1], list2, &mut result, &mut sink, pager); + drain_side( + &mut heads[0], + &mut positions[0], + list1, + &mut result, + &mut sink, + pager, + ); + drain_side( + &mut heads[1], + &mut positions[1], + list2, + &mut result, + &mut sink, + pager, + ); if !result.is_empty() { sink(pager.page(&mut result)); @@ -655,17 +669,15 @@ mod tests { } /// Wrap a Vec as a paged chain for `FetchIter`. - fn to_chain(cols: Vec>, pager: &ColumnPager) -> VecDeque> { - cols.into_iter() - .map(|mut c| pager.page(&mut c)) - .collect() + fn to_chain( + cols: Vec>, + pager: &ColumnPager, + ) -> VecDeque> { + cols.into_iter().map(|mut c| pager.page(&mut c)).collect() } /// Drive `merge_chains` with a disabled pager and return owned tuples. 
- fn drive_merge( - chain1: Vec>, - chain2: Vec>, - ) -> Vec { + fn drive_merge(chain1: Vec>, chain2: Vec>) -> Vec { let pager = ColumnPager::disabled(); let q1 = to_chain(chain1, &pager); let q2 = to_chain(chain2, &pager); @@ -728,10 +740,7 @@ mod tests { vec![col(&[((0, 0), 0, 1), ((5, 0), 0, 1)])], vec![col(&[((5, 0), 0, 1), ((10, 0), 0, 1)])], ); - assert_eq!( - out, - vec![((0, 0), 0, 1), ((5, 0), 0, 2), ((10, 0), 0, 1)] - ); + assert_eq!(out, vec![((0, 0), 0, 1), ((5, 0), 0, 2), ((10, 0), 0, 1)]); } /// Same merge, force-paged: chains stay in `Paged` form throughout, and @@ -740,14 +749,8 @@ mod tests { fn merge_chains_force_paged_round_trip() { let policy = ForcePagePolicy::new(); let pager = ColumnPager::new(policy.clone()); - let q1 = to_chain( - vec![col(&[((0, 0), 0, 1), ((2, 0), 0, 1)])], - &pager, - ); - let q2 = to_chain( - vec![col(&[((1, 0), 0, 1), ((3, 0), 0, 1)])], - &pager, - ); + let q1 = to_chain(vec![col(&[((0, 0), 0, 1), ((2, 0), 0, 1)])], &pager); + let q2 = to_chain(vec![col(&[((1, 0), 0, 1), ((3, 0), 0, 1)])], &pager); // Confirm the chains started paged-out (not Resident). assert!(matches!(q1.front().unwrap(), PagedColumn::Paged { .. })); @@ -825,7 +828,10 @@ mod tests { Self } fn push(&mut self, _chunk: &mut Self::Input) {} - fn done(self, _description: differential_dataflow::trace::Description) -> Self::Output { + fn done( + self, + _description: differential_dataflow::trace::Description, + ) -> Self::Output { Vec::new() } fn seal(