From a8cb6a125610d0729575841a5574f264a22f1f38 Mon Sep 17 00:00:00 2001 From: Tom Wambsgans Date: Wed, 6 May 2026 23:45:07 +0200 Subject: [PATCH] fix: safe arena routing (size-routing + sticky-System realloc) (mirrors https://github.com/Barnadrot/zk-alloc/pull/9) (replaces the "flush_rayon" ugly fix) Co-authored-by: Barnadrot --- Cargo.lock | 1 - crates/backend/system-info/Cargo.toml | 1 - crates/backend/system-info/src/lib.rs | 33 -------------------- crates/backend/zk-alloc/src/lib.rs | 34 ++++++++++++++++++--- crates/backend/zk-alloc/tests/test_rayon.rs | 4 ++- 5 files changed, 33 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d938586b8..014747f9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1116,7 +1116,6 @@ name = "system-info" version = "0.1.0" dependencies = [ "libc", - "rayon", ] [[package]] diff --git a/crates/backend/system-info/Cargo.toml b/crates/backend/system-info/Cargo.toml index c63ee1297..862e36e89 100644 --- a/crates/backend/system-info/Cargo.toml +++ b/crates/backend/system-info/Cargo.toml @@ -5,7 +5,6 @@ edition.workspace = true [dependencies] libc = "0.2" -rayon.workspace = true [lints] workspace = true diff --git a/crates/backend/system-info/src/lib.rs b/crates/backend/system-info/src/lib.rs index 07180559b..5323c1ce4 100644 --- a/crates/backend/system-info/src/lib.rs +++ b/crates/backend/system-info/src/lib.rs @@ -9,36 +9,3 @@ pub fn peak_rss_bytes() -> u64 { // ru_maxrss unit: bytes on macOS, KiB on Linux. if cfg!(target_os = "macos") { max } else { max * 1024 } } - -/// Number of jobs [`flush_rayon`] pushes. Must exceed -/// `crossbeam_deque::deque::BLOCK_CAP` (currently 63 — -/// `crossbeam-deque-0.8.6/src/deque.rs:1191`). -const RAYON_FLUSH_JOBS: usize = 256; - -/// Drain rayon's internal queues so they release any storage allocated during the -/// previous phase. -/// -/// Rayon's global pool owns a `crossbeam_deque::Injector`, internally a linked list -/// of fixed-size blocks (`Block` and `Injector::push` — -/// `crossbeam-deque-0.8.6/src/deque.rs:1219` and `:1371`). A block is freed only -/// once its last slot has been consumed. -/// -/// `rayon::join` from a non-worker thread reaches that injector via -/// `join` (`rayon-core-1.13.0/src/join/mod.rs:132`) -> -/// `registry::in_worker` (`registry.rs:946`) -> -/// `Registry::in_worker_cold` (`:517`) -> -/// `Registry::inject` (`:428`) -> `Injector::push`. -/// -/// Under an arena allocator that recycles memory between phases (e.g. `zk-alloc`), -/// a block allocated *during* a phase points into a slab the next `begin_phase()` -/// will reuse. The next push then writes a `JobRef` straight through whatever the -/// application has placed on top, silently corrupting it. -/// -/// Pushing more than `BLOCK_CAP` jobs while the arena is off forces the Injector -/// to allocate a fresh tail block (which lands in System), and forces workers to -/// steal the last slot of every preceding block (which destroys them). -pub fn flush_rayon() { - for _ in 0..RAYON_FLUSH_JOBS { - rayon::join(|| {}, || {}); - } -} diff --git a/crates/backend/zk-alloc/src/lib.rs b/crates/backend/zk-alloc/src/lib.rs index cae70642f..39350ef90 100644 --- a/crates/backend/zk-alloc/src/lib.rs +++ b/crates/backend/zk-alloc/src/lib.rs @@ -30,6 +30,16 @@ const SLACK: usize = 4; // SLACK absorbs the main thread and any non-rayon helpe const MAX_THREADS: usize = NUM_THREADS + SLACK; const REGION_SIZE: usize = SLAB_SIZE * MAX_THREADS; +/// Allocations smaller than this go to System even during active phases. +/// Routes registry / hashmap / injector-block-sized allocations away from the +/// arena, so library state that outlives a phase doesn't land in recycled +/// memory. Covers the known phase-crossing patterns: crossbeam_deque::Injector +/// blocks (~1.5 KB), tracing-subscriber Registry slot data (sub-KB), hashbrown +/// HashMap entries (sub-KB), rayon-core job stack frames (sub-KB). +/// +/// TODO is there a cleaner way? +const MIN_ARENA_BYTES: usize = 4096; + #[derive(Debug)] pub struct ZkAllocator; @@ -106,12 +116,8 @@ pub fn begin_phase() { /// Deactivates the arena. New allocations go to the system allocator; existing arena /// pointers stay valid until the next `begin_phase()` resets the slabs. -/// -/// Also calls [`system_info::flush_rayon`] to release any rayon/crossbeam storage -/// still referencing this phase's arena memory. pub fn end_phase() { ARENA_ACTIVE.store(false, Ordering::Release); - system_info::flush_rayon(); } #[cold] @@ -152,6 +158,15 @@ unsafe impl GlobalAlloc for ZkAllocator { #[inline(always)] unsafe fn alloc(&self, layout: Layout) -> *mut u8 { if ARENA_ACTIVE.load(Ordering::Relaxed) { + // Small allocs bypass arena: registry slots / HashMap entries / + // injector-block-sized allocations from rayon/tracing libraries + // commonly outlive a phase. Routing them to System keeps them + // safe across begin_phase()/end_phase() boundaries. + // + // TODO is there a cleaner way? + if layout.size() < MIN_ARENA_BYTES { + return unsafe { std::alloc::System.alloc(layout) }; + } let generation = GENERATION.load(Ordering::Relaxed); if ARENA_GEN.get() == generation { let align = layout.align(); @@ -182,6 +197,17 @@ unsafe impl GlobalAlloc for ZkAllocator { if new_size <= layout.size() { return ptr; } + // Sticky-System routing: if the original allocation came from System + // (small, or pre-phase, or routed by size-routing), keep the grown + // allocation in System too. Without this, a Vec allocated outside a + // phase that grows inside one would silently migrate into the arena + // and become subject to phase recycling. + let addr = ptr as usize; + let base = REGION_BASE.load(Ordering::Relaxed); + let in_arena = base != 0 && addr >= base && addr < base + REGION_SIZE; + if !in_arena { + return unsafe { std::alloc::System.realloc(ptr, layout, new_size) }; + } // SAFETY: new_size > layout.size() > 0, align unchanged from valid layout. let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) }; let new_ptr = unsafe { self.alloc(new_layout) }; diff --git a/crates/backend/zk-alloc/tests/test_rayon.rs b/crates/backend/zk-alloc/tests/test_rayon.rs index ae084af21..eebcfe5cf 100644 --- a/crates/backend/zk-alloc/tests/test_rayon.rs +++ b/crates/backend/zk-alloc/tests/test_rayon.rs @@ -1,4 +1,6 @@ -//! Regression test for the bug prevented by `system_info::flush_rayon`. +//! Regression test for arena/rayon corruption: rayon's `crossbeam_deque::Injector` +//! blocks (~1.5 KB) used to land in the arena and outlive a phase. Now prevented +//! by `MIN_ARENA_BYTES` size-routing in `ZkAllocator::alloc`. use rayon::prelude::*;