From cbeb341fe061345e5a4bcd7e9734ca795f7591b7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 May 2026 21:23:12 +0900 Subject: [PATCH 1/4] arena: multi-page incremental allocator + one-pass ToT builder Replace the one-shot Arena (a single slab sized by a pre-walk of every inner range) with a multi-page bump allocator, so an arena-backed ToT outer tile no longer has to know all inner-tensor sizes before construction. Arena (arena.h): - A growing list of pages; each page buffer is a stable heap block, so the raw Cell* of every ArenaTensor view stays valid as pages are added. claim_bytes() bumps the current page and appends a fresh one on a miss; a request larger than a page (or needing stricter alignment than a standard page provides) gets its own dedicated, exactly-sized page. - reserve_page() lays down a single exact page -- the up-front path uses it so kernel/einsum-built tiles keep their one contiguous slab, byte-identical to before. - Page size is a knob (TILEDARRAY_ARENA_PAGE_BYTES). Construction is single-threaded by contract (one task per outer tile); the bump path is intentionally unsynchronized. - Drop the unused plan()/ArenaPlan helpers. arena_kernels.h: - arena_outer_init reimplemented against the new Arena (reserve_page + sequential claim_bytes); signature unchanged, so einsum/tensor.h callers are untouched. - ArenaToTBuilder: one-pass incremental construction -- the caller discovers each inner range and fills the returned cell in a single step, driving its own loop. A cell larger than a page and a single-cell tile both route to an exactly-sized dedicated page. - arena_compact: coalesce a multi-page incrementally-built tile into one contiguous slab. Tests: rewrite tests/arena.cpp for the new API (page rollover, oversized/dedicated pages, single exact page, aliasing survival); add ArenaToTBuilder + arena_compact coverage for both TA::Tensor and ArenaTensor inner cells. 
--- src/TiledArray/tensor/arena.h | 223 +++++++++++++++++--------- src/TiledArray/tensor/arena_kernels.h | 146 ++++++++++++++--- tests/arena.cpp | 168 ++++++++++--------- tests/arena_kernels.cpp | 58 +++++++ tests/arena_tensor_kernels.cpp | 58 +++++++ 5 files changed, 473 insertions(+), 180 deletions(-) diff --git a/src/TiledArray/tensor/arena.h b/src/TiledArray/tensor/arena.h index b37b962436..f045d87983 100644 --- a/src/TiledArray/tensor/arena.h +++ b/src/TiledArray/tensor/arena.h @@ -5,6 +5,7 @@ #include "TiledArray/config.h" #include "TiledArray/error.h" +#include #include #include #include @@ -22,13 +23,55 @@ inline bool& arena_disabled() { return flag; } -/// One-shot bump allocator; slab is co-owned via aliasing shared_ptrs. +/// Default size, in bytes, of a standard arena page. Inner ToT tensors are +/// small (tens to hundreds of elements), so a page packs many of them; the +/// default amortizes the per-page allocation and inter-cell alignment +/// padding. Override at configure time by defining TILEDARRAY_ARENA_PAGE_BYTES. +#ifndef TILEDARRAY_ARENA_PAGE_BYTES +#define TILEDARRAY_ARENA_PAGE_BYTES (64 * 1024) +#endif +inline constexpr std::size_t kArenaDefaultPageBytes = + TILEDARRAY_ARENA_PAGE_BYTES; + +/// Cache-line-floor alignment; also the alignment every standard page base is +/// allocated to, so any per-cell alignment up to this value is satisfied +/// without a dedicated page. +inline constexpr std::size_t kArenaCachelineAlign = 128; + +/// Round bytes up to a power-of-two alignment. +inline std::size_t arena_align_up(std::size_t bytes, + std::size_t alignment) noexcept { + return (bytes + alignment - 1) & ~(alignment - 1); +} + +/// Incremental, multi-page bump allocator backing `Tensor` outer +/// tiles. 
Memory is handed out from a growing list of pages; each page is a +/// stable heap block (the page list may grow, but a page's buffer never +/// moves), which is what keeps the raw `Cell*` of every `ArenaTensor` view +/// valid for the arena's lifetime. Pages are co-owned with the cells via +/// aliasing `shared_ptr`s. +/// +/// Two ways to drive it: +/// - up-front: the total size is known, so `reserve_page(total, ...)` lays +/// down a single exact page and subsequent `claim`s pack into it (one +/// contiguous slab, zero tail waste -- what the kernels and einsum use); +/// - incremental: `claim` cells one at a time as their sizes are discovered; +/// a fresh page is appended whenever the current one cannot satisfy a +/// request. A cell larger than a page gets its own dedicated, exactly +/// sized page (which never becomes the bump target). +/// +/// Thread-safety: a single `Arena` is built by a single thread. ToT outer +/// tiles are produced one task per tile (`init_tiles` / kernels), each with +/// its own `Arena`, so the bump path is deliberately not synchronized. class Arena { public: explicit Arena( - std::pmr::memory_resource* mr = std::pmr::new_delete_resource()) noexcept - : resource_(mr) { + std::pmr::memory_resource* mr = std::pmr::new_delete_resource(), + std::size_t page_size = kArenaDefaultPageBytes, + bool zero_init = false) noexcept + : resource_(mr), page_size_(page_size), zero_init_(zero_init) { TA_ASSERT(resource_ != nullptr); + TA_ASSERT(page_size_ > 0); } Arena(const Arena&) = delete; @@ -37,97 +80,118 @@ class Arena { Arena& operator=(Arena&&) noexcept = default; ~Arena() = default; - /// Allocate the slab once; zero_init clears it for accumulation kernels. - /// `alignment` (default `alignof(std::max_align_t)`) is the alignment of - /// the slab base; pass a larger power-of-two when callers need SIMD-aligned - /// element pointers at known interior offsets. 
- void reserve(std::size_t bytes, bool zero_init = false, - std::size_t alignment = alignof(std::max_align_t)) { - TA_ASSERT(capacity_ == 0); + /// Append a page sized exactly `bytes` (the page base is aligned to + /// `alignment`, bumped to at least `max_align_t`) and make it the current + /// bump page. Used by the up-front path with `bytes` set to the known + /// total, and for a single-cell tile so the lone page is exactly sized. + void reserve_page(std::size_t bytes, std::size_t alignment) { TA_ASSERT(bytes > 0); - TA_ASSERT(alignment >= alignof(std::max_align_t)); - TA_ASSERT((alignment & (alignment - 1)) == 0); - void* raw = resource_->allocate(bytes, alignment); - auto* mr = resource_; - auto deleter = [mr, bytes, alignment](std::byte* p) noexcept { - mr->deallocate(p, bytes, alignment); - }; - slab_ = std::shared_ptr(static_cast(raw), - std::move(deleter)); - capacity_ = bytes; - cursor_ = 0; - if (zero_init) std::memset(slab_.get(), 0, bytes); + std::size_t a = alignment > alignof(std::max_align_t) + ? alignment + : alignof(std::max_align_t); + TA_ASSERT((a & (a - 1)) == 0); + add_page(bytes, a); + current_ = pages_.size() - 1; } - /// Aliasing view at a caller-aligned offset. - template - std::shared_ptr slice(std::size_t offset, std::size_t /*n_elem*/) const { - TA_ASSERT(slab_); - TA_ASSERT(offset % alignof(T) == 0); - TA_ASSERT(offset <= capacity_); - auto* p = reinterpret_cast(slab_.get() + offset); - return std::shared_ptr(slab_, p); + /// Bump-allocate `bytes` aligned to `alignment`. Tries the current page; + /// on a miss appends a fresh page -- a dedicated, exactly sized page when + /// the request exceeds the page size (or needs more alignment than a + /// standard page base provides), otherwise a standard page that becomes + /// the new bump target. The returned handle aliases the owning page's + /// buffer, so it keeps that page alive on its own. 
+ std::shared_ptr claim_bytes(std::size_t bytes, + std::size_t alignment) { + TA_ASSERT(bytes > 0); + TA_ASSERT(alignment > 0 && (alignment & (alignment - 1)) == 0); + + if (current_ != kNoPage) { + Page& p = pages_[current_]; + const auto base = reinterpret_cast(p.buffer.get()); + const auto cur = base + p.cursor; + const auto aligned = + (cur + alignment - 1) & ~(std::uintptr_t(alignment) - 1); + const std::size_t pad = static_cast(aligned - cur); + if (pad + bytes <= p.capacity - p.cursor) { + p.cursor += pad + bytes; + bytes_allocated_ += bytes; + return std::shared_ptr( + p.buffer, reinterpret_cast(aligned)); + } + } + + // Need a fresh page. A page base is at least `kArenaCachelineAlign`- + // aligned and a fresh cursor is 0, so a standard page needs no padding; + // an over-large request, or one needing finer alignment than a standard + // page base, gets a dedicated exactly-sized page. + if (bytes > page_size_ || alignment > kArenaCachelineAlign) { + std::size_t a = + alignment > kArenaCachelineAlign ? alignment : kArenaCachelineAlign; + Page& d = add_page(bytes, a); + d.cursor = bytes; // dedicated: full after this one claim + bytes_allocated_ += bytes; + // A dedicated page is never the bump target; `current_` is unchanged. + return std::shared_ptr(d.buffer, d.buffer.get()); + } + + Page& p = add_page(page_size_, kArenaCachelineAlign); + current_ = pages_.size() - 1; + p.cursor = bytes; + bytes_allocated_ += bytes; + return std::shared_ptr(p.buffer, p.buffer.get()); } - /// Bump-allocate n elements of T; result is T-aligned. + /// Typed bump-allocate of `n` elements of `T`; result is `T`-aligned. 
template std::shared_ptr claim(std::size_t n) { - TA_ASSERT(slab_); - auto base = reinterpret_cast(slab_.get() + cursor_); - auto aligned = (base + alignof(T) - 1) & ~(alignof(T) - 1); - std::size_t pad = static_cast(aligned - base); - std::size_t consumed = pad + n * sizeof(T); - TA_ASSERT(cursor_ + consumed <= capacity_); - cursor_ += consumed; - return std::shared_ptr(slab_, reinterpret_cast(aligned)); + auto h = claim_bytes(n * sizeof(T), alignof(T)); + return std::shared_ptr(h, reinterpret_cast(h.get())); } - std::size_t capacity() const noexcept { return capacity_; } - std::size_t cursor() const noexcept { return cursor_; } - std::size_t remaining() const noexcept { return capacity_ - cursor_; } - bool empty() const noexcept { return cursor_ == 0; } + std::size_t page_count() const noexcept { return pages_.size(); } + std::size_t bytes_allocated() const noexcept { return bytes_allocated_; } + std::size_t bytes_reserved() const noexcept { + std::size_t s = 0; + for (const auto& p : pages_) s += p.capacity; + return s; + } + bool empty() const noexcept { return bytes_allocated_ == 0; } + std::size_t page_size() const noexcept { return page_size_; } std::pmr::memory_resource* resource() const noexcept { return resource_; } private: - std::pmr::memory_resource* resource_; - std::shared_ptr slab_; - std::size_t capacity_ = 0; - std::size_t cursor_ = 0; -}; - -/// Per-cell offsets and total slab size produced by plan(). -struct ArenaPlan { - std::vector offsets; - std::size_t total_bytes = 0; -}; - -/// Cache-line-floor alignment used by production callers. -inline constexpr std::size_t kArenaCachelineAlign = 128; + struct Page { + std::shared_ptr buffer; + std::size_t capacity = 0; + std::size_t cursor = 0; + }; -/// Round bytes up to a power-of-two alignment. 
-inline std::size_t arena_align_up(std::size_t bytes, - std::size_t alignment) noexcept { - return (bytes + alignment - 1) & ~(alignment - 1); -} + static constexpr std::size_t kNoPage = static_cast(-1); -/// Pre-walk cells once to compute offsets and total bytes. -template -ArenaPlan plan(std::size_t N_cells, ShapeFn&& shape_fn, - std::size_t element_size, std::size_t alignment) { - ArenaPlan out; - out.offsets.resize(N_cells); - std::size_t total = 0; - for (std::size_t ord = 0; ord < N_cells; ++ord) { - out.offsets[ord] = total; - auto&& r = shape_fn(ord); - std::size_t bytes = r.volume() * element_size; - total += arena_align_up(bytes, alignment); + Page& add_page(std::size_t capacity, std::size_t alignment) { + void* raw = resource_->allocate(capacity, alignment); + auto* mr = resource_; + auto deleter = [mr, capacity, alignment](std::byte* p) noexcept { + mr->deallocate(p, capacity, alignment); + }; + Page pg; + pg.buffer = std::shared_ptr(static_cast(raw), + std::move(deleter)); + pg.capacity = capacity; + if (zero_init_) std::memset(pg.buffer.get(), 0, capacity); + pages_.push_back(std::move(pg)); + return pages_.back(); } - out.total_bytes = total; - return out; -} -/// PMR adapter over an Arena; deallocation is a no-op (slab-owned lifetime). + std::pmr::memory_resource* resource_; + std::size_t page_size_; + bool zero_init_; + std::vector pages_; + std::size_t current_ = kNoPage; // index of the current bump page + std::size_t bytes_allocated_ = 0; +}; + +/// PMR adapter over an Arena; deallocation is a no-op (page-owned lifetime). 
class ArenaResource final : public std::pmr::memory_resource { public: explicit ArenaResource(Arena* arena) noexcept : arena_(arena) { @@ -138,8 +202,7 @@ class ArenaResource final : public std::pmr::memory_resource { protected: void* do_allocate(std::size_t bytes, std::size_t alignment) override { - auto h = arena_->claim(arena_align_up(bytes, alignment)); - return h.get(); + return arena_->claim_bytes(bytes, alignment).get(); } void do_deallocate(void* /*p*/, std::size_t /*bytes*/, diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h index 8dcd97c870..b74384e57f 100644 --- a/src/TiledArray/tensor/arena_kernels.h +++ b/src/TiledArray/tensor/arena_kernels.h @@ -16,8 +16,10 @@ #include "TiledArray/tensor/arena.h" #include "TiledArray/tensor/arena_tensor.h" +#include #include #include +#include #include #include #include @@ -49,15 +51,20 @@ std::shared_ptr make_outer_data( } // namespace -/// Allocate a slab-backed ToT outer tile with caller-provided inner ranges. +/// Allocate an arena-backed ToT outer tile with caller-provided inner ranges. /// /// `inner_range_fn(cell_ordinal)` -> inner `range_type` for each cell ordinal /// in `[0, outer_range.volume() * batch_sz)`; a zero-volume range yields a -/// deliberately-null inner cell that consumes no slab bytes. Element storage +/// deliberately-null inner cell that consumes no arena bytes. Element storage /// is left zero-initialized when `zero_init` is true. `cell_stride_align` is /// the minimum byte stride between adjacent cells; it is bumped up to the /// inner type's natural alignment (`ArenaTensor::cell_alignment()`, or /// `alignof(T)` for `TA::Tensor` inners). +/// +/// This is the *up-front* path: all inner ranges are known, so the total is +/// pre-walked and laid down as a single exactly-sized arena page -- one +/// contiguous slab, no page-tail waste. For one-pass construction where the +/// inner sizes are discovered incrementally, use `ArenaToTBuilder`. 
template OuterTensor arena_outer_init( const typename OuterTensor::range_type& outer_range, std::size_t batch_sz, @@ -75,24 +82,19 @@ OuterTensor arena_outer_init( } else { if (alignof(T) > stride) stride = alignof(T); } - // Cells pack at `stride` granularity, but the slab base handed to - // `Arena::reserve` must be at least `max_align_t`-aligned. + // Cells pack at `stride` granularity; the page base must be at least + // `max_align_t`-aligned. const std::size_t slab_align = stride > alignof(std::max_align_t) ? stride : alignof(std::max_align_t); const std::size_t N_cells = outer_range.volume() * batch_sz; - constexpr std::size_t kNull = static_cast(-1); std::vector ranges; ranges.reserve(N_cells); - std::vector offsets(N_cells, 0); std::size_t total = 0; for (std::size_t ord = 0; ord < N_cells; ++ord) { ranges.emplace_back(inner_range_fn(ord)); const std::size_t vol = ranges.back().volume(); - if (vol == 0) { - offsets[ord] = kNull; - } else { - offsets[ord] = total; + if (vol != 0) { // `if constexpr`, not a ternary: `InnerT::cell_size` does not exist for // a `TA::Tensor` inner, so the non-arena branch must not be formed. std::size_t bytes; @@ -104,15 +106,19 @@ OuterTensor arena_outer_init( } } - auto arena_slab = std::make_shared(); - if (total > 0) arena_slab->reserve(total, zero_init, slab_align); - auto data = make_outer_data(N_cells, arena_slab, + auto arena_ptr = std::make_shared(std::pmr::new_delete_resource(), + kArenaDefaultPageBytes, zero_init); + // One exact page holds every cell -- subsequent `claim_bytes` calls pack + // into it in order, reproducing the old single-slab layout. 
+ if (total > 0) arena_ptr->reserve_page(total, slab_align); + auto data = make_outer_data(N_cells, arena_ptr, std::shared_ptr{}); OuterTensor result(outer_range, batch_sz, std::move(data)); for (std::size_t ord = 0; ord < N_cells; ++ord) { auto& r = ranges[ord]; - if (offsets[ord] == kNull) { + const std::size_t vol = r.volume(); + if (vol == 0) { if constexpr (arena) { ::new (result.data() + ord) InnerT(); } else { @@ -123,15 +129,13 @@ OuterTensor arena_outer_init( ::new (result.data() + ord) InnerT(r); } } else if constexpr (arena) { - // slice(offset, 1) returns an aliased shared_ptr; we only - // need its raw pointer to placement-new the Cell -- the slab's lifetime - // is held by `arena_handle` captured in the outer's deleter. - auto byte_view = arena_slab->template slice(offsets[ord], 1); + auto h = arena_ptr->claim_bytes(InnerT::cell_size(vol), stride); ::new (result.data() + ord) - InnerT(make_arena_tensor_in(byte_view.get(), std::move(r))); + InnerT(make_arena_tensor_in(h.get(), std::move(r))); } else { - auto elem_data = arena_slab->template slice(offsets[ord], r.volume()); - ::new (result.data() + ord) InnerT(r, std::move(elem_data)); + auto h = arena_ptr->claim_bytes(vol * sizeof(T), stride); + ::new (result.data() + ord) + InnerT(r, std::shared_ptr(h, reinterpret_cast(h.get()))); } } return result; @@ -170,6 +174,93 @@ OuterTensor make_nested_tile( return result; } +/// One-pass incremental builder for an arena-backed ToT outer tile. +/// +/// `make_nested_tile` / `arena_outer_init` pre-walk every inner range before +/// any storage is allocated. `ArenaToTBuilder` instead sizes and binds inner +/// cells one at a time: the caller discovers each inner range and fills the +/// returned cell in a single step, driving its own loop. Backed by the +/// multi-page `Arena`, so no total size is needed up front. 
+/// +/// Cells should be `emplace`d in outer cell-ordinal order -- recommended, not +/// required: a view is a pointer, so any order is correct, but in-order +/// emplacement keeps the page layout cache-friendly for later iteration. +/// `arena_compact` coalesces a finished tile into one contiguous slab. +/// +/// A builder, its `Arena`, and the tile under construction are single-thread +/// objects (see `Arena`). +template +class ArenaToTBuilder { + public: + using outer_range_type = typename OuterTensor::range_type; + using inner_t = typename OuterTensor::value_type; + using inner_range_t = typename inner_t::range_type; + using elem_t = typename inner_t::value_type; + + explicit ArenaToTBuilder(const outer_range_type& outer_range, + std::size_t batch_sz = 1, bool zero_init = false, + std::size_t page_size = kArenaDefaultPageBytes) + : outer_range_(outer_range), + batch_sz_(batch_sz), + n_cells_(outer_range.volume() * batch_sz), + arena_(std::make_shared(std::pmr::new_delete_resource(), + page_size, zero_init)) { + data_ = make_outer_data(n_cells_, arena_, + std::shared_ptr{}); + // Cells start null (the deleter destroys all n_cells_); `emplace` binds. + for (std::size_t ord = 0; ord < n_cells_; ++ord) + ::new (data_.get() + ord) inner_t(); + } + + /// Size and bind the inner cell at outer cell ordinal `ord` to + /// `inner_range`, returning a reference to the bound cell for the caller to + /// fill. A zero-volume range leaves the cell null. Outer element indices + /// translate via `outer_range().ordinal(idx)`. 
+ inner_t& emplace(std::size_t ord, inner_range_t inner_range) { + TA_ASSERT(ord < n_cells_); + inner_t& cell = data_[ord]; + const std::size_t vol = inner_range.volume(); + if (vol == 0) return cell; // stays null + constexpr bool arena = is_arena_tensor_v; + std::size_t stride; + std::size_t bytes; + if constexpr (arena) { + stride = inner_t::cell_alignment(); + bytes = inner_t::cell_size(vol); + } else { + stride = alignof(elem_t); + bytes = vol * sizeof(elem_t); + } + // Single-cell tile: lay down one exactly-sized page (corner case b). + if (n_cells_ == 1 && arena_->empty()) arena_->reserve_page(bytes, stride); + auto h = arena_->claim_bytes(bytes, stride); + if constexpr (arena) { + cell = make_arena_tensor_in(h.get(), std::move(inner_range)); + } else { + cell = inner_t( + std::move(inner_range), + std::shared_ptr(h, reinterpret_cast(h.get()))); + } + return cell; + } + + /// Finalize and hand back the assembled outer tile; the builder is spent. + OuterTensor finish() && { + return OuterTensor(outer_range_, batch_sz_, std::move(data_)); + } + + std::size_t cell_count() const noexcept { return n_cells_; } + const outer_range_type& outer_range() const noexcept { return outer_range_; } + const Arena& arena() const noexcept { return *arena_; } + + private: + outer_range_type outer_range_; + std::size_t batch_sz_; + std::size_t n_cells_; + std::shared_ptr arena_; + std::shared_ptr data_; +}; + /// Apply a unary fill op while preserving each source inner range. /// `fill_op(dst_data, src_data, n_elements)` writes the result cell. template @@ -196,6 +287,19 @@ OuterTensor arena_trivial_unary(const SrcOuterTensor& src, FillOp&& fill_op) { return result; } +/// Coalesce a (possibly multi-page, incrementally built) arena-backed ToT +/// outer tile into a fresh single-page tile: one exact allocation, no page +/// tail waste, inner cells laid out contiguously in outer order. Returns a +/// new tile; `src` is unchanged. 
A tile already built up-front via +/// `arena_outer_init` is single-page already, so compacting it just +/// deep-copies. +template +OuterTensor arena_compact(const OuterTensor& src) { + return arena_trivial_unary( + src, + [](auto* dst, const auto* s, std::size_t n) { std::copy_n(s, n, dst); }); +} + /// Apply a binary fill op using the left operand's inner ranges (asserted /// equal to the right's per cell). `fill_op(dst, l, r, n_elements)`. template +#include #include #include #include using TiledArray::detail::Arena; -using TiledArray::detail::ArenaPlan; using TiledArray::detail::ArenaResource; -using TiledArray::detail::plan; +using TiledArray::detail::kArenaDefaultPageBytes; namespace { -// Minimal Range-like shim for plan() tests: supports only volume(). -struct FakeRange { - std::size_t v; - std::size_t volume() const noexcept { return v; } -}; +bool is_aligned(const void* p, std::size_t a) { + return reinterpret_cast(p) % a == 0; } +} // namespace BOOST_AUTO_TEST_SUITE(arena_suite, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(default_arena_is_empty) { Arena a; - BOOST_CHECK_EQUAL(a.capacity(), 0u); - BOOST_CHECK_EQUAL(a.cursor(), 0u); BOOST_CHECK(a.empty()); + BOOST_CHECK_EQUAL(a.page_count(), 0u); + BOOST_CHECK_EQUAL(a.bytes_allocated(), 0u); + BOOST_CHECK_EQUAL(a.bytes_reserved(), 0u); + BOOST_CHECK_EQUAL(a.page_size(), kArenaDefaultPageBytes); BOOST_CHECK(a.resource() != nullptr); } -BOOST_AUTO_TEST_CASE(reserve_initializes_capacity) { +BOOST_AUTO_TEST_CASE(reserve_page_lays_down_one_exact_page) { Arena a; - a.reserve(1024); - BOOST_CHECK_EQUAL(a.capacity(), 1024u); - BOOST_CHECK_EQUAL(a.cursor(), 0u); - BOOST_CHECK_EQUAL(a.remaining(), 1024u); -} - -BOOST_AUTO_TEST_CASE(reserve_zero_init_clears_slab) { - Arena a; - a.reserve(64, /*zero_init=*/true); - auto h = a.slice(0, 64); - for (std::size_t i = 0; i < 64; ++i) BOOST_CHECK_EQUAL(h[i], 0u); + a.reserve_page(1024, 64); + BOOST_CHECK_EQUAL(a.page_count(), 1u); + BOOST_CHECK_EQUAL(a.bytes_reserved(), 
1024u); + // nothing claimed yet + BOOST_CHECK(a.empty()); + BOOST_CHECK_EQUAL(a.bytes_allocated(), 0u); } -BOOST_AUTO_TEST_CASE(slice_random_access_and_aliasing) { +BOOST_AUTO_TEST_CASE(claims_pack_into_the_reserved_page) { Arena a; - a.reserve(1024); - std::shared_ptr p1 = a.slice(0, 4); - std::shared_ptr p2 = a.slice(64, 4); - for (int i = 0; i < 4; ++i) p1[i] = double(i); - for (int i = 0; i < 4; ++i) p2[i] = double(10 + i); - for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p1[i], double(i)); - for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p2[i], double(10 + i)); - BOOST_CHECK(static_cast(&p2[0]) >= static_cast(&p1[4])); + a.reserve_page(1024, 128); + auto h1 = a.claim_bytes(100, 64); + auto h2 = a.claim_bytes(100, 64); + auto h3 = a.claim_bytes(100, 64); + // all three land in the single reserved page + BOOST_CHECK_EQUAL(a.page_count(), 1u); + BOOST_CHECK_EQUAL(a.bytes_allocated(), 300u); + BOOST_CHECK(is_aligned(h1.get(), 64)); + BOOST_CHECK(is_aligned(h2.get(), 64)); + BOOST_CHECK(is_aligned(h3.get(), 64)); + // distinct, non-overlapping + BOOST_CHECK(h2.get() >= h1.get() + 100); + BOOST_CHECK(h3.get() >= h2.get() + 100); } -BOOST_AUTO_TEST_CASE(claim_advances_cursor_and_aligns) { - Arena a; - a.reserve(1024); +BOOST_AUTO_TEST_CASE(claim_auto_allocates_a_standard_page) { + Arena a; // no reserve_page std::shared_ptr h = a.claim(10); BOOST_REQUIRE(h.get() != nullptr); - BOOST_CHECK_EQUAL(reinterpret_cast(h.get()) % alignof(double), - 0u); - BOOST_CHECK(a.cursor() >= 10u * sizeof(double)); + BOOST_CHECK(is_aligned(h.get(), alignof(double))); + BOOST_CHECK_EQUAL(a.page_count(), 1u); + BOOST_CHECK_EQUAL(a.bytes_reserved(), kArenaDefaultPageBytes); + for (int i = 0; i < 10; ++i) h[i] = double(i); + for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(h[i], double(i)); } -BOOST_AUTO_TEST_CASE(slab_survives_arena_destruction) { - std::shared_ptr survivor; - { - Arena tmp; - tmp.reserve(256); - survivor = tmp.claim(10); - for (int i = 0; i < 10; ++i) survivor[i] = -i; - } - 
for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(survivor[i], -i); +BOOST_AUTO_TEST_CASE(claims_roll_over_to_fresh_pages) { + Arena a(std::pmr::new_delete_resource(), /*page_size=*/256); + std::vector> handles; + // 64 B at 64-B alignment => 4 per 256 B page; 10 claims => >= 3 pages + for (int i = 0; i < 10; ++i) handles.push_back(a.claim_bytes(64, 64)); + BOOST_CHECK_GE(a.page_count(), 3u); + BOOST_CHECK_EQUAL(a.bytes_allocated(), 10u * 64u); + // every handle is a distinct, valid, writable region + for (std::size_t i = 0; i < handles.size(); ++i) + std::memset(handles[i].get(), int(i), 64); + for (std::size_t i = 0; i < handles.size(); ++i) + BOOST_CHECK_EQUAL(static_cast(handles[i][0]), + static_cast(i)); } -BOOST_AUTO_TEST_CASE(plan_uniform_cells) { - ArenaPlan p = plan( - /*N_cells=*/6, - /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{10}; }, - /*element_size=*/sizeof(double), - /*alignment=*/alignof(double)); - BOOST_CHECK_EQUAL(p.total_bytes, 6u * 10u * sizeof(double)); - BOOST_CHECK_EQUAL(p.offsets.size(), 6u); - BOOST_CHECK_EQUAL(p.offsets[0], 0u); - BOOST_CHECK_EQUAL(p.offsets[5], 5u * 10u * sizeof(double)); +BOOST_AUTO_TEST_CASE(oversized_claim_gets_a_dedicated_page) { + Arena a(std::pmr::new_delete_resource(), /*page_size=*/256); + // request larger than a page -> a dedicated, exactly-sized page + auto big = a.claim_bytes(1024, 64); + BOOST_REQUIRE(big.get() != nullptr); + BOOST_CHECK(is_aligned(big.get(), 64)); + BOOST_CHECK_EQUAL(a.page_count(), 1u); + BOOST_CHECK_EQUAL(a.bytes_reserved(), 1024u); + // a following normal claim does not reuse the dedicated page; it opens a + // standard page + auto small = a.claim_bytes(64, 64); + BOOST_CHECK_EQUAL(a.page_count(), 2u); + BOOST_CHECK_EQUAL(a.bytes_reserved(), 1024u + 256u); + std::memset(big.get(), 1, 1024); + std::memset(small.get(), 2, 64); } -BOOST_AUTO_TEST_CASE(plan_variable_cells_match_pivot_doc_example) { - ArenaPlan p = plan( - /*N_cells=*/12, - /*shape_fn=*/[](std::size_t /*ord*/) { 
return FakeRange{20}; }, - /*element_size=*/sizeof(double), - /*alignment=*/alignof(double)); - BOOST_CHECK_EQUAL(p.total_bytes, 12u * 20u * sizeof(double)); - BOOST_CHECK_EQUAL(p.offsets[1], 20u * sizeof(double)); +BOOST_AUTO_TEST_CASE(single_exact_page_corner_case) { + // corner case (b): a lone cell -> one exactly-sized page, no waste + Arena a; + a.reserve_page(640, 128); + auto h = a.claim_bytes(640, 128); + BOOST_CHECK_EQUAL(a.page_count(), 1u); + BOOST_CHECK_EQUAL(a.bytes_reserved(), 640u); + BOOST_CHECK_EQUAL(a.bytes_allocated(), 640u); + BOOST_CHECK(is_aligned(h.get(), 128)); } -BOOST_AUTO_TEST_CASE(plan_then_construct_then_read) { - const std::size_t N = 4; - std::vector volumes = {3, 5, 2, 7}; - auto shape_fn = [&volumes](std::size_t ord) { return FakeRange{volumes[ord]}; }; - ArenaPlan p = plan(N, shape_fn, sizeof(double), alignof(double)); - Arena a; - a.reserve(p.total_bytes); - std::vector> handles(N); - for (std::size_t ord = 0; ord < N; ++ord) { - handles[ord] = a.slice(p.offsets[ord], volumes[ord]); - for (std::size_t i = 0; i < volumes[ord]; ++i) - handles[ord][i] = double(100 * ord + i); +BOOST_AUTO_TEST_CASE(zero_init_clears_each_page) { + Arena a(std::pmr::new_delete_resource(), /*page_size=*/256, + /*zero_init=*/true); + auto h = a.claim(200); + for (std::size_t i = 0; i < 200; ++i) BOOST_CHECK_EQUAL(h[i], 0u); +} + +BOOST_AUTO_TEST_CASE(claimed_memory_survives_arena_destruction) { + std::shared_ptr survivor; + { + Arena tmp(std::pmr::new_delete_resource(), /*page_size=*/256); + survivor = tmp.claim(10); + for (int i = 0; i < 10; ++i) survivor[i] = -i; } - for (std::size_t ord = 0; ord < N; ++ord) - for (std::size_t i = 0; i < volumes[ord]; ++i) - BOOST_CHECK_EQUAL(handles[ord][i], double(100 * ord + i)); + // the aliasing handle keeps its page alive past the Arena + for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(survivor[i], -i); } BOOST_AUTO_TEST_CASE(arena_resource_is_identity_equal) { Arena a; - a.reserve(64); ArenaResource r1(&a); 
ArenaResource r2(&a); BOOST_CHECK(r1.is_equal(r1)); diff --git a/tests/arena_kernels.cpp b/tests/arena_kernels.cpp index 4e278fd495..e6c1fe2a74 100644 --- a/tests/arena_kernels.cpp +++ b/tests/arena_kernels.cpp @@ -157,4 +157,62 @@ BOOST_AUTO_TEST_CASE(inner_permute_rank3_cell) { } } +BOOST_AUTO_TEST_CASE(builder_matches_up_front_baseline) { + // build a ToT one cell at a time, then compare to the up-front baseline + outer_t baseline = make_tot(4, 5, 1.0); + TA::detail::ArenaToTBuilder b(TA::Range{4l}); + for (std::size_t ord = 0; ord < 4; ++ord) { + inner_t& cell = b.emplace(ord, TA::Range{5l}); + for (std::size_t i = 0; i < 5; ++i) + cell.at_ordinal(i) = 1.0 + ord * 100.0 + double(i); + } + outer_t built = std::move(b).finish(); + BOOST_CHECK(tot_equal(built, baseline)); +} + +BOOST_AUTO_TEST_CASE(builder_rolls_over_to_multiple_pages) { + // a small page forces the incremental arena to span several pages + const std::size_t N = 20; + TA::detail::ArenaToTBuilder b(TA::Range{static_cast(N)}, + /*batch=*/1, /*zero_init=*/false, + /*page_size=*/256); + for (std::size_t ord = 0; ord < N; ++ord) { + inner_t& cell = b.emplace(ord, TA::Range{5l}); + for (std::size_t i = 0; i < 5; ++i) + cell.at_ordinal(i) = 1.0 + ord * 100.0 + double(i); + } + BOOST_CHECK_GT(b.arena().page_count(), 1u); + outer_t built = std::move(b).finish(); + BOOST_CHECK(tot_equal(built, make_tot(N, 5, 1.0))); +} + +BOOST_AUTO_TEST_CASE(builder_single_cell_uses_one_exact_page) { + // corner case (b): a lone inner tensor -> one exactly-sized page + TA::detail::ArenaToTBuilder b(TA::Range{1l}); + inner_t& cell = b.emplace(0, TA::Range{7l}); + for (std::size_t i = 0; i < 7; ++i) cell.at_ordinal(i) = double(i); + BOOST_CHECK_EQUAL(b.arena().page_count(), 1u); + BOOST_CHECK_EQUAL(b.arena().bytes_reserved(), 7u * sizeof(double)); + outer_t built = std::move(b).finish(); + BOOST_REQUIRE_EQUAL(built.range().volume(), 1u); + for (std::size_t i = 0; i < 7; ++i) + BOOST_CHECK_EQUAL(built.data()->at_ordinal(i), 
double(i)); +} + +BOOST_AUTO_TEST_CASE(compact_coalesces_a_multipage_tile) { + const std::size_t N = 16; + TA::detail::ArenaToTBuilder b(TA::Range{static_cast(N)}, 1, + false, /*page_size=*/256); + for (std::size_t ord = 0; ord < N; ++ord) { + inner_t& cell = b.emplace(ord, TA::Range{5l}); + for (std::size_t i = 0; i < 5; ++i) + cell.at_ordinal(i) = 1.0 + ord * 100.0 + double(i); + } + BOOST_CHECK_GT(b.arena().page_count(), 1u); + outer_t multipage = std::move(b).finish(); + outer_t compacted = TA::detail::arena_compact(multipage); + BOOST_CHECK(tot_equal(compacted, multipage)); + BOOST_CHECK(tot_equal(compacted, make_tot(N, 5, 1.0))); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp index fea3046aa8..5b26678ad4 100644 --- a/tests/arena_tensor_kernels.cpp +++ b/tests/arena_tensor_kernels.cpp @@ -158,6 +158,64 @@ bool outers_equal(const Outer& a, const Outer& b) { } // namespace +BOOST_AUTO_TEST_CASE(builder_matches_up_front_baseline) { + // incremental one-pass construction of an ArenaTensor-celled outer tile + Outer baseline = make_outer(4, 8, 1.0); + TA::detail::ArenaToTBuilder b(TA::Range{4}); + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& cell = b.emplace(ord, TA::Range{8}); + for (std::size_t i = 0; i < 8; ++i) + cell.data()[i] = 1.0 + ord * 100.0 + double(i); + } + Outer built = std::move(b).finish(); + BOOST_CHECK(outers_equal(built, baseline)); +} + +BOOST_AUTO_TEST_CASE(builder_rolls_over_to_multiple_pages) { + const std::size_t N = 10; + TA::detail::ArenaToTBuilder b(TA::Range{static_cast(N)}, 1, + false, + /*page_size=*/Inner::cell_size(8) * 4); + for (std::size_t ord = 0; ord < N; ++ord) { + Inner& cell = b.emplace(ord, TA::Range{8}); + for (std::size_t i = 0; i < 8; ++i) + cell.data()[i] = 1.0 + ord * 100.0 + double(i); + } + BOOST_CHECK_GT(b.arena().page_count(), 1u); + Outer built = std::move(b).finish(); + BOOST_CHECK(outers_equal(built, make_outer(N, 8, 1.0))); +} + 
+BOOST_AUTO_TEST_CASE(builder_single_cell_uses_one_exact_page) { + // corner case (b): a lone ArenaTensor cell -> one exactly-sized page + TA::detail::ArenaToTBuilder b(TA::Range{1}); + Inner& cell = b.emplace(0, TA::Range{7}); + for (std::size_t i = 0; i < 7; ++i) cell.data()[i] = double(i); + BOOST_CHECK_EQUAL(b.arena().page_count(), 1u); + BOOST_CHECK_EQUAL(b.arena().bytes_reserved(), Inner::cell_size(7)); + Outer built = std::move(b).finish(); + BOOST_REQUIRE_EQUAL(built.range().volume(), 1u); + BOOST_REQUIRE(bool(built.data()[0])); + for (std::size_t i = 0; i < 7; ++i) + BOOST_CHECK_EQUAL(built.data()[0].data()[i], double(i)); +} + +BOOST_AUTO_TEST_CASE(compact_coalesces_a_multipage_tile) { + const std::size_t N = 9; + TA::detail::ArenaToTBuilder b(TA::Range{static_cast(N)}, 1, + false, Inner::cell_size(6) * 4); + for (std::size_t ord = 0; ord < N; ++ord) { + Inner& cell = b.emplace(ord, TA::Range{6}); + for (std::size_t i = 0; i < 6; ++i) + cell.data()[i] = 1.0 + ord * 100.0 + double(i); + } + BOOST_CHECK_GT(b.arena().page_count(), 1u); + Outer multipage = std::move(b).finish(); + Outer compacted = TA::detail::arena_compact(multipage); + BOOST_CHECK(outers_equal(compacted, multipage)); + BOOST_CHECK(outers_equal(compacted, make_outer(N, 6, 1.0))); +} + BOOST_AUTO_TEST_CASE(arena_tensor_is_a_tensor_but_a_view) { // ArenaTensor is registered as is_tensor_helper / is_contiguous_tensor so // kernel paths treat it like Tensor; the `is_tensor_view` trait From 3372118f9b0c58e13ac6871addda40f43a41f543 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 May 2026 04:59:23 +0900 Subject: [PATCH 2/4] test: DistArray-level incremental construction of arena ToT tiles Add a test that builds a TA::DistArray<Tensor<ArenaTensor>> by calling ArenaToTBuilder inside the init_tiles callback -- each outer tile's inner cells are sized (jagged) and filled one at a time, with no up-front range_fn. Confirms the incremental builder composes with init_tiles and needs no new DistArray API.
--- tests/arena_tensor_kernels.cpp | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp index 5b26678ad4..2e501c66db 100644 --- a/tests/arena_tensor_kernels.cpp +++ b/tests/arena_tensor_kernels.cpp @@ -470,6 +470,43 @@ BOOST_AUTO_TEST_CASE(distarray_arena_tensor_construct_and_init_tiles) { } } +// DistArray-level incremental construction: each outer tile is built with +// ArenaToTBuilder *inside* the init_tiles callback -- inner cells are sized +// and filled one at a time, with no up-front range_fn. This needs no new +// DistArray API: init_tiles already supplies a per-tile callback. Serial-only. +BOOST_AUTO_TEST_CASE(distarray_arena_tensor_incremental_init_tiles) { + using Array = TA::DistArray; + auto& world = TA::get_default_world(); + TA::TiledRange tr{TA::TiledRange1{0, 2, 4}}; + Array A(world, tr); + A.init_tiles([](const TA::Range& tile_range) { + TA::detail::ArenaToTBuilder b(tile_range); + const std::size_t n = tile_range.volume(); + for (std::size_t ord = 0; ord < n; ++ord) { + // inner extent discovered per cell (jagged) -- no pre-walk + const std::size_t inner = 2 + ord; + Inner& cell = b.emplace(ord, TA::Range{static_cast(inner)}); + for (std::size_t i = 0; i < inner; ++i) + cell.data()[i] = double(ord * 10 + i); + } + return std::move(b).finish(); + }); + world.gop.fence(); + BOOST_CHECK_EQUAL(A.trange().tiles_range().volume(), 2u); + for (std::size_t t = 0; t < 2; ++t) { + if (!A.is_local(t)) continue; + Outer tile = A.find(t).get(); + const std::size_t n = tile.range().volume(); + for (std::size_t ord = 0; ord < n; ++ord) { + const Inner& cell = tile.data()[ord]; + BOOST_REQUIRE(bool(cell)); + BOOST_CHECK_EQUAL(cell.size(), 2u + ord); + for (std::size_t i = 0; i < cell.size(); ++i) + BOOST_CHECK_EQUAL(cell.data()[i], double(ord * 10 + i)); + } + } +} + // Mixed scalar/ArenaTensor outer Hadamard: each scalar-side outer cell // multiplies the 
corresponding ArenaTensor-side inner element-wise. // Exercises Tensor::mult(Tensor) and the symmetric From 176df8a2bdf291c75f6901c19272894044da2cc8 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 May 2026 05:24:42 +0900 Subject: [PATCH 3/4] arena ToT: single-pass DistArray construction (drop buffering) The arena-ToT construction paths pre-walked their cells twice: the two-pass make_nested_tile invoked its source once to size each cell and again to fill it, so callers with a single-pass source materialized the whole outer tile into a temporary vector first. ArenaToTBuilder makes a single ascending pass possible everywhere. - make_nested_tile (arena_kernels.h): rebuilt on ArenaToTBuilder -- inner_range_fn and inner_fill_fn are now interleaved per cell instead of two full passes; no separate all-ranges walk. Cells stay zero-initialized so the no-op-fill (shape-only) path is unchanged. - DistArray::make_arena_nested_tile: rebuilt on ArenaToTBuilder; cell_source is invoked exactly once per cell in ascending order. - DistArray::init_elements (arena branch): drops the std::vector that collected every inner tensor of the outer tile before building. - DistArray::set(i, InIter) (arena branch): drops the std::vector that buffered the single-pass iterator; it now feeds straight through. - ArrayImpl retile (arena-ToT branch): builds each target tile with ArenaToTBuilder, one source-cell lookup per cell instead of two. Eliminates a peak-memory doubling during construction (the temporary held the whole tile's data alongside the arena slab). foreach / make_array were also reviewed: both are tile-type-agnostic (the result tile is default-constructed and the user op populates it) -- no two-pass machinery there, nothing to relax. 
--- src/TiledArray/array_impl.h | 30 ++++++------ src/TiledArray/dist_array.h | 67 +++++++++++++-------------- src/TiledArray/tensor/arena_kernels.h | 49 +++++++++----------- 3 files changed, 69 insertions(+), 77 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index 3b745fdd28..f6dff7f066 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -1001,7 +1001,6 @@ std::shared_ptr> make_with_new_trange( // target cell at global outer element `e` takes its inner range and data // from the source cell at `e` (elements outside the source range, e.g. a // retile that grows the element range, yield null cells). - using inner_range_type = typename Tile::value_type::range_type; const auto& source_elements = source_array.trange().elements_range(); std::map src_tile_cache; auto source_cell_at = @@ -1024,21 +1023,20 @@ std::shared_ptr> make_with_new_trange( }; for (const auto target_ord : *target_array.pmap()) { if (target_array.is_zero(target_ord)) continue; - Tile tile = make_nested_tile( - target_trange.make_tile_range(target_ord), - [&](const auto& e) -> inner_range_type { - const auto* sc = source_cell_at(e); - return (sc && !sc->empty()) ? sc->range() : inner_range_type{}; - }, - [&](auto& cell, const auto& e) { - const auto* sc = source_cell_at(e); - if (sc && !sc->empty()) { - const auto* s = sc->data(); - auto* d = cell.data(); - for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p]; - } - }); - target_array.set(target_ord, std::move(tile)); + // build each target tile in one pass: a single source lookup per cell + // sizes it and fills it together (no separate all-ranges walk). 
+ const auto outer_range = target_trange.make_tile_range(target_ord); + ArenaToTBuilder builder(outer_range); + const std::size_t n = outer_range.volume(); + for (std::size_t o = 0; o < n; ++o) { + const auto* sc = source_cell_at(outer_range.idx(o)); + if (!sc || sc->empty()) continue; // leaves a deliberately-null cell + auto& cell = builder.emplace(o, sc->range()); + const auto* s = sc->data(); + auto* d = cell.data(); + for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p]; + } + target_array.set(target_ord, std::move(builder).finish()); } target_array.world().gop.fence(); } else { diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 6b994e17c0..c2994963df 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -811,18 +811,17 @@ class DistArray : public madness::archive::ParallelSerializableObject { check_index(i); if constexpr (detail::is_tensor_of_tensor_v && is_arena_tensor_v) { - // arena ToT: the iterated inner tiles carry the ranges needed to size - // the slab; buffer them (the iterator is single-pass) and build. + // arena ToT: each iterated inner tile carries the range that sizes its + // cell. make_arena_nested_tile pulls the source once per cell in + // ascending order, so the single-pass iterator feeds straight through. 
const auto outer_range = pimpl_->trange().make_tile_range(i); using SrcTile = std::decay_t; - std::vector buf; - buf.reserve(outer_range.volume()); - for (std::size_t k = 0; k < outer_range.volume(); ++k, ++first) - buf.emplace_back(*first); - pimpl_->set(i, make_arena_nested_tile( - outer_range, [&buf](std::size_t k) -> const SrcTile& { - return buf[k]; - })); + pimpl_->set(i, make_arena_nested_tile(outer_range, + [&first](std::size_t) -> SrcTile { + SrcTile t = *first; + ++first; + return t; + })); } else { pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first)); } @@ -1165,15 +1164,11 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::is_assignable_v, "DistArray::init_elements: op must return a freestanding " "tensor assignable to the inner tile type"); - // pass 1: collect op's freestanding inner tensors; pass 2: - // make_arena_nested_tile sizes the slab and deep-copies them in - std::vector collected; - collected.reserve(outer_range.volume()); - for (std::size_t o = 0; o < outer_range.volume(); ++o) - collected.emplace_back(op(outer_range.idx(o))); + // single pass: make_arena_nested_tile pulls each cell once, in + // ascending order, so op runs once per cell with no buffer return make_arena_nested_tile( - outer_range, [&collected](std::size_t k) -> const R& { - return collected[k]; + outer_range, [&op, &outer_range](std::size_t k) -> R { + return op(outer_range.idx(k)); }); }, skip_set); @@ -1927,29 +1922,31 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// Engine behind the arena-ToT paths of \c init_elements and \c set: /// \p cell_source(ordinal) returns a freestanding tensor whose range sizes - /// inner cell \p ordinal and whose data fills it. The slab is allocated by - /// \c detail::make_nested_tile and each cell deep-copies its source. + /// inner cell \p ordinal and whose data fills it. 
Built in one pass with + /// \c detail::ArenaToTBuilder; \p cell_source is invoked exactly once per + /// cell, in ascending ordinal order, so a single-pass source (a generator + /// op or an input iterator) can be fed straight through without buffering. /// \param[in] outer_range the outer tile's range /// \param[in] cell_source maps a cell ordinal to its source tensor template static value_type make_arena_nested_tile(const TiledArray::Range& outer_range, CellSource&& cell_source) { using InnerRange = typename element_type::range_type; - return detail::make_nested_tile( - outer_range, - [&](const auto& idx) -> InnerRange { - // the inner-cell range type is built from an extent list -- it is - // not constructible from a foreign range type - const auto& src = cell_source(outer_range.ordinal(idx)).range(); - const auto& src_ext = src.extent(); - std::vector ext(src.rank()); - for (std::size_t d = 0; d < src.rank(); ++d) - ext[d] = static_cast(src_ext[d]); - return InnerRange(ext); - }, - [&](auto& cell, const auto& idx) { - cell = cell_source(outer_range.ordinal(idx)); - }); + detail::ArenaToTBuilder builder(outer_range); + const std::size_t n = outer_range.volume(); + for (std::size_t k = 0; k < n; ++k) { + const auto& src = cell_source(k); + // the inner-cell range type is built from an extent list -- it is not + // constructible from a foreign range type + const auto& src_range = src.range(); + const auto& src_ext = src_range.extent(); + std::vector ext(src_range.rank()); + for (std::size_t d = 0; d < src_range.rank(); ++d) + ext[d] = static_cast(src_ext[d]); + auto& cell = builder.emplace(k, InnerRange(ext)); + if (!cell.empty()) cell = src; // deep copy into the bound arena cell + } + return std::move(builder).finish(); } /// Code factorization of the actual assert for the other overloads diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h index b74384e57f..f5277ec749 100644 --- a/src/TiledArray/tensor/arena_kernels.h +++ 
b/src/TiledArray/tensor/arena_kernels.h @@ -148,32 +148,6 @@ struct nested_fill_noop { void operator()(Cell&, const Index&) const noexcept {} }; -/// Build one ToT outer tile over `outer_range`, two-pass: -/// pass 1: `inner_range_fn(outer_element_index)` -> inner `range_type` -/// sizes every inner cell (zero-volume -> deliberately-null cell); -/// pass 2: `inner_fill_fn(inner_cell&, outer_element_index)` fills each -/// non-null cell. The default fill leaves storage zero-initialized. -/// Dispatches internally on the inner-tile type (see `arena_outer_init`). -template -OuterTensor make_nested_tile( - const typename OuterTensor::range_type& outer_range, - InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) { - // arena_outer_init keys ranges on the cell ordinal; user code keys on the - // (global) outer element index -- translate via the outer range. - auto cell_range_fn = [&](std::size_t ord) { - return inner_range_fn(outer_range.idx(ord)); - }; - OuterTensor result = - arena_outer_init(outer_range, 1, cell_range_fn); - const std::size_t N = outer_range.volume(); - for (std::size_t ord = 0; ord < N; ++ord) { - auto& cell = result.data()[ord]; - if (!cell.empty()) inner_fill_fn(cell, outer_range.idx(ord)); - } - return result; -} - /// One-pass incremental builder for an arena-backed ToT outer tile. /// /// `make_nested_tile` / `arena_outer_init` pre-walk every inner range before @@ -261,6 +235,29 @@ class ArenaToTBuilder { std::shared_ptr data_; }; +/// Build one ToT outer tile over `outer_range` in a single pass: each inner +/// cell is sized by `inner_range_fn(outer_element_index)` and immediately +/// filled by `inner_fill_fn(inner_cell&, outer_element_index)` before moving +/// to the next -- no separate all-ranges walk. A zero-volume inner range +/// yields a deliberately-null cell, which `inner_fill_fn` is not invoked on. +/// Cells are zero-initialized, so the default no-op fill still leaves zeroed +/// storage. Backed by `ArenaToTBuilder`. 
+template +OuterTensor make_nested_tile( + const typename OuterTensor::range_type& outer_range, + InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) { + ArenaToTBuilder builder(outer_range, /*batch_sz=*/1, + /*zero_init=*/true); + const std::size_t N = outer_range.volume(); + for (std::size_t ord = 0; ord < N; ++ord) { + const auto idx = outer_range.idx(ord); + auto& cell = builder.emplace(ord, inner_range_fn(idx)); + if (!cell.empty()) inner_fill_fn(cell, idx); + } + return std::move(builder).finish(); +} + /// Apply a unary fill op while preserving each source inner range. /// `fill_op(dst_data, src_data, n_elements)` writes the result cell. template From a02f28e60e49e0a0134f78eb08492ec6a338a96f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 May 2026 05:47:19 +0900 Subject: [PATCH 4/4] arena: preserve rank>0 zero-volume ranges; fix stale view comment ArenaToTBuilder::emplace, given a zero-volume range, used to leave the cell default/null. For an owning (non-view) inner that drops the range metadata -- arena_outer_init keeps a rank>0 zero-volume range as an empty-but-ranked tensor and only collapses a rank-0 range to null. Since make_nested_tile now routes through the builder, mirror that handling so a TA::Tensor inner with e.g. Range{0} stays an empty rank-1 tensor. Arena view inners (which cannot carry a standalone range) still go null. Adds a regression test. Also drops a stale type_traits.h comment that listed TensorInterface as an is_tensor_view specialization -- it is deliberately not a view. 
--- src/TiledArray/tensor/arena_kernels.h | 17 +++++++++++++---- src/TiledArray/tensor/type_traits.h | 8 +++++--- tests/arena_kernels.cpp | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h index f5277ec749..4db5212ef3 100644 --- a/src/TiledArray/tensor/arena_kernels.h +++ b/src/TiledArray/tensor/arena_kernels.h @@ -188,14 +188,23 @@ class ArenaToTBuilder { /// Size and bind the inner cell at outer cell ordinal `ord` to /// `inner_range`, returning a reference to the bound cell for the caller to - /// fill. A zero-volume range leaves the cell null. Outer element indices - /// translate via `outer_range().ordinal(idx)`. + /// fill. A zero-volume range yields an empty cell: an owning inner keeps a + /// rank>0 range (a rank-0 range stays null), a view inner stays null. + /// Outer element indices translate via `outer_range().ordinal(idx)`. inner_t& emplace(std::size_t ord, inner_range_t inner_range) { TA_ASSERT(ord < n_cells_); inner_t& cell = data_[ord]; - const std::size_t vol = inner_range.volume(); - if (vol == 0) return cell; // stays null constexpr bool arena = is_arena_tensor_v; + const std::size_t vol = inner_range.volume(); + if (vol == 0) { + // Mirror arena_outer_init: an owning (non-view) inner preserves a + // rank>0 zero-volume range as an empty-but-ranked tensor; a rank-0 + // range -- and any arena view inner -- leaves the cell default/null. 
+ if constexpr (!arena) { + if (inner_range.rank() != 0) cell = inner_t(std::move(inner_range)); + } + return cell; + } std::size_t stride; std::size_t bytes; if constexpr (arena) { diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 1a457e61b1..ac622675e7 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -120,9 +120,11 @@ inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value; } // namespace detail /// Forward decl for the tensor-view predicate. Specializations live in -/// `tensor/arena_tensor.h` (`ArenaTensor`, `detail::TensorInterface`) and -/// `external/btas.h` (`btas::TensorView`). Declared here so the operator-body -/// predicates below can consult it without including arena_tensor.h. +/// `tensor/arena_tensor.h` (`ArenaTensor`) and `external/btas.h` +/// (`btas::TensorView`). Declared here so the operator-body predicates below +/// can consult it without including arena_tensor.h. Note `TensorInterface` / +/// `TensorMap` is deliberately *not* a view here -- it has value-returning +/// member arithmetic (see arena_tensor.h). 
template struct is_tensor_view : std::false_type {}; template diff --git a/tests/arena_kernels.cpp b/tests/arena_kernels.cpp index e6c1fe2a74..22fe5bc155 100644 --- a/tests/arena_kernels.cpp +++ b/tests/arena_kernels.cpp @@ -199,6 +199,21 @@ BOOST_AUTO_TEST_CASE(builder_single_cell_uses_one_exact_page) { BOOST_CHECK_EQUAL(built.data()->at_ordinal(i), double(i)); } +BOOST_AUTO_TEST_CASE(builder_zero_volume_nonscalar_range_keeps_rank) { + // an owning inner given a zero-volume but rank>0 range keeps that range + // (mirrors arena_outer_init): the rank-1 range is preserved rather than + // collapsed to a rank-0 null cell + TA::detail::ArenaToTBuilder b(TA::Range{2l}); + inner_t& c0 = b.emplace(0, TA::Range{0l}); // rank 1, extent 0, volume 0 + inner_t& c1 = b.emplace(1, TA::Range{3l}); + BOOST_CHECK_EQUAL(c0.range().rank(), 1u); + BOOST_CHECK_EQUAL(c0.range().volume(), 0u); + BOOST_CHECK(!c1.empty()); + outer_t built = std::move(b).finish(); + BOOST_CHECK_EQUAL(built.data()[0].range().rank(), 1u); + BOOST_CHECK_EQUAL(built.data()[1].range().volume(), 3u); +} + BOOST_AUTO_TEST_CASE(compact_coalesces_a_multipage_tile) { const std::size_t N = 16; TA::detail::ArenaToTBuilder b(TA::Range{static_cast(N)}, 1,