From cbeb341fe061345e5a4bcd7e9734ca795f7591b7 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 19 May 2026 21:23:12 +0900
Subject: [PATCH 1/4] arena: multi-page incremental allocator + one-pass ToT
 builder

Replace the one-shot Arena (a single slab sized by a pre-walk of every
inner range) with a multi-page bump allocator, so an arena-backed ToT
outer tile no longer has to know all inner-tensor sizes before
construction.

Arena (arena.h):
- A growing list of pages; each page buffer is a stable heap block, so
  the raw Cell* of every ArenaTensor view stays valid as pages are
  added. claim_bytes() bumps the current page and appends a fresh one
  on a miss; a request larger than a page (or needing stricter
  alignment than a page base provides) gets its own dedicated,
  exactly-sized page.
- reserve_page() lays down a single exact page -- the up-front path
  uses it so kernel/einsum-built tiles keep their one contiguous slab,
  byte-identical to before.
- Page size is a knob (TILEDARRAY_ARENA_PAGE_BYTES). Construction is
  single-threaded by contract (one task per outer tile); the bump path
  is intentionally unsynchronized.
- Drop the unused plan()/ArenaPlan helpers.

arena_kernels.h:
- arena_outer_init reimplemented against the new Arena (reserve_page +
  sequential claim_bytes); signature unchanged, so einsum/tensor.h
  callers are untouched.
- ArenaToTBuilder: one-pass incremental construction -- the caller
  discovers each inner range and fills the returned cell in a single
  step, driving its own loop. A cell larger than a page gets an
  exactly-sized dedicated page; a single-cell tile gets one
  exactly-sized page via reserve_page().
- arena_compact: coalesce a multi-page incrementally-built tile into
  one contiguous slab.

Tests: rewrite tests/arena.cpp for the new API (page rollover,
oversized/dedicated pages, single exact page, aliasing survival); add
ArenaToTBuilder + arena_compact coverage for both TA::Tensor and
ArenaTensor inner cells.
---
 src/TiledArray/tensor/arena.h         | 223 +++++++++++++++++---------
 src/TiledArray/tensor/arena_kernels.h | 146 ++++++++++++++---
 tests/arena.cpp                       | 168 ++++++++++---------
 tests/arena_kernels.cpp               |  58 +++++++
 tests/arena_tensor_kernels.cpp        |  58 +++++++
 5 files changed, 473 insertions(+), 180 deletions(-)
diff --git a/src/TiledArray/tensor/arena.h b/src/TiledArray/tensor/arena.h
index b37b962436..f045d87983 100644
--- a/src/TiledArray/tensor/arena.h
+++ b/src/TiledArray/tensor/arena.h
@@ -5,6 +5,7 @@
 #include "TiledArray/config.h"
 #include "TiledArray/error.h"
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
@@ -22,13 +23,55 @@ inline bool& arena_disabled() {
   return flag;
 }
 
-/// One-shot bump allocator; slab is co-owned via aliasing shared_ptrs.
+/// Default size, in bytes, of a standard arena page. Inner ToT tensors are
+/// small (tens to hundreds of elements), so a page packs many of them; the
+/// default amortizes the per-page allocation and inter-cell alignment
+/// padding. Override at configure time by defining TILEDARRAY_ARENA_PAGE_BYTES.
+#ifndef TILEDARRAY_ARENA_PAGE_BYTES
+#define TILEDARRAY_ARENA_PAGE_BYTES (64 * 1024)
+#endif
+inline constexpr std::size_t kArenaDefaultPageBytes =
+    TILEDARRAY_ARENA_PAGE_BYTES;
+
+/// Cache-line-floor alignment; also the alignment every standard page base is
+/// allocated to, so any per-cell alignment up to this value is satisfied
+/// without a dedicated page.
+inline constexpr std::size_t kArenaCachelineAlign = 128;
+
+/// Round bytes up to a power-of-two alignment.
+inline std::size_t arena_align_up(std::size_t bytes,
+                                  std::size_t alignment) noexcept {
+  return (bytes + alignment - 1) & ~(alignment - 1);
+}
+
+/// Incremental, multi-page bump allocator backing `Tensor<ArenaTensor>` outer
+/// tiles. Memory is handed out from a growing list of pages; each page is a
+/// stable heap block (the page list may grow, but a page's buffer never
+/// moves), which is what keeps the raw `Cell*` of every `ArenaTensor` view
+/// valid for the arena's lifetime. Pages are co-owned with the cells via
+/// aliasing `shared_ptr`s.
+///
+/// Two ways to drive it:
+///  - up-front: the total size is known, so `reserve_page(total, ...)` lays
+///    down a single exact page and subsequent `claim`s pack into it (one
+///    contiguous slab, zero tail waste -- what the kernels and einsum use);
+///  - incremental: `claim` cells one at a time as their sizes are discovered;
+///    a fresh page is appended whenever the current one cannot satisfy a
+///    request. A cell larger than a page gets its own dedicated, exactly
+///    sized page (which never becomes the bump target).
+///
+/// Thread-safety: a single `Arena` is built by a single thread. ToT outer
+/// tiles are produced one task per tile (`init_tiles` / kernels), each with
+/// its own `Arena`, so the bump path is deliberately not synchronized.
 class Arena {
  public:
   explicit Arena(
-      std::pmr::memory_resource* mr = std::pmr::new_delete_resource()) noexcept
-      : resource_(mr) {
+      std::pmr::memory_resource* mr = std::pmr::new_delete_resource(),
+      std::size_t page_size = kArenaDefaultPageBytes,
+      bool zero_init = false) noexcept
+      : resource_(mr), page_size_(page_size), zero_init_(zero_init) {
     TA_ASSERT(resource_ != nullptr);
+    TA_ASSERT(page_size_ > 0);
   }
 
   Arena(const Arena&) = delete;
@@ -37,97 +80,118 @@ class Arena {
   Arena& operator=(Arena&&) noexcept = default;
   ~Arena() = default;
 
-  /// Allocate the slab once; zero_init clears it for accumulation kernels.
-  /// `alignment` (default `alignof(std::max_align_t)`) is the alignment of
-  /// the slab base; pass a larger power-of-two when callers need SIMD-aligned
-  /// element pointers at known interior offsets.
-  void reserve(std::size_t bytes, bool zero_init = false,
-               std::size_t alignment = alignof(std::max_align_t)) {
-    TA_ASSERT(capacity_ == 0);
+  /// Append a page sized exactly `bytes` (the page base is aligned to
+  /// `alignment`, bumped to at least `max_align_t`) and make it the current
+  /// bump page. Used by the up-front path with `bytes` set to the known
+  /// total, and for a single-cell tile so the lone page is exactly sized.
+  void reserve_page(std::size_t bytes, std::size_t alignment) {
     TA_ASSERT(bytes > 0);
-    TA_ASSERT(alignment >= alignof(std::max_align_t));
-    TA_ASSERT((alignment & (alignment - 1)) == 0);
-    void* raw = resource_->allocate(bytes, alignment);
-    auto* mr = resource_;
-    auto deleter = [mr, bytes, alignment](std::byte* p) noexcept {
-      mr->deallocate(p, bytes, alignment);
-    };
-    slab_ = std::shared_ptr<std::byte[]>(static_cast<std::byte*>(raw),
-                                         std::move(deleter));
-    capacity_ = bytes;
-    cursor_ = 0;
-    if (zero_init) std::memset(slab_.get(), 0, bytes);
+    std::size_t a = alignment > alignof(std::max_align_t)
+                        ? alignment
+                        : alignof(std::max_align_t);
+    TA_ASSERT((a & (a - 1)) == 0);
+    add_page(bytes, a);
+    current_ = pages_.size() - 1;
   }
 
-  /// Aliasing view at a caller-aligned offset.
-  template <typename T>
-  std::shared_ptr<T[]> slice(std::size_t offset, std::size_t /*n_elem*/) const {
-    TA_ASSERT(slab_);
-    TA_ASSERT(offset % alignof(T) == 0);
-    TA_ASSERT(offset <= capacity_);
-    auto* p = reinterpret_cast<T*>(slab_.get() + offset);
-    return std::shared_ptr<T[]>(slab_, p);
+  /// Bump-allocate `bytes` aligned to `alignment`. Tries the current page;
+  /// on a miss appends a fresh page -- a dedicated, exactly sized page when
+  /// the request exceeds the page size (or needs more alignment than a
+  /// standard page base provides), otherwise a standard page that becomes
+  /// the new bump target. The returned handle aliases the owning page's
+  /// buffer, so it keeps that page alive on its own.
+  std::shared_ptr<std::byte[]> claim_bytes(std::size_t bytes,
+                                           std::size_t alignment) {
+    TA_ASSERT(bytes > 0);
+    TA_ASSERT(alignment > 0 && (alignment & (alignment - 1)) == 0);
+
+    if (current_ != kNoPage) {
+      Page& p = pages_[current_];
+      const auto base = reinterpret_cast<std::uintptr_t>(p.buffer.get());
+      const auto cur = base + p.cursor;
+      const auto aligned =
+          (cur + alignment - 1) & ~(std::uintptr_t(alignment) - 1);
+      const std::size_t pad = static_cast<std::size_t>(aligned - cur);
+      if (pad + bytes <= p.capacity - p.cursor) {
+        p.cursor += pad + bytes;
+        bytes_allocated_ += bytes;
+        return std::shared_ptr<std::byte[]>(
+            p.buffer, reinterpret_cast<std::byte*>(aligned));
+      }
+    }
+
+    // Need a fresh page. A page base is at least `kArenaCachelineAlign`-
+    // aligned and a fresh cursor is 0, so a standard page needs no padding;
+    // an over-large request, or one needing stricter alignment than a standard
+    // page base, gets a dedicated exactly-sized page.
+    if (bytes > page_size_ || alignment > kArenaCachelineAlign) {
+      std::size_t a =
+          alignment > kArenaCachelineAlign ? alignment : kArenaCachelineAlign;
+      Page& d = add_page(bytes, a);
+      d.cursor = bytes;  // dedicated: full after this one claim
+      bytes_allocated_ += bytes;
+      // A dedicated page is never the bump target; `current_` is unchanged.
+      return std::shared_ptr<std::byte[]>(d.buffer, d.buffer.get());
+    }
+
+    Page& p = add_page(page_size_, kArenaCachelineAlign);
+    current_ = pages_.size() - 1;
+    p.cursor = bytes;
+    bytes_allocated_ += bytes;
+    return std::shared_ptr<std::byte[]>(p.buffer, p.buffer.get());
   }
 
-  /// Bump-allocate n elements of T; result is T-aligned.
+  /// Typed bump-allocate of `n` elements of `T`; result is `T`-aligned.
   template <typename T>
   std::shared_ptr<T[]> claim(std::size_t n) {
-    TA_ASSERT(slab_);
-    auto base = reinterpret_cast<std::uintptr_t>(slab_.get() + cursor_);
-    auto aligned = (base + alignof(T) - 1) & ~(alignof(T) - 1);
-    std::size_t pad = static_cast<std::size_t>(aligned - base);
-    std::size_t consumed = pad + n * sizeof(T);
-    TA_ASSERT(cursor_ + consumed <= capacity_);
-    cursor_ += consumed;
-    return std::shared_ptr<T[]>(slab_, reinterpret_cast<T*>(aligned));
+    auto h = claim_bytes(n * sizeof(T), alignof(T));
+    return std::shared_ptr<T[]>(h, reinterpret_cast<T*>(h.get()));
   }
 
-  std::size_t capacity() const noexcept { return capacity_; }
-  std::size_t cursor() const noexcept { return cursor_; }
-  std::size_t remaining() const noexcept { return capacity_ - cursor_; }
-  bool empty() const noexcept { return cursor_ == 0; }
+  std::size_t page_count() const noexcept { return pages_.size(); }
+  std::size_t bytes_allocated() const noexcept { return bytes_allocated_; }
+  std::size_t bytes_reserved() const noexcept {
+    std::size_t s = 0;
+    for (const auto& p : pages_) s += p.capacity;
+    return s;
+  }
+  bool empty() const noexcept { return bytes_allocated_ == 0; }
+  std::size_t page_size() const noexcept { return page_size_; }
   std::pmr::memory_resource* resource() const noexcept { return resource_; }
 
  private:
-  std::pmr::memory_resource* resource_;
-  std::shared_ptr<std::byte[]> slab_;
-  std::size_t capacity_ = 0;
-  std::size_t cursor_ = 0;
-};
-
-/// Per-cell offsets and total slab size produced by plan().
-struct ArenaPlan {
-  std::vector<std::size_t> offsets;
-  std::size_t total_bytes = 0;
-};
-
-/// Cache-line-floor alignment used by production callers.
-inline constexpr std::size_t kArenaCachelineAlign = 128;
+  struct Page {
+    std::shared_ptr<std::byte[]> buffer;
+    std::size_t capacity = 0;
+    std::size_t cursor = 0;
+  };
 
-/// Round bytes up to a power-of-two alignment.
-inline std::size_t arena_align_up(std::size_t bytes,
-                                  std::size_t alignment) noexcept {
-  return (bytes + alignment - 1) & ~(alignment - 1);
-}
+  static constexpr std::size_t kNoPage = static_cast<std::size_t>(-1);
 
-/// Pre-walk cells once to compute offsets and total bytes.
-template <typename ShapeFn>
-ArenaPlan plan(std::size_t N_cells, ShapeFn&& shape_fn,
-               std::size_t element_size, std::size_t alignment) {
-  ArenaPlan out;
-  out.offsets.resize(N_cells);
-  std::size_t total = 0;
-  for (std::size_t ord = 0; ord < N_cells; ++ord) {
-    out.offsets[ord] = total;
-    auto&& r = shape_fn(ord);
-    std::size_t bytes = r.volume() * element_size;
-    total += arena_align_up(bytes, alignment);
+  Page& add_page(std::size_t capacity, std::size_t alignment) {
+    void* raw = resource_->allocate(capacity, alignment);
+    auto* mr = resource_;
+    auto deleter = [mr, capacity, alignment](std::byte* p) noexcept {
+      mr->deallocate(p, capacity, alignment);
+    };
+    Page pg;
+    pg.buffer = std::shared_ptr<std::byte[]>(static_cast<std::byte*>(raw),
+                                             std::move(deleter));
+    pg.capacity = capacity;
+    if (zero_init_) std::memset(pg.buffer.get(), 0, capacity);
+    pages_.push_back(std::move(pg));
+    return pages_.back();
   }
-  out.total_bytes = total;
-  return out;
-}
 
-/// PMR adapter over an Arena; deallocation is a no-op (slab-owned lifetime).
+  std::pmr::memory_resource* resource_;
+  std::size_t page_size_;
+  bool zero_init_;
+  std::vector<Page> pages_;
+  std::size_t current_ = kNoPage;  // index of the current bump page
+  std::size_t bytes_allocated_ = 0;
+};
+
+/// PMR adapter over an Arena; deallocation is a no-op (page-owned lifetime).
 class ArenaResource final : public std::pmr::memory_resource {
  public:
   explicit ArenaResource(Arena* arena) noexcept : arena_(arena) {
@@ -138,8 +202,7 @@ class ArenaResource final : public std::pmr::memory_resource {
 
  protected:
   void* do_allocate(std::size_t bytes, std::size_t alignment) override {
-    auto h = arena_->claim<std::byte>(arena_align_up(bytes, alignment));
-    return h.get();
+    return arena_->claim_bytes(bytes, alignment).get();
   }
 
   void do_deallocate(void* /*p*/, std::size_t /*bytes*/,
diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h
index 8dcd97c870..b74384e57f 100644
--- a/src/TiledArray/tensor/arena_kernels.h
+++ b/src/TiledArray/tensor/arena_kernels.h
@@ -16,8 +16,10 @@
 #include "TiledArray/tensor/arena.h"
 #include "TiledArray/tensor/arena_tensor.h"
 
+#include <algorithm>
 #include <cstddef>
 #include <memory>
+#include <memory_resource>
 #include <new>
 #include <type_traits>
 #include <utility>
@@ -49,15 +51,20 @@ std::shared_ptr<typename OuterTensor::value_type[]> make_outer_data(
 
 }  // namespace
 
-/// Allocate a slab-backed ToT outer tile with caller-provided inner ranges.
+/// Allocate an arena-backed ToT outer tile with caller-provided inner ranges.
 ///
 /// `inner_range_fn(cell_ordinal)` -> inner `range_type` for each cell ordinal
 /// in `[0, outer_range.volume() * batch_sz)`; a zero-volume range yields a
-/// deliberately-null inner cell that consumes no slab bytes. Element storage
+/// deliberately-null inner cell that consumes no arena bytes. Element storage
 /// is left zero-initialized when `zero_init` is true. `cell_stride_align` is
 /// the minimum byte stride between adjacent cells; it is bumped up to the
 /// inner type's natural alignment (`ArenaTensor::cell_alignment()`, or
 /// `alignof(T)` for `TA::Tensor` inners).
+///
+/// This is the *up-front* path: all inner ranges are known, so the total is
+/// pre-walked and laid down as a single exactly-sized arena page -- one
+/// contiguous slab, no page-tail waste. For one-pass construction where the
+/// inner sizes are discovered incrementally, use `ArenaToTBuilder`.
 template <typename OuterTensor, typename InnerRangeFn>
 OuterTensor arena_outer_init(
     const typename OuterTensor::range_type& outer_range, std::size_t batch_sz,
@@ -75,24 +82,19 @@ OuterTensor arena_outer_init(
   } else {
     if (alignof(T) > stride) stride = alignof(T);
   }
-  // Cells pack at `stride` granularity, but the slab base handed to
-  // `Arena::reserve` must be at least `max_align_t`-aligned.
+  // Cells pack at `stride` granularity; the page base must be at least
+  // `max_align_t`-aligned.
   const std::size_t slab_align =
       stride > alignof(std::max_align_t) ? stride : alignof(std::max_align_t);
 
   const std::size_t N_cells = outer_range.volume() * batch_sz;
-  constexpr std::size_t kNull = static_cast<std::size_t>(-1);
   std::vector<InnerRange> ranges;
   ranges.reserve(N_cells);
-  std::vector<std::size_t> offsets(N_cells, 0);
   std::size_t total = 0;
   for (std::size_t ord = 0; ord < N_cells; ++ord) {
     ranges.emplace_back(inner_range_fn(ord));
     const std::size_t vol = ranges.back().volume();
-    if (vol == 0) {
-      offsets[ord] = kNull;
-    } else {
-      offsets[ord] = total;
+    if (vol != 0) {
       // `if constexpr`, not a ternary: `InnerT::cell_size` does not exist for
       // a `TA::Tensor` inner, so the non-arena branch must not be formed.
       std::size_t bytes;
@@ -104,15 +106,19 @@ OuterTensor arena_outer_init(
     }
   }
 
-  auto arena_slab = std::make_shared<Arena>();
-  if (total > 0) arena_slab->reserve(total, zero_init, slab_align);
-  auto data = make_outer_data<OuterTensor>(N_cells, arena_slab,
+  auto arena_ptr = std::make_shared<Arena>(std::pmr::new_delete_resource(),
+                                           kArenaDefaultPageBytes, zero_init);
+  // One exact page holds every cell -- subsequent `claim_bytes` calls pack
+  // into it in order, reproducing the old single-slab layout.
+  if (total > 0) arena_ptr->reserve_page(total, slab_align);
+  auto data = make_outer_data<OuterTensor>(N_cells, arena_ptr,
                                            std::shared_ptr<InnerT[]>{});
   OuterTensor result(outer_range, batch_sz, std::move(data));
 
   for (std::size_t ord = 0; ord < N_cells; ++ord) {
     auto& r = ranges[ord];
-    if (offsets[ord] == kNull) {
+    const std::size_t vol = r.volume();
+    if (vol == 0) {
       if constexpr (arena) {
         ::new (result.data() + ord) InnerT();
       } else {
@@ -123,15 +129,13 @@ OuterTensor arena_outer_init(
           ::new (result.data() + ord) InnerT(r);
       }
     } else if constexpr (arena) {
-      // slice<std::byte>(offset, 1) returns an aliased shared_ptr; we only
-      // need its raw pointer to placement-new the Cell -- the slab's lifetime
-      // is held by `arena_handle` captured in the outer's deleter.
-      auto byte_view = arena_slab->template slice<std::byte>(offsets[ord], 1);
+      auto h = arena_ptr->claim_bytes(InnerT::cell_size(vol), stride);
       ::new (result.data() + ord)
-          InnerT(make_arena_tensor_in<T>(byte_view.get(), std::move(r)));
+          InnerT(make_arena_tensor_in<T>(h.get(), std::move(r)));
     } else {
-      auto elem_data = arena_slab->template slice<T>(offsets[ord], r.volume());
-      ::new (result.data() + ord) InnerT(r, std::move(elem_data));
+      auto h = arena_ptr->claim_bytes(vol * sizeof(T), stride);
+      ::new (result.data() + ord)
+          InnerT(r, std::shared_ptr<T[]>(h, reinterpret_cast<T*>(h.get())));
     }
   }
   return result;
@@ -170,6 +174,93 @@ OuterTensor make_nested_tile(
   return result;
 }
 
+/// One-pass incremental builder for an arena-backed ToT outer tile.
+///
+/// `make_nested_tile` / `arena_outer_init` pre-walk every inner range before
+/// any storage is allocated. `ArenaToTBuilder` instead sizes and binds inner
+/// cells one at a time: the caller discovers each inner range and fills the
+/// returned cell in a single step, driving its own loop. Backed by the
+/// multi-page `Arena`, so no total size is needed up front.
+///
+/// Cells should be `emplace`d in outer cell-ordinal order -- recommended, not
+/// required: a view is a pointer, so any order is correct, but in-order
+/// emplacement keeps the page layout cache-friendly for later iteration.
+/// `arena_compact` coalesces a finished tile into one contiguous slab.
+///
+/// A builder, its `Arena`, and the tile under construction are single-thread
+/// objects (see `Arena`).
+template <typename OuterTensor>
+class ArenaToTBuilder {
+ public:
+  using outer_range_type = typename OuterTensor::range_type;
+  using inner_t = typename OuterTensor::value_type;
+  using inner_range_t = typename inner_t::range_type;
+  using elem_t = typename inner_t::value_type;
+
+  explicit ArenaToTBuilder(const outer_range_type& outer_range,
+                           std::size_t batch_sz = 1, bool zero_init = false,
+                           std::size_t page_size = kArenaDefaultPageBytes)
+      : outer_range_(outer_range),
+        batch_sz_(batch_sz),
+        n_cells_(outer_range.volume() * batch_sz),
+        arena_(std::make_shared<Arena>(std::pmr::new_delete_resource(),
+                                       page_size, zero_init)) {
+    data_ = make_outer_data<OuterTensor>(n_cells_, arena_,
+                                         std::shared_ptr<inner_t[]>{});
+    // Cells start null (the deleter destroys all n_cells_); `emplace` binds.
+    for (std::size_t ord = 0; ord < n_cells_; ++ord)
+      ::new (data_.get() + ord) inner_t();
+  }
+
+  /// Size and bind the inner cell at outer cell ordinal `ord` to
+  /// `inner_range`, returning a reference to the bound cell for the caller to
+  /// fill. A zero-volume range leaves the cell null. Outer element indices
+  /// translate via `outer_range().ordinal(idx)`.
+  inner_t& emplace(std::size_t ord, inner_range_t inner_range) {
+    TA_ASSERT(ord < n_cells_);
+    inner_t& cell = data_[ord];
+    const std::size_t vol = inner_range.volume();
+    if (vol == 0) return cell;  // stays null
+    constexpr bool arena = is_arena_tensor_v<inner_t>;
+    std::size_t stride;
+    std::size_t bytes;
+    if constexpr (arena) {
+      stride = inner_t::cell_alignment();
+      bytes = inner_t::cell_size(vol);
+    } else {
+      stride = alignof(elem_t);
+      bytes = vol * sizeof(elem_t);
+    }
+    // Single-cell tile: lay down one exactly-sized page (no page-tail waste).
+    if (n_cells_ == 1 && arena_->empty()) arena_->reserve_page(bytes, stride);
+    auto h = arena_->claim_bytes(bytes, stride);
+    if constexpr (arena) {
+      cell = make_arena_tensor_in<elem_t>(h.get(), std::move(inner_range));
+    } else {
+      cell = inner_t(
+          std::move(inner_range),
+          std::shared_ptr<elem_t[]>(h, reinterpret_cast<elem_t*>(h.get())));
+    }
+    return cell;
+  }
+
+  /// Finalize and hand back the assembled outer tile; the builder is spent.
+  OuterTensor finish() && {
+    return OuterTensor(outer_range_, batch_sz_, std::move(data_));
+  }
+
+  std::size_t cell_count() const noexcept { return n_cells_; }
+  const outer_range_type& outer_range() const noexcept { return outer_range_; }
+  const Arena& arena() const noexcept { return *arena_; }
+
+ private:
+  outer_range_type outer_range_;
+  std::size_t batch_sz_;
+  std::size_t n_cells_;
+  std::shared_ptr<Arena> arena_;
+  std::shared_ptr<inner_t[]> data_;
+};
+
 /// Apply a unary fill op while preserving each source inner range.
 /// `fill_op(dst_data, src_data, n_elements)` writes the result cell.
 template <typename OuterTensor, typename SrcOuterTensor, typename FillOp>
@@ -196,6 +287,19 @@ OuterTensor arena_trivial_unary(const SrcOuterTensor& src, FillOp&& fill_op) {
   return result;
 }
 
+/// Coalesce a (possibly multi-page, incrementally built) arena-backed ToT
+/// outer tile into a fresh single-page tile: one exact allocation, no page
+/// tail waste, inner cells laid out contiguously in outer order. Returns a
+/// new tile; `src` is unchanged. A tile already built up-front via
+/// `arena_outer_init` is single-page already, so compacting it just
+/// deep-copies.
+template <typename OuterTensor>
+OuterTensor arena_compact(const OuterTensor& src) {
+  return arena_trivial_unary<OuterTensor>(
+      src,
+      [](auto* dst, const auto* s, std::size_t n) { std::copy_n(s, n, dst); });
+}
+
 /// Apply a binary fill op using the left operand's inner ranges (asserted
 /// equal to the right's per cell). `fill_op(dst, l, r, n_elements)`.
 template <typename OuterTensor, typename LeftTensor, typename RightTensor,
diff --git a/tests/arena.cpp b/tests/arena.cpp
index 46273e8645..3753b4beca 100644
--- a/tests/arena.cpp
+++ b/tests/arena.cpp
@@ -4,124 +4,134 @@
 #include "unit_test_config.h"
 
 #include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <memory_resource>
 #include <vector>
 
 using TiledArray::detail::Arena;
-using TiledArray::detail::ArenaPlan;
 using TiledArray::detail::ArenaResource;
-using TiledArray::detail::plan;
+using TiledArray::detail::kArenaDefaultPageBytes;
 
 namespace {
-// Minimal Range-like shim for plan() tests: supports only volume().
-struct FakeRange {
-  std::size_t v;
-  std::size_t volume() const noexcept { return v; }
-};
+bool is_aligned(const void* p, std::size_t a) {
+  return reinterpret_cast<std::uintptr_t>(p) % a == 0;
 }
+}  // namespace
 
 BOOST_AUTO_TEST_SUITE(arena_suite, TA_UT_LABEL_SERIAL)
 
 BOOST_AUTO_TEST_CASE(default_arena_is_empty) {
   Arena a;
-  BOOST_CHECK_EQUAL(a.capacity(), 0u);
-  BOOST_CHECK_EQUAL(a.cursor(), 0u);
   BOOST_CHECK(a.empty());
+  BOOST_CHECK_EQUAL(a.page_count(), 0u);
+  BOOST_CHECK_EQUAL(a.bytes_allocated(), 0u);
+  BOOST_CHECK_EQUAL(a.bytes_reserved(), 0u);
+  BOOST_CHECK_EQUAL(a.page_size(), kArenaDefaultPageBytes);
   BOOST_CHECK(a.resource() != nullptr);
 }
 
-BOOST_AUTO_TEST_CASE(reserve_initializes_capacity) {
+BOOST_AUTO_TEST_CASE(reserve_page_lays_down_one_exact_page) {
   Arena a;
-  a.reserve(1024);
-  BOOST_CHECK_EQUAL(a.capacity(), 1024u);
-  BOOST_CHECK_EQUAL(a.cursor(), 0u);
-  BOOST_CHECK_EQUAL(a.remaining(), 1024u);
-}
-
-BOOST_AUTO_TEST_CASE(reserve_zero_init_clears_slab) {
-  Arena a;
-  a.reserve(64, /*zero_init=*/true);
-  auto h = a.slice<unsigned char>(0, 64);
-  for (std::size_t i = 0; i < 64; ++i) BOOST_CHECK_EQUAL(h[i], 0u);
+  a.reserve_page(1024, 64);
+  BOOST_CHECK_EQUAL(a.page_count(), 1u);
+  BOOST_CHECK_EQUAL(a.bytes_reserved(), 1024u);
+  // nothing claimed yet
+  BOOST_CHECK(a.empty());
+  BOOST_CHECK_EQUAL(a.bytes_allocated(), 0u);
 }
 
-BOOST_AUTO_TEST_CASE(slice_random_access_and_aliasing) {
+BOOST_AUTO_TEST_CASE(claims_pack_into_the_reserved_page) {
   Arena a;
-  a.reserve(1024);
-  std::shared_ptr<double[]> p1 = a.slice<double>(0, 4);
-  std::shared_ptr<double[]> p2 = a.slice<double>(64, 4);
-  for (int i = 0; i < 4; ++i) p1[i] = double(i);
-  for (int i = 0; i < 4; ++i) p2[i] = double(10 + i);
-  for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p1[i], double(i));
-  for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p2[i], double(10 + i));
-  BOOST_CHECK(static_cast<void*>(&p2[0]) >= static_cast<void*>(&p1[4]));
+  a.reserve_page(1024, 128);
+  auto h1 = a.claim_bytes(100, 64);
+  auto h2 = a.claim_bytes(100, 64);
+  auto h3 = a.claim_bytes(100, 64);
+  // all three land in the single reserved page
+  BOOST_CHECK_EQUAL(a.page_count(), 1u);
+  BOOST_CHECK_EQUAL(a.bytes_allocated(), 300u);
+  BOOST_CHECK(is_aligned(h1.get(), 64));
+  BOOST_CHECK(is_aligned(h2.get(), 64));
+  BOOST_CHECK(is_aligned(h3.get(), 64));
+  // distinct, non-overlapping
+  BOOST_CHECK(h2.get() >= h1.get() + 100);
+  BOOST_CHECK(h3.get() >= h2.get() + 100);
 }
 
-BOOST_AUTO_TEST_CASE(claim_advances_cursor_and_aligns) {
-  Arena a;
-  a.reserve(1024);
+BOOST_AUTO_TEST_CASE(claim_auto_allocates_a_standard_page) {
+  Arena a;  // no reserve_page
   std::shared_ptr<double[]> h = a.claim<double>(10);
   BOOST_REQUIRE(h.get() != nullptr);
-  BOOST_CHECK_EQUAL(reinterpret_cast<std::uintptr_t>(h.get()) % alignof(double),
-                    0u);
-  BOOST_CHECK(a.cursor() >= 10u * sizeof(double));
+  BOOST_CHECK(is_aligned(h.get(), alignof(double)));
+  BOOST_CHECK_EQUAL(a.page_count(), 1u);
+  BOOST_CHECK_EQUAL(a.bytes_reserved(), kArenaDefaultPageBytes);
+  for (int i = 0; i < 10; ++i) h[i] = double(i);
+  for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(h[i], double(i));
 }
 
-BOOST_AUTO_TEST_CASE(slab_survives_arena_destruction) {
-  std::shared_ptr<int[]> survivor;
-  {
-    Arena tmp;
-    tmp.reserve(256);
-    survivor = tmp.claim<int>(10);
-    for (int i = 0; i < 10; ++i) survivor[i] = -i;
-  }
-  for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(survivor[i], -i);
+BOOST_AUTO_TEST_CASE(claims_roll_over_to_fresh_pages) {
+  Arena a(std::pmr::new_delete_resource(), /*page_size=*/256);
+  std::vector<std::shared_ptr<std::byte[]>> handles;
+  // 64 B at 64-B alignment => 4 per 256 B page; 10 claims => >= 3 pages
+  for (int i = 0; i < 10; ++i) handles.push_back(a.claim_bytes(64, 64));
+  BOOST_CHECK_GE(a.page_count(), 3u);
+  BOOST_CHECK_EQUAL(a.bytes_allocated(), 10u * 64u);
+  // every handle is a distinct, valid, writable region
+  for (std::size_t i = 0; i < handles.size(); ++i)
+    std::memset(handles[i].get(), int(i), 64);
+  for (std::size_t i = 0; i < handles.size(); ++i)
+    BOOST_CHECK_EQUAL(static_cast<unsigned char>(handles[i][0]),
+                      static_cast<unsigned char>(i));
 }
 
-BOOST_AUTO_TEST_CASE(plan_uniform_cells) {
-  ArenaPlan p = plan(
-      /*N_cells=*/6,
-      /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{10}; },
-      /*element_size=*/sizeof(double),
-      /*alignment=*/alignof(double));
-  BOOST_CHECK_EQUAL(p.total_bytes, 6u * 10u * sizeof(double));
-  BOOST_CHECK_EQUAL(p.offsets.size(), 6u);
-  BOOST_CHECK_EQUAL(p.offsets[0], 0u);
-  BOOST_CHECK_EQUAL(p.offsets[5], 5u * 10u * sizeof(double));
+BOOST_AUTO_TEST_CASE(oversized_claim_gets_a_dedicated_page) {
+  Arena a(std::pmr::new_delete_resource(), /*page_size=*/256);
+  // request larger than a page -> a dedicated, exactly-sized page
+  auto big = a.claim_bytes(1024, 64);
+  BOOST_REQUIRE(big.get() != nullptr);
+  BOOST_CHECK(is_aligned(big.get(), 64));
+  BOOST_CHECK_EQUAL(a.page_count(), 1u);
+  BOOST_CHECK_EQUAL(a.bytes_reserved(), 1024u);
+  // a following normal claim does not reuse the dedicated page; it opens a
+  // standard page
+  auto small = a.claim_bytes(64, 64);
+  BOOST_CHECK_EQUAL(a.page_count(), 2u);
+  BOOST_CHECK_EQUAL(a.bytes_reserved(), 1024u + 256u);
+  std::memset(big.get(), 1, 1024);
+  std::memset(small.get(), 2, 64);
 }
 
-BOOST_AUTO_TEST_CASE(plan_variable_cells_match_pivot_doc_example) {
-  ArenaPlan p = plan(
-      /*N_cells=*/12,
-      /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{20}; },
-      /*element_size=*/sizeof(double),
-      /*alignment=*/alignof(double));
-  BOOST_CHECK_EQUAL(p.total_bytes, 12u * 20u * sizeof(double));
-  BOOST_CHECK_EQUAL(p.offsets[1], 20u * sizeof(double));
+BOOST_AUTO_TEST_CASE(single_exact_page_corner_case) {
+  // corner case (b): a lone cell -> one exactly-sized page, no waste
+  Arena a;
+  a.reserve_page(640, 128);
+  auto h = a.claim_bytes(640, 128);
+  BOOST_CHECK_EQUAL(a.page_count(), 1u);
+  BOOST_CHECK_EQUAL(a.bytes_reserved(), 640u);
+  BOOST_CHECK_EQUAL(a.bytes_allocated(), 640u);
+  BOOST_CHECK(is_aligned(h.get(), 128));
 }
 
-BOOST_AUTO_TEST_CASE(plan_then_construct_then_read) {
-  const std::size_t N = 4;
-  std::vector<std::size_t> volumes = {3, 5, 2, 7};
-  auto shape_fn = [&volumes](std::size_t ord) { return FakeRange{volumes[ord]}; };
-  ArenaPlan p = plan(N, shape_fn, sizeof(double), alignof(double));
-  Arena a;
-  a.reserve(p.total_bytes);
-  std::vector<std::shared_ptr<double[]>> handles(N);
-  for (std::size_t ord = 0; ord < N; ++ord) {
-    handles[ord] = a.slice<double>(p.offsets[ord], volumes[ord]);
-    for (std::size_t i = 0; i < volumes[ord]; ++i)
-      handles[ord][i] = double(100 * ord + i);
+BOOST_AUTO_TEST_CASE(zero_init_clears_each_page) {
+  Arena a(std::pmr::new_delete_resource(), /*page_size=*/256,
+          /*zero_init=*/true);
+  auto h = a.claim<unsigned char>(200);
+  for (std::size_t i = 0; i < 200; ++i) BOOST_CHECK_EQUAL(h[i], 0u);
+}
+
+BOOST_AUTO_TEST_CASE(claimed_memory_survives_arena_destruction) {
+  std::shared_ptr<int[]> survivor;
+  {
+    Arena tmp(std::pmr::new_delete_resource(), /*page_size=*/256);
+    survivor = tmp.claim<int>(10);
+    for (int i = 0; i < 10; ++i) survivor[i] = -i;
   }
-  for (std::size_t ord = 0; ord < N; ++ord)
-    for (std::size_t i = 0; i < volumes[ord]; ++i)
-      BOOST_CHECK_EQUAL(handles[ord][i], double(100 * ord + i));
+  // the aliasing handle keeps its page alive past the Arena
+  for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(survivor[i], -i);
 }
 
 BOOST_AUTO_TEST_CASE(arena_resource_is_identity_equal) {
   Arena a;
-  a.reserve(64);
   ArenaResource r1(&a);
   ArenaResource r2(&a);
   BOOST_CHECK(r1.is_equal(r1));
diff --git a/tests/arena_kernels.cpp b/tests/arena_kernels.cpp
index 4e278fd495..e6c1fe2a74 100644
--- a/tests/arena_kernels.cpp
+++ b/tests/arena_kernels.cpp
@@ -157,4 +157,62 @@ BOOST_AUTO_TEST_CASE(inner_permute_rank3_cell) {
   }
 }
 
+BOOST_AUTO_TEST_CASE(builder_matches_up_front_baseline) {
+  // build a ToT one cell at a time, then compare to the up-front baseline
+  outer_t baseline = make_tot(4, 5, 1.0);  // helper defined outside this hunk
+  TA::detail::ArenaToTBuilder<outer_t> b(TA::Range{4l});
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    inner_t& cell = b.emplace(ord, TA::Range{5l});  // bind, then fill below
+    for (std::size_t i = 0; i < 5; ++i)
+      cell.at_ordinal(i) = 1.0 + ord * 100.0 + double(i);
+  }
+  outer_t built = std::move(b).finish();  // b is moved-from after this
+  BOOST_CHECK(tot_equal(built, baseline));
+}
+
+BOOST_AUTO_TEST_CASE(builder_rolls_over_to_multiple_pages) {
+  // a small page forces the incremental arena to span several pages
+  const std::size_t N = 20;
+  TA::detail::ArenaToTBuilder<outer_t> b(TA::Range{static_cast<long>(N)},
+                                         /*batch=*/1, /*zero_init=*/false,
+                                         /*page_size=*/256);
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    inner_t& cell = b.emplace(ord, TA::Range{5l});  // 5 doubles = 40 B
+    for (std::size_t i = 0; i < 5; ++i)
+      cell.at_ordinal(i) = 1.0 + ord * 100.0 + double(i);
+  }
+  BOOST_CHECK_GT(b.arena().page_count(), 1u);  // 20 x 40 B > one 256 B page
+  outer_t built = std::move(b).finish();
+  BOOST_CHECK(tot_equal(built, make_tot(N, 5, 1.0)));
+}
+
+BOOST_AUTO_TEST_CASE(builder_single_cell_uses_one_exact_page) {
+  // corner case (b): a lone inner tensor -> one exactly-sized page
+  TA::detail::ArenaToTBuilder<outer_t> b(TA::Range{1l});
+  inner_t& cell = b.emplace(0, TA::Range{7l});  // the tile's only cell
+  for (std::size_t i = 0; i < 7; ++i) cell.at_ordinal(i) = double(i);
+  BOOST_CHECK_EQUAL(b.arena().page_count(), 1u);
+  BOOST_CHECK_EQUAL(b.arena().bytes_reserved(), 7u * sizeof(double));
+  outer_t built = std::move(b).finish();  // b is moved-from after this
+  BOOST_REQUIRE_EQUAL(built.range().volume(), 1u);
+  for (std::size_t i = 0; i < 7; ++i)
+    BOOST_CHECK_EQUAL(built.data()->at_ordinal(i), double(i));
+}
+
+BOOST_AUTO_TEST_CASE(compact_coalesces_a_multipage_tile) {
+  const std::size_t N = 16;
+  TA::detail::ArenaToTBuilder<outer_t> b(TA::Range{static_cast<long>(N)}, 1,
+                                         false, /*page_size=*/256);
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    inner_t& cell = b.emplace(ord, TA::Range{5l});  // 5 doubles = 40 B
+    for (std::size_t i = 0; i < 5; ++i)
+      cell.at_ordinal(i) = 1.0 + ord * 100.0 + double(i);
+  }
+  BOOST_CHECK_GT(b.arena().page_count(), 1u);  // 16 x 40 B > one 256 B page
+  outer_t multipage = std::move(b).finish();
+  outer_t compacted = TA::detail::arena_compact(multipage);  // into one slab
+  BOOST_CHECK(tot_equal(compacted, multipage));
+  BOOST_CHECK(tot_equal(compacted, make_tot(N, 5, 1.0)));
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp
index fea3046aa8..5b26678ad4 100644
--- a/tests/arena_tensor_kernels.cpp
+++ b/tests/arena_tensor_kernels.cpp
@@ -158,6 +158,64 @@ bool outers_equal(const Outer& a, const Outer& b) {
 
 }  // namespace
 
+BOOST_AUTO_TEST_CASE(builder_matches_up_front_baseline) {
+  // incremental one-pass construction of an ArenaTensor-celled outer tile
+  Outer baseline = make_outer(4, 8, 1.0);  // helper defined outside this hunk
+  TA::detail::ArenaToTBuilder<Outer> b(TA::Range{4});
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& cell = b.emplace(ord, TA::Range{8});  // bind, then fill below
+    for (std::size_t i = 0; i < 8; ++i)
+      cell.data()[i] = 1.0 + ord * 100.0 + double(i);
+  }
+  Outer built = std::move(b).finish();  // b is moved-from after this
+  BOOST_CHECK(outers_equal(built, baseline));
+}
+
+BOOST_AUTO_TEST_CASE(builder_rolls_over_to_multiple_pages) {
+  const std::size_t N = 10;
+  TA::detail::ArenaToTBuilder<Outer> b(TA::Range{static_cast<long>(N)}, 1,
+                                       /*zero_init=*/false,
+                                       /*page_size=*/Inner::cell_size(8) * 4);
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    Inner& cell = b.emplace(ord, TA::Range{8});
+    for (std::size_t i = 0; i < 8; ++i)
+      cell.data()[i] = 1.0 + ord * 100.0 + double(i);
+  }
+  BOOST_CHECK_GT(b.arena().page_count(), 1u);  // 10 cells, <= 4 per page
+  Outer built = std::move(b).finish();
+  BOOST_CHECK(outers_equal(built, make_outer(N, 8, 1.0)));
+}
+
+BOOST_AUTO_TEST_CASE(builder_single_cell_uses_one_exact_page) {
+  // corner case (b): a lone ArenaTensor cell -> one exactly-sized page
+  TA::detail::ArenaToTBuilder<Outer> b(TA::Range{1});
+  Inner& cell = b.emplace(0, TA::Range{7});  // the tile's only cell
+  for (std::size_t i = 0; i < 7; ++i) cell.data()[i] = double(i);
+  BOOST_CHECK_EQUAL(b.arena().page_count(), 1u);
+  BOOST_CHECK_EQUAL(b.arena().bytes_reserved(), Inner::cell_size(7));
+  Outer built = std::move(b).finish();  // b is moved-from after this
+  BOOST_REQUIRE_EQUAL(built.range().volume(), 1u);
+  BOOST_REQUIRE(bool(built.data()[0]));  // guards the reads below
+  for (std::size_t i = 0; i < 7; ++i)
+    BOOST_CHECK_EQUAL(built.data()[0].data()[i], double(i));
+}
+
+BOOST_AUTO_TEST_CASE(compact_coalesces_a_multipage_tile) {
+  const std::size_t N = 9;
+  TA::detail::ArenaToTBuilder<Outer> b(TA::Range{static_cast<long>(N)}, 1,
+                                       false, Inner::cell_size(6) * 4);
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    Inner& cell = b.emplace(ord, TA::Range{6});
+    for (std::size_t i = 0; i < 6; ++i)
+      cell.data()[i] = 1.0 + ord * 100.0 + double(i);
+  }
+  BOOST_CHECK_GT(b.arena().page_count(), 1u);  // 9 cells, <= 4 per page
+  Outer multipage = std::move(b).finish();
+  Outer compacted = TA::detail::arena_compact(multipage);  // into one slab
+  BOOST_CHECK(outers_equal(compacted, multipage));
+  BOOST_CHECK(outers_equal(compacted, make_outer(N, 6, 1.0)));
+}
+
 BOOST_AUTO_TEST_CASE(arena_tensor_is_a_tensor_but_a_view) {
   // ArenaTensor is registered as is_tensor_helper / is_contiguous_tensor so
   // kernel paths treat it like Tensor<double>; the `is_tensor_view` trait

From 3372118f9b0c58e13ac6871addda40f43a41f543 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 20 May 2026 04:59:23 +0900
Subject: [PATCH 2/4] test: DistArray-level incremental construction of arena
 ToT tiles

Add a test that builds a TA::DistArray<Tensor<ArenaTensor>> by calling
ArenaToTBuilder inside the init_tiles callback -- each outer tile's
inner cells are sized (jagged) and filled one at a time, with no
up-front range_fn. Confirms the incremental builder composes with
init_tiles and needs no new DistArray API.
---
 tests/arena_tensor_kernels.cpp | 37 ++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp
index 5b26678ad4..2e501c66db 100644
--- a/tests/arena_tensor_kernels.cpp
+++ b/tests/arena_tensor_kernels.cpp
@@ -470,6 +470,43 @@ BOOST_AUTO_TEST_CASE(distarray_arena_tensor_construct_and_init_tiles) {
   }
 }
 
+// DistArray-level incremental construction: each outer tile is built with
+// ArenaToTBuilder *inside* the init_tiles callback -- inner cells are sized
+// and filled one at a time, with no up-front range_fn. This needs no new
+// DistArray API: init_tiles already supplies a per-tile callback. Serial-only.
+BOOST_AUTO_TEST_CASE(distarray_arena_tensor_incremental_init_tiles) {
+  using Array = TA::DistArray<Outer, TA::DensePolicy>;
+  auto& world = TA::get_default_world();
+  TA::TiledRange tr{TA::TiledRange1{0, 2, 4}};  // boundaries 0|2|4: two tiles
+  Array A(world, tr);
+  A.init_tiles([](const TA::Range& tile_range) {
+    TA::detail::ArenaToTBuilder<Outer> b(tile_range);
+    const std::size_t n = tile_range.volume();
+    for (std::size_t ord = 0; ord < n; ++ord) {
+      // inner extent discovered per cell (jagged) -- no pre-walk
+      const std::size_t inner = 2 + ord;
+      Inner& cell = b.emplace(ord, TA::Range{static_cast<long>(inner)});
+      for (std::size_t i = 0; i < inner; ++i)
+        cell.data()[i] = double(ord * 10 + i);
+    }
+    return std::move(b).finish();
+  });
+  world.gop.fence();
+  BOOST_CHECK_EQUAL(A.trange().tiles_range().volume(), 2u);
+  for (std::size_t t = 0; t < 2; ++t) {
+    if (!A.is_local(t)) continue;  // only inspect locally-owned tiles
+    Outer tile = A.find(t).get();
+    const std::size_t n = tile.range().volume();
+    for (std::size_t ord = 0; ord < n; ++ord) {
+      const Inner& cell = tile.data()[ord];
+      BOOST_REQUIRE(bool(cell));  // guards the reads below
+      BOOST_CHECK_EQUAL(cell.size(), 2u + ord);  // mirrors `inner` above
+      for (std::size_t i = 0; i < cell.size(); ++i)
+        BOOST_CHECK_EQUAL(cell.data()[i], double(ord * 10 + i));
+    }
+  }
+}
+
 // Mixed scalar/ArenaTensor outer Hadamard: each scalar-side outer cell
 // multiplies the corresponding ArenaTensor-side inner element-wise.
 // Exercises Tensor<ArenaTensor>::mult(Tensor<scalar>) and the symmetric

From 176df8a2bdf291c75f6901c19272894044da2cc8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 20 May 2026 05:24:42 +0900
Subject: [PATCH 3/4] arena ToT: single-pass DistArray construction (drop
 buffering)

The arena-ToT construction paths pre-walked their cells twice: the
two-pass make_nested_tile invoked its source once to size each cell and
again to fill it, so callers with a single-pass source materialized the
whole outer tile into a temporary vector first. ArenaToTBuilder makes a
single ascending pass possible everywhere.

- make_nested_tile (arena_kernels.h): rebuilt on ArenaToTBuilder --
  inner_range_fn and inner_fill_fn are now interleaved per cell instead
  of two full passes; no separate all-ranges walk. Cells stay
  zero-initialized so the no-op-fill (shape-only) path is unchanged.
- DistArray::make_arena_nested_tile: rebuilt on ArenaToTBuilder;
  cell_source is invoked exactly once per cell in ascending order.
- DistArray::init_elements (arena branch): drops the std::vector<R>
  that collected every inner tensor of the outer tile before building.
- DistArray::set(i, InIter) (arena branch): drops the std::vector that
  buffered the single-pass iterator; it now feeds straight through.
- ArrayImpl retile (arena-ToT branch): builds each target tile with
  ArenaToTBuilder, one source-cell lookup per cell instead of two.

Eliminates a peak-memory doubling during construction (the temporary
held the whole tile's data alongside the arena slab). foreach /
make_array were also reviewed: both are tile-type-agnostic (the result
tile is default-constructed and the user op populates it) -- no
two-pass machinery there, nothing to relax.
---
 src/TiledArray/array_impl.h           | 30 ++++++------
 src/TiledArray/dist_array.h           | 67 +++++++++++++--------------
 src/TiledArray/tensor/arena_kernels.h | 49 +++++++++-----------
 3 files changed, 69 insertions(+), 77 deletions(-)

diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h
index 3b745fdd28..f6dff7f066 100644
--- a/src/TiledArray/array_impl.h
+++ b/src/TiledArray/array_impl.h
@@ -1001,7 +1001,6 @@ std::shared_ptr<ArrayImpl<Tile, Policy>> make_with_new_trange(
     // target cell at global outer element `e` takes its inner range and data
     // from the source cell at `e` (elements outside the source range, e.g. a
     // retile that grows the element range, yield null cells).
-    using inner_range_type = typename Tile::value_type::range_type;
     const auto& source_elements = source_array.trange().elements_range();
     std::map<std::size_t, Tile> src_tile_cache;
     auto source_cell_at =
@@ -1024,21 +1023,20 @@ std::shared_ptr<ArrayImpl<Tile, Policy>> make_with_new_trange(
     };
     for (const auto target_ord : *target_array.pmap()) {
       if (target_array.is_zero(target_ord)) continue;
-      Tile tile = make_nested_tile<Tile>(
-          target_trange.make_tile_range(target_ord),
-          [&](const auto& e) -> inner_range_type {
-            const auto* sc = source_cell_at(e);
-            return (sc && !sc->empty()) ? sc->range() : inner_range_type{};
-          },
-          [&](auto& cell, const auto& e) {
-            const auto* sc = source_cell_at(e);
-            if (sc && !sc->empty()) {
-              const auto* s = sc->data();
-              auto* d = cell.data();
-              for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p];
-            }
-          });
-      target_array.set(target_ord, std::move(tile));
+      // build each target tile in one pass: a single source lookup per cell
+      // sizes it and fills it together (no separate all-ranges walk).
+      const auto outer_range = target_trange.make_tile_range(target_ord);
+      ArenaToTBuilder<Tile> builder(outer_range);
+      const std::size_t n = outer_range.volume();
+      for (std::size_t o = 0; o < n; ++o) {
+        const auto* sc = source_cell_at(outer_range.idx(o));
+        if (!sc || sc->empty()) continue;  // leaves a deliberately-null cell
+        auto& cell = builder.emplace(o, sc->range());
+        const auto* s = sc->data();
+        auto* d = cell.data();
+        for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p];
+      }
+      target_array.set(target_ord, std::move(builder).finish());
     }
     target_array.world().gop.fence();
   } else {
diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h
index 6b994e17c0..c2994963df 100644
--- a/src/TiledArray/dist_array.h
+++ b/src/TiledArray/dist_array.h
@@ -811,18 +811,17 @@ class DistArray : public madness::archive::ParallelSerializableObject {
     check_index(i);
     if constexpr (detail::is_tensor_of_tensor_v<value_type> &&
                   is_arena_tensor_v<element_type>) {
-      // arena ToT: the iterated inner tiles carry the ranges needed to size
-      // the slab; buffer them (the iterator is single-pass) and build.
+      // arena ToT: each iterated inner tile carries the range that sizes its
+      // cell. make_arena_nested_tile pulls the source once per cell in
+      // ascending order, so the single-pass iterator feeds straight through.
       const auto outer_range = pimpl_->trange().make_tile_range(i);
       using SrcTile = std::decay_t<decltype(*first)>;
-      std::vector<SrcTile> buf;
-      buf.reserve(outer_range.volume());
-      for (std::size_t k = 0; k < outer_range.volume(); ++k, ++first)
-        buf.emplace_back(*first);
-      pimpl_->set(i, make_arena_nested_tile(
-                         outer_range, [&buf](std::size_t k) -> const SrcTile& {
-                           return buf[k];
-                         }));
+      pimpl_->set(i, make_arena_nested_tile(outer_range,
+                                            [&first](std::size_t) -> SrcTile {
+                                              SrcTile t = *first;
+                                              ++first;
+                                              return t;
+                                            }));
     } else {
       pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first));
     }
@@ -1165,15 +1164,11 @@ class DistArray : public madness::archive::ParallelSerializableObject {
                 std::is_assignable_v<element_type&, const R&>,
                 "DistArray::init_elements: op must return a freestanding "
                 "tensor assignable to the inner tile type");
-            // pass 1: collect op's freestanding inner tensors; pass 2:
-            // make_arena_nested_tile sizes the slab and deep-copies them in
-            std::vector<R> collected;
-            collected.reserve(outer_range.volume());
-            for (std::size_t o = 0; o < outer_range.volume(); ++o)
-              collected.emplace_back(op(outer_range.idx(o)));
+            // single pass: make_arena_nested_tile pulls each cell once, in
+            // ascending order, so op runs once per cell with no buffer
             return make_arena_nested_tile(
-                outer_range, [&collected](std::size_t k) -> const R& {
-                  return collected[k];
+                outer_range, [&op, &outer_range](std::size_t k) -> R {
+                  return op(outer_range.idx(k));
                 });
           },
           skip_set);
@@ -1927,29 +1922,31 @@ class DistArray : public madness::archive::ParallelSerializableObject {
 
   /// Engine behind the arena-ToT paths of \c init_elements and \c set:
   /// \p cell_source(ordinal) returns a freestanding tensor whose range sizes
-  /// inner cell \p ordinal and whose data fills it. The slab is allocated by
-  /// \c detail::make_nested_tile and each cell deep-copies its source.
+  /// inner cell \p ordinal and whose data fills it. Built in one pass with
+  /// \c detail::ArenaToTBuilder; \p cell_source is invoked exactly once per
+  /// cell, in ascending ordinal order, so a single-pass source (a generator
+  /// op or an input iterator) can be fed straight through without buffering.
   /// \param[in] outer_range the outer tile's range
   /// \param[in] cell_source maps a cell ordinal to its source tensor
   template <typename CellSource>
   static value_type make_arena_nested_tile(const TiledArray::Range& outer_range,
                                            CellSource&& cell_source) {
     using InnerRange = typename element_type::range_type;
-    return detail::make_nested_tile<value_type>(
-        outer_range,
-        [&](const auto& idx) -> InnerRange {
-          // the inner-cell range type is built from an extent list -- it is
-          // not constructible from a foreign range type
-          const auto& src = cell_source(outer_range.ordinal(idx)).range();
-          const auto& src_ext = src.extent();
-          std::vector<std::size_t> ext(src.rank());
-          for (std::size_t d = 0; d < src.rank(); ++d)
-            ext[d] = static_cast<std::size_t>(src_ext[d]);
-          return InnerRange(ext);
-        },
-        [&](auto& cell, const auto& idx) {
-          cell = cell_source(outer_range.ordinal(idx));
-        });
+    detail::ArenaToTBuilder<value_type> builder(outer_range);
+    const std::size_t n = outer_range.volume();  // number of outer cells
+    for (std::size_t k = 0; k < n; ++k) {
+      const auto& src = cell_source(k);  // ref extends a by-value result
+      // the inner-cell range type is built from an extent list -- it is not
+      // constructible from a foreign range type
+      const auto& src_range = src.range();
+      const auto& src_ext = src_range.extent();
+      std::vector<std::size_t> ext(src_range.rank());
+      for (std::size_t d = 0; d < src_range.rank(); ++d)
+        ext[d] = static_cast<std::size_t>(src_ext[d]);
+      auto& cell = builder.emplace(k, InnerRange(ext));
+      if (!cell.empty()) cell = src;  // deep copy into the bound arena cell
+    }
+    return std::move(builder).finish();
   }
 
   /// Code factorization of the actual assert for the other overloads
diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h
index b74384e57f..f5277ec749 100644
--- a/src/TiledArray/tensor/arena_kernels.h
+++ b/src/TiledArray/tensor/arena_kernels.h
@@ -148,32 +148,6 @@ struct nested_fill_noop {
   void operator()(Cell&, const Index&) const noexcept {}
 };
 
-/// Build one ToT outer tile over `outer_range`, two-pass:
-///   pass 1: `inner_range_fn(outer_element_index)` -> inner `range_type`
-///           sizes every inner cell (zero-volume -> deliberately-null cell);
-///   pass 2: `inner_fill_fn(inner_cell&, outer_element_index)` fills each
-///           non-null cell. The default fill leaves storage zero-initialized.
-/// Dispatches internally on the inner-tile type (see `arena_outer_init`).
-template <typename OuterTensor, typename InnerRangeFn,
-          typename InnerFillFn = nested_fill_noop>
-OuterTensor make_nested_tile(
-    const typename OuterTensor::range_type& outer_range,
-    InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) {
-  // arena_outer_init keys ranges on the cell ordinal; user code keys on the
-  // (global) outer element index -- translate via the outer range.
-  auto cell_range_fn = [&](std::size_t ord) {
-    return inner_range_fn(outer_range.idx(ord));
-  };
-  OuterTensor result =
-      arena_outer_init<OuterTensor>(outer_range, 1, cell_range_fn);
-  const std::size_t N = outer_range.volume();
-  for (std::size_t ord = 0; ord < N; ++ord) {
-    auto& cell = result.data()[ord];
-    if (!cell.empty()) inner_fill_fn(cell, outer_range.idx(ord));
-  }
-  return result;
-}
-
 /// One-pass incremental builder for an arena-backed ToT outer tile.
 ///
 /// `make_nested_tile` / `arena_outer_init` pre-walk every inner range before
@@ -261,6 +235,29 @@ class ArenaToTBuilder {
   std::shared_ptr<inner_t[]> data_;
 };
 
+/// Build one ToT outer tile over `outer_range` in a single pass: each inner
+/// cell is sized by `inner_range_fn(outer_element_index)` and immediately
+/// filled by `inner_fill_fn(inner_cell&, outer_element_index)` before moving
+/// to the next -- no separate all-ranges walk. A zero-volume inner range
+/// yields a deliberately-null cell, which `inner_fill_fn` is not invoked on.
+/// Cells are zero-initialized, so the default no-op fill still leaves zeroed
+/// storage. Backed by `ArenaToTBuilder`.
+template <typename OuterTensor, typename InnerRangeFn,
+          typename InnerFillFn = nested_fill_noop>
+OuterTensor make_nested_tile(
+    const typename OuterTensor::range_type& outer_range,
+    InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) {
+  ArenaToTBuilder<OuterTensor> builder(outer_range, /*batch_sz=*/1,
+                                       /*zero_init=*/true);
+  const std::size_t N = outer_range.volume();  // number of outer cells
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    const auto idx = outer_range.idx(ord);  // ordinal -> outer element index
+    auto& cell = builder.emplace(ord, inner_range_fn(idx));
+    if (!cell.empty()) inner_fill_fn(cell, idx);  // empty cells are not filled
+  }
+  return std::move(builder).finish();
+}
+
 /// Apply a unary fill op while preserving each source inner range.
 /// `fill_op(dst_data, src_data, n_elements)` writes the result cell.
 template <typename OuterTensor, typename SrcOuterTensor, typename FillOp>

From a02f28e60e49e0a0134f78eb08492ec6a338a96f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 20 May 2026 05:47:19 +0900
Subject: [PATCH 4/4] arena: preserve rank>0 zero-volume ranges; fix stale view
 comment

ArenaToTBuilder::emplace, given a zero-volume range, used to leave the
cell default/null. For an owning (non-view) inner, doing so drops the
range metadata, whereas arena_outer_init keeps a rank>0 zero-volume
range as an empty-but-ranked tensor and only collapses a rank-0 range
to null. Since make_nested_tile now routes through the builder, mirror
that handling so a TA::Tensor inner with e.g. Range{0} stays an empty
rank-1 tensor. Arena view inners (which cannot carry a standalone
range) still go null. Adds a regression test.

Also drops a stale type_traits.h comment that listed TensorInterface as
an is_tensor_view specialization -- it is deliberately not a view.
---
 src/TiledArray/tensor/arena_kernels.h | 17 +++++++++++++----
 src/TiledArray/tensor/type_traits.h   |  8 +++++---
 tests/arena_kernels.cpp               | 15 +++++++++++++++
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h
index f5277ec749..4db5212ef3 100644
--- a/src/TiledArray/tensor/arena_kernels.h
+++ b/src/TiledArray/tensor/arena_kernels.h
@@ -188,14 +188,23 @@ class ArenaToTBuilder {
 
   /// Size and bind the inner cell at outer cell ordinal `ord` to
   /// `inner_range`, returning a reference to the bound cell for the caller to
-  /// fill. A zero-volume range leaves the cell null. Outer element indices
-  /// translate via `outer_range().ordinal(idx)`.
+  /// fill. A zero-volume range yields an empty cell: an owning inner keeps a
+  /// rank>0 range (a rank-0 range stays null), a view inner stays null.
+  /// Outer element indices translate via `outer_range().ordinal(idx)`.
   inner_t& emplace(std::size_t ord, inner_range_t inner_range) {
     TA_ASSERT(ord < n_cells_);
     inner_t& cell = data_[ord];
-    const std::size_t vol = inner_range.volume();
-    if (vol == 0) return cell;  // stays null
     constexpr bool arena = is_arena_tensor_v<inner_t>;
+    const std::size_t vol = inner_range.volume();
+    if (vol == 0) {
+      // Mirror arena_outer_init: an owning (non-view) inner preserves a
+      // rank>0 zero-volume range as an empty-but-ranked tensor; a rank-0
+      // range -- and any arena view inner -- leaves the cell default/null.
+      if constexpr (!arena) {
+        if (inner_range.rank() != 0) cell = inner_t(std::move(inner_range));
+      }
+      return cell;
+    }
     std::size_t stride;
     std::size_t bytes;
     if constexpr (arena) {
diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h
index 1a457e61b1..ac622675e7 100644
--- a/src/TiledArray/tensor/type_traits.h
+++ b/src/TiledArray/tensor/type_traits.h
@@ -120,9 +120,11 @@ inline constexpr const bool is_nested_tensor_v = is_nested_tensor<Ts...>::value;
 }  // namespace detail
 
 /// Forward decl for the tensor-view predicate. Specializations live in
-/// `tensor/arena_tensor.h` (`ArenaTensor`, `detail::TensorInterface`) and
-/// `external/btas.h` (`btas::TensorView`). Declared here so the operator-body
-/// predicates below can consult it without including arena_tensor.h.
+/// `tensor/arena_tensor.h` (`ArenaTensor`) and `external/btas.h`
+/// (`btas::TensorView`). Declared here so the operator-body predicates below
+/// can consult it without including arena_tensor.h. Note `TensorInterface` /
+/// `TensorMap` is deliberately *not* a view here -- it has value-returning
+/// member arithmetic (see arena_tensor.h).
 template <typename T>
 struct is_tensor_view : std::false_type {};
 template <typename T>
diff --git a/tests/arena_kernels.cpp b/tests/arena_kernels.cpp
index e6c1fe2a74..22fe5bc155 100644
--- a/tests/arena_kernels.cpp
+++ b/tests/arena_kernels.cpp
@@ -199,6 +199,21 @@ BOOST_AUTO_TEST_CASE(builder_single_cell_uses_one_exact_page) {
     BOOST_CHECK_EQUAL(built.data()->at_ordinal(i), double(i));
 }
 
+BOOST_AUTO_TEST_CASE(builder_zero_volume_nonscalar_range_keeps_rank) {
+  // an owning inner given a zero-volume but rank>0 range keeps that range
+  // (mirrors arena_outer_init): the rank-1 range is preserved rather than
+  // collapsed to a rank-0 null cell
+  TA::detail::ArenaToTBuilder<outer_t> b(TA::Range{2l});
+  inner_t& c0 = b.emplace(0, TA::Range{0l});  // rank 1, extent 0, volume 0
+  inner_t& c1 = b.emplace(1, TA::Range{3l});  // ordinary non-empty cell
+  BOOST_CHECK_EQUAL(c0.range().rank(), 1u);
+  BOOST_CHECK_EQUAL(c0.range().volume(), 0u);
+  BOOST_CHECK(!c1.empty());
+  outer_t built = std::move(b).finish();  // b is moved-from after this
+  BOOST_CHECK_EQUAL(built.data()[0].range().rank(), 1u);  // kept by finish()
+  BOOST_CHECK_EQUAL(built.data()[1].range().volume(), 3u);
+}
+
 BOOST_AUTO_TEST_CASE(compact_coalesces_a_multipage_tile) {
   const std::size_t N = 16;
   TA::detail::ArenaToTBuilder<outer_t> b(TA::Range{static_cast<long>(N)}, 1,