From bbdf2d7591a133bb52457d474ec63a817c5e82e2 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 21 May 2026 12:10:39 +0900
Subject: [PATCH 1/2] einsum: fence sub-Worlds before destruction (incl. on
 exception unwind)

Add an inline RAII guard `FenceSubWorldsOnExit` to the generalized-
contraction path of einsum, declared right after the `worlds` vector so
it destructs *before* `worlds` (LIFO) and *after* AB/C. On normal exit
this is a final harmless drain; on exception unwind it drains any
`lazy_sync_children` tasks that ~DistArray scheduled via lazy_deleter
on sub-World taskqs before those sub-Worlds are torn down. Without
this, those tasks survive into the global ThreadPool past ~World, then
trip ~WorldObject's `World::exists(&world)` assertion when an enclosing
scope's fence runs them, masking the real exception with a cryptic
abort.

One fence per sub-World suffices because lazy_deleter now bypasses
lazy_sync when invoked from `do_cleanup` (gated by
`world.gop.is_in_do_cleanup()`): the deferred-cleanup path performs
direct deletes rather than scheduling cross-rank tasks. The remaining
tasks this fence has to drain come only from non-deferred ~DistArray
calls (e.g. AB during exception unwind), and all participating ranks
of a sub-World reach this RAII guard in lockstep so their lazy_sync
handshakes match up.
---
 src/TiledArray/einsum/tiledarray.h | 32 ++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h
index 45229f89c6..d87cf3ff04 100644
--- a/src/TiledArray/einsum/tiledarray.h
+++ b/src/TiledArray/einsum/tiledarray.h
@@ -653,6 +653,38 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
     // dead World (e.g. while unwinding an exception thrown mid-contraction).
     std::vector<std::shared_ptr<World>> worlds;
 
+    // RAII fencer: on normal exit and (critically) on exception unwind,
+    // fence every live sub-World before it is destroyed. ~DistArray ->
+    // lazy_deleter calls world.gop.lazy_sync(...) which enqueues a
+    // lazy_sync_children task onto the sub-World's taskq; without a fence
+    // those tasks survive into the global ThreadPool past the sub-World's
+    // ~World, then trip ~WorldObject's `World::exists(&world)` assertion
+    // when some later fence (e.g. an enclosing scope's fence run during
+    // unwind) picks them up. Declared *after* `worlds` so it destructs
+    // *before* `worlds` (LIFO); destructs *after* AB/C so it sees the
+    // tasks they scheduled via lazy_deleter.
+    //
+    // One fence per sub-World is sufficient: lazy_deleter's fast path
+    // skips lazy_sync when invoked from inside fence_impl's do_cleanup
+    // (gated by `world.gop.is_in_do_cleanup()`), so the deferred-cleanup
+    // path performs direct deletes rather than scheduling cross-rank
+    // tasks. Tasks scheduled by *non*-deferred ~DistArray calls (e.g. AB
+    // during exception unwind) are drained by this fence's drain loop;
+    // all participating ranks of a sub-World reach this RAII guard in
+    // lockstep at function exit, so their lazy_sync handshakes match up.
+    struct FenceSubWorldsOnExit {
+      std::vector<std::shared_ptr<World>> &worlds_;  // borrowed, not owned
+      ~FenceSubWorldsOnExit() {
+        for (auto &w : worlds_) {
+          if (!w) continue;  // null entry: nothing to fence
+          try {
+            w->gop.fence();
+          } catch (...) {  // swallow: a throwing dtor would std::terminate
+          }
+        }
+      }
+    } fence_subworlds_on_exit{worlds};
+
     std::tuple<ArrayTerm<ArrayA>, ArrayTerm<ArrayB>> AB{{A.array(), a},
                                                         {B.array(), b}};
 

From 31800a9e65d0410b90ed959005993222718ea7a8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 21 May 2026 12:10:40 +0900
Subject: [PATCH 2/2] cont_engine: view ToTxToT (outer Contraction, inner
 Hadamard) via arena

Add the (outer Contraction, inner Hadamard) case to
init_inner_tile_op's view-cell branch. Mirrors the owning-tile path in
init_inner_tile_op_owning_: arena_plan_ uses the `left_range` plan to
shape each result cell from a non-empty left inner cell, and the
per-cell op accumulates `r += l * rr` -- or `r += (l * rr) * factor_`
when scaled -- via fused_hadamard_inplace into the pre-shaped view
cell. No value-returning per-cell op is needed, so this works for view
cells (e.g. ArenaTensor); non-identity inner result permutation is
rejected (the owning fallback that materializes a permuted return cell
cannot run for views).

Previously this case threw "nested non-contraction product on view
inner tiles is not yet supported", aborting expressions such as
`C(i_3,i_4;a<...>) = A(i_3;a<...>) * B(i_4;a<...>)` over ArenaTensor
inner cells -- the typical sub-product inside einsum's generalized
contraction loop for ToTxToT with Hadamard-outer/Hadamard-inner shapes.
---
 src/TiledArray/expressions/cont_engine.h | 67 ++++++++++++++++++++----
 1 file changed, 58 insertions(+), 9 deletions(-)

diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h
index ee2f721aa3..867d18d3a9 100644
--- a/src/TiledArray/expressions/cont_engine.h
+++ b/src/TiledArray/expressions/cont_engine.h
@@ -547,20 +547,69 @@ class ContEngine : public BinaryEngine<Derived> {
                     TiledArray::is_tensor_view_v<result_tile_element_type>) {
         // ToT x ToT with non-owning view inner cells (e.g. ArenaTensor). A
         // view cell cannot host a value-returning inner op, so the
-        // owning-cell inner-op builder cannot be used. Two nested products
-        // are supported here:
-        //  - the elementwise pure Hadamard, where the inner element op is
-        //    unused anyway -- MultEngine::make_tile_op passes none and the
-        //    outer Mult tile op recurses through Tensor<view>::mult -- so
-        //    element_*_op_ is left null;
-        //  - the inner contraction (incl. inner outer-product), routed
-        //    through the arena fast path: it writes results in place into
-        //    pre-shaped view cells, so only element_nonreturn_op_ is needed.
+        // owning-cell inner-op builder cannot be used. The supported nested
+        // products are:
+        //  - the elementwise pure Hadamard (outer Hadamard, inner Hadamard),
+        //    where the inner element op is unused anyway -- MultEngine::
+        //    make_tile_op passes none and the outer Mult tile op recurses
+        //    through Tensor<view>::mult -- so element_*_op_ is left null;
+        //  - inner Hadamard under outer Contraction, routed through the
+        //    arena fast path with a left_range plan and a per-cell
+        //    `r += l * rr` (optionally scaled) op: result cells are
+        //    pre-shaped from non-empty left cells, then accumulated in
+        //    place over the K-panel;
+        //  - inner Contraction (incl. inner outer-product) under either
+        //    outer regime, routed through the arena fast path: it writes
+        //    results in place into pre-shaped view cells, so only
+        //    element_nonreturn_op_ is needed.
         // Every other nested product is deferred.
         const auto inner_prod = this->inner_product_type();
         if (inner_prod == TensorProduct::Hadamard &&
             this->product_type() == TensorProduct::Hadamard) {
           // pure Hadamard: element_*_op_ left null
+        } else if (inner_prod == TensorProduct::Hadamard &&
+                   this->product_type() == TensorProduct::Contraction) {
+          // outer Contraction + inner Hadamard on view inner tiles.
+          // Mirror the owning-tile path (init_inner_tile_op_owning_): the
+          // SUMMA shapes each result cell from a non-empty left inner cell
+          // (left_range plan), and the per-cell op accumulates `r += l * rr`
+          // -- or `r += (l * rr) * factor_` when scaled -- via
+          // fused_hadamard_inplace into the pre-shaped view cell. No
+          // value-returning per-cell op is needed, so this works for view
+          // cells; non-identity inner result permutation is rejected here
+          // (the owning fallback that materializes a permuted return cell
+          // cannot run for views).
+          constexpr bool arena_eligible_h_view =
+              TiledArray::detail::is_contraction_arena_tot_v<
+                  result_tile_type, left_tile_type, right_tile_type>;
+          if constexpr (!arena_eligible_h_view) {
+            TA_EXCEPTION(
+                "nested Hadamard on view inner tiles is supported only for "
+                "arena-backed tensors-of-tensors");
+          } else {
+            this->arena_plan_ = TiledArray::detail::make_contraction_arena_plan<
+                result_tile_type, left_tile_type, right_tile_type>(
+                TiledArray::detail::ArenaInnerShapeKind::left_range,
+                std::nullopt, inner(this->perm_));
+            if (!bool(this->arena_plan_))
+              TA_EXCEPTION(
+                  "nested Hadamard on view inner tiles: the arena fast path "
+                  "was inactive (arena disabled, or a non-identity inner "
+                  "result permutation -- not yet supported on view cells)");
+            if (this->factor_ == scalar_type{1}) {
+              this->element_nonreturn_op_ =
+                  TiledArray::detail::make_fused_hadamard_lambda<
+                      result_tile_element_type, left_tile_element_type,
+                      right_tile_element_type>();
+            } else {
+              this->element_nonreturn_op_ =
+                  TiledArray::detail::make_fused_hadamard_scaled_lambda<
+                      result_tile_element_type, left_tile_element_type,
+                      right_tile_element_type>(this->factor_);
+            }
+          }
+          // element_return_op_ left null: a view cell cannot be
+          // value-returned (see the init_struct precondition check).
         } else if (inner_prod == TensorProduct::Contraction) {
           using op_type = TiledArray::detail::ContractReduce<
               result_tile_element_type, left_tile_element_type,