From bbdf2d7591a133bb52457d474ec63a817c5e82e2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 May 2026 12:10:39 +0900 Subject: [PATCH 1/2] einsum: fence sub-Worlds before destruction (incl. on exception unwind) Add an inline RAII guard `FenceSubWorldsOnExit` to the generalized-contraction path of einsum, declared right after the `worlds` vector so it destructs *before* `worlds` (LIFO) and *after* AB/C. On normal exit this is a final harmless drain; on exception unwind it drains any `lazy_sync_children` tasks that ~DistArray scheduled via lazy_deleter on sub-World taskqs before those sub-Worlds are torn down. Without this, those tasks survive into the global ThreadPool past ~World, then trip ~WorldObject's `World::exists(&world)` assertion when an enclosing scope's fence runs them, masking the real exception with a cryptic abort. One fence per sub-World suffices because lazy_deleter now bypasses lazy_sync when invoked from `do_cleanup` (gated by `world.gop.is_in_do_cleanup()`): the deferred-cleanup path performs direct deletes rather than scheduling cross-rank tasks. The remaining tasks this fence has to drain come only from non-deferred ~DistArray calls (e.g. AB during exception unwind), and all participating ranks of a sub-World reach this RAII guard in lockstep so their lazy_sync handshakes match up. --- src/TiledArray/einsum/tiledarray.h | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 45229f89c6..d87cf3ff04 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -653,6 +653,38 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // dead World (e.g. while unwinding an exception thrown mid-contraction). std::vector> worlds; + // RAII fencer: on normal exit and (critically) on exception unwind, + // fence every live sub-World before it is destroyed.
~DistArray -> + // lazy_deleter calls world.gop.lazy_sync(...) which enqueues a + // lazy_sync_children task onto the sub-World's taskq; without a fence + // those tasks survive into the global ThreadPool past the sub-World's + // ~World, then trip ~WorldObject's `World::exists(&world)` assertion + // when some later fence (e.g. an enclosing scope's fence run during + // unwind) picks them up. Declared *after* `worlds` so it destructs + // *before* `worlds` (LIFO); destructs *after* AB/C so it sees the + // tasks they scheduled via lazy_deleter. + // + // One fence per sub-World is sufficient: lazy_deleter's fast path + // skips lazy_sync when invoked from inside fence_impl's do_cleanup + // (gated by `world.gop.is_in_do_cleanup()`), so the deferred-cleanup + // path performs direct deletes rather than scheduling cross-rank + // tasks. Tasks scheduled by *non*-deferred ~DistArray's (e.g. AB + // during exception unwind) are drained by this fence's drain loop; + // all participating ranks of a sub-World reach this RAII guard in + // lockstep at function exit, so their lazy_sync handshakes match up. + struct FenceSubWorldsOnExit { + std::vector> &worlds_; + ~FenceSubWorldsOnExit() { + for (auto &w : worlds_) { + if (!w) continue; + try { + w->gop.fence(); + } catch (...) { + } + } + } + } fence_subworlds_on_exit{worlds}; + std::tuple, ArrayTerm> AB{{A.array(), a}, {B.array(), b}}; From 31800a9e65d0410b90ed959005993222718ea7a8 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 May 2026 12:10:40 +0900 Subject: [PATCH 2/2] cont_engine: view ToTxToT (outer Contraction, inner Hadamard) via arena Add the (outer Contraction, inner Hadamard) case to init_inner_tile_op's view-cell branch. 
Mirrors the owning-tile path in init_inner_tile_op_owning_: arena_plan_ uses the `left_range` plan to shape each result cell from a non-empty left inner cell, and the per-cell op accumulates `r += l * rr` -- or `r += (l * rr) * factor_` when scaled -- via fused_hadamard_inplace into the pre-shaped view cell. No value-returning per-cell op is needed, so this works for view cells (e.g. ArenaTensor); non-identity inner result permutation is rejected (the owning fallback that materializes a permuted return cell cannot run for views). Previously this case threw "nested non-contraction product on view inner tiles is not yet supported", aborting expressions such as `C(i_3,i_4;a<...>) = A(i_3;a<...>) * B(i_4;a<...>)` over ArenaTensor inner cells -- the typical sub-product inside einsum's generalized contraction loop for ToTxToT with Hadamard outer-Hadamard inner shapes. --- src/TiledArray/expressions/cont_engine.h | 67 ++++++++++++++++++++---- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index ee2f721aa3..867d18d3a9 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -547,20 +547,69 @@ class ContEngine : public BinaryEngine { TiledArray::is_tensor_view_v) { // ToT x ToT with non-owning view inner cells (e.g. ArenaTensor). A // view cell cannot host a value-returning inner op, so the - // owning-cell inner-op builder cannot be used. Two nested products - // are supported here: - // - the elementwise pure Hadamard, where the inner element op is - // unused anyway -- MultEngine::make_tile_op passes none and the - // outer Mult tile op recurses through Tensor::mult -- so - // element_*_op_ is left null; - // - the inner contraction (incl. inner outer-product), routed - // through the arena fast path: it writes results in place into - // pre-shaped view cells, so only element_nonreturn_op_ is needed. 
+ // owning-cell inner-op builder cannot be used. The supported nested + // products are: + // - the elementwise pure Hadamard (outer Hadamard, inner Hadamard), + // where the inner element op is unused anyway -- MultEngine:: + // make_tile_op passes none and the outer Mult tile op recurses + // through Tensor::mult -- so element_*_op_ is left null; + // - inner Hadamard under outer Contraction, routed through the + // arena fast path with a left_range plan and a per-cell + // `r += l * rr` (optionally scaled) op: result cells are + // pre-shaped from non-empty left cells, then accumulated in + // place over the K-panel; + // - inner Contraction (incl. inner outer-product) under either + // outer regime, routed through the arena fast path: it writes + // results in place into pre-shaped view cells, so only + // element_nonreturn_op_ is needed. // Every other nested product is deferred. const auto inner_prod = this->inner_product_type(); if (inner_prod == TensorProduct::Hadamard && this->product_type() == TensorProduct::Hadamard) { // pure Hadamard: element_*_op_ left null + } else if (inner_prod == TensorProduct::Hadamard && + this->product_type() == TensorProduct::Contraction) { + // outer Contraction + inner Hadamard on view inner tiles. + // Mirror the owning-tile path (init_inner_tile_op_owning_): the + // SUMMA shapes each result cell from a non-empty left inner cell + // (left_range plan), and the per-cell op accumulates `r += l * rr` + // -- or `r += (l * rr) * factor_` when scaled -- via + // fused_hadamard_inplace into the pre-shaped view cell. No + // value-returning per-cell op is needed, so this works for view + // cells; non-identity inner result permutation is rejected here + // (the owning fallback that materializes a permuted return cell + // cannot run for views). 
+ constexpr bool arena_eligible_h_view = + TiledArray::detail::is_contraction_arena_tot_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (!arena_eligible_h_view) { + TA_EXCEPTION( + "nested Hadamard on view inner tiles is supported only for " + "arena-backed tensors-of-tensors"); + } else { + this->arena_plan_ = TiledArray::detail::make_contraction_arena_plan< + result_tile_type, left_tile_type, right_tile_type>( + TiledArray::detail::ArenaInnerShapeKind::left_range, + std::nullopt, inner(this->perm_)); + if (!bool(this->arena_plan_)) + TA_EXCEPTION( + "nested Hadamard on view inner tiles: the arena fast path " + "was inactive (arena disabled, or a non-identity inner " + "result permutation -- not yet supported on view cells)"); + if (this->factor_ == scalar_type{1}) { + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_hadamard_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(); + } else { + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_hadamard_scaled_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(this->factor_); + } + } + // element_return_op_ left null: a view cell cannot be + // value-returned (see the init_struct precondition check). } else if (inner_prod == TensorProduct::Contraction) { using op_type = TiledArray::detail::ContractReduce< result_tile_element_type, left_tile_element_type,