From 4d504b5cb5b974c98e1396a14e35535a81fe09c3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 May 2026 12:09:50 +0900 Subject: [PATCH 1/2] external: bump pinned MADNESS to 666765ca6 (m-a-d-n-e-s-s/madness#695) --- external/versions.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external/versions.cmake b/external/versions.cmake index 78c015ec79..f5565d299c 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -12,8 +12,8 @@ set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6 set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) set(TA_TRACKED_MADNESS_URL https://github.com/m-a-d-n-e-s-s/madness.git CACHE STRING "GIT_REPOSITORY for cloning MADNESS source") -set(TA_TRACKED_MADNESS_TAG f7aa1401e CACHE STRING "GIT_TAG (branch or hash) for cloning MADNESS") -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 7d8aaf9d51981e4accf4d84742270d1473f8ca2e) +set(TA_TRACKED_MADNESS_TAG 666765ca6 CACHE STRING "GIT_TAG (branch or hash) for cloning MADNESS") +set(TA_TRACKED_MADNESS_PREVIOUS_TAG f7aa1401e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From d294ecb2df1c57f55ca9a5df2374cb71271afaca Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 May 2026 12:10:19 +0900 Subject: [PATCH 2/2] DistArray::lazy_deleter: skip lazy_sync when invoked from fence's do_cleanup Use the new MADNESS `WorldGopInterface::is_in_do_cleanup()` flag to short-circuit the cross-rank `lazy_sync` handshake when `lazy_deleter` is called from inside `fence_impl`'s deferred-cleanup phase: `delete pimpl` directly, decrement `cleanup_counter_`, return. Why it is safe: - `fence_impl` runs the global-termination protocol before calling `deferred_->do_cleanup()`, so all ranks are at the same point with no AM in flight. 
- `defer_deleter_to_next_fence` is, by contract, used collectively, so every rank's deferred list holds the same set of pimpls at this point and every rank performs the matching delete in lockstep. - The `lazy_sync` handshake exists to guarantee that no peer is still about to send an active message (AM) addressed to this object before we delete it; the fence already establishes that. Why it matters: the original `lazy_sync` path enqueues a `lazy_sync_children` task on this world's taskq *after* the fence's drain loop has exited. Such tasks survive the fence and are picked up later by some other fence that drives the global ThreadPool. If the world is destroyed in the meantime (e.g., einsum's per-Hadamard sub-Worlds torn down at function exit or during exception unwind), the stranded task runs `delete pimpl` against a world whose taskq and gop have already been freed; `~WorldObject` then trips its `World::exists(&world)` assertion and aborts, masking any real error. The fast path avoids ever scheduling that task. The general (non-deferred) path is unchanged: `lazy_deleter` invoked outside `do_cleanup` still goes through `lazy_sync` because we cannot rely on synchronization with peers in that case. 
--- src/TiledArray/array_impl.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index f6dff7f066..ecec74fc21 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -480,6 +480,24 @@ class ArrayImpl : public TensorImpl, // wait for all DelayedSet's to vanish world.await([&]() { return (pimpl->num_live_ds() == 0); }, true); + // Fast path when invoked from inside the fence's deferred-cleanup + // phase: the global-termination protocol has already established + // global quiescence (no in-flight AM, all ranks at the same point), + // and symmetric collective use of `defer_deleter_to_next_fence()` + // guarantees every rank has this same pimpl in its deferred list + // and so reaches this same delete in lockstep. The cross-rank + // lazy_sync handshake below is therefore redundant; it would also + // schedule a lazy_sync_children task on this world's taskq that the + // fence cannot drain (do_cleanup runs after the drain loop) and + // that would later be run by some unrelated fence -- against freed + // state if this world is destroyed before then (e.g. einsum's + // per-Hadamard sub-Worlds). + if (world.gop.is_in_do_cleanup()) { + delete pimpl; + cleanup_counter_--; + return; + } + try { world.gop.lazy_sync(id, [pimpl]() { delete pimpl;