From 216e0d87424a0371411debab1f8c3397d6c00222 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 18 Feb 2026 13:30:59 -0800 Subject: [PATCH 1/5] Segmenter container sharing: segment Fusions share completeFusion's IrContainer Add protected Fusion constructor that accepts an existing shared_ptr, allowing SegmentedFusion::makeFusion() to share the complete Fusion's container instead of creating a new one per segment. Changes: - fusion.h: Add protected Fusion(shared_ptr) constructor declaration - fusion.cpp: Implement the shared-container constructor - ir/container.cpp: Pre-allocate per_fusion_vals_/per_fusion_exprs_ in addFusion() to prevent rehash races during concurrent segment compilation - fusion_segmenter.cpp: Use shared container constructor in makeFusion() This enables real cross-thread lock contention on the IrContainer mutex during parallel segment compilation, validating Phase 2's shared_ptr + mutex infrastructure under realistic multi-Fusion sharing conditions. --- csrc/fusion.cpp | 6 ++++++ csrc/fusion.h | 4 ++++ csrc/fusion_segmenter.cpp | 3 ++- csrc/ir/container.cpp | 2 ++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index e33a3a39418..43e59619509 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -520,6 +520,12 @@ Fusion::Fusion() : ir_container_(std::make_shared()) { ir_container_->addFusion(this); } +// Shared-container constructor -- creates empty Fusion using existing container +Fusion::Fusion(std::shared_ptr container) + : ir_container_(std::move(container)) { + ir_container_->addFusion(this); +} + // Copy constructor -- shares the source's container Fusion::Fusion(const Fusion& other) : ir_container_(other.ir_container_) { FUSER_PERF_SCOPE("Fusion copy"); diff --git a/csrc/fusion.h b/csrc/fusion.h index 8044a415305..695e31f154f 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -600,6 +600,10 @@ class NVF_API Fusion : public PolymorphicBase { friend class TranslateApplicableWelford; friend Val; + //! Constructor that shares an existing container. Creates an empty Fusion + //! registered with the shared container. Used by makeFusion for sharing. + explicit Fusion(std::shared_ptr container); + //! Register the Val with this fusion virtual void registerVal(Val* val); diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 64c4c7fae93..0736b560078 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -1801,7 +1801,8 @@ std::pair> SegmentedFusion::makeFusion( SegmentedGroup* sg) const { // TODO Optimize cloning step by only copying values and expressions between // the fusion segment's inputs and outputs. - auto fusion_segment = std::make_unique(); + auto fusion_segment = std::unique_ptr( + new Fusion(completeFusion()->ir_container_ptr())); IrCloner complete_to_segment_map = Fusion::copy(completeFusion(), fusion_segment.get()); diff --git a/csrc/ir/container.cpp b/csrc/ir/container.cpp index d4bdc54d41c..9114e6d7cb7 100644 --- a/csrc/ir/container.cpp +++ b/csrc/ir/container.cpp @@ -153,6 +153,8 @@ int64_t IrContainer::numVals() const noexcept { void IrContainer::addFusion(Fusion* fusion) { std::unique_lock lock(mutex_); sharing_fusions_.insert(fusion); + per_fusion_vals_[fusion]; // Pre-allocate to prevent rehash during concurrent access + per_fusion_exprs_[fusion]; } void IrContainer::removeFusion(Fusion* fusion) { From 0c32f222b9bc6b0e86997e20bad9f3d0ad168d45 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 18 Feb 2026 13:55:11 -0800 Subject: [PATCH 2/5] Add stress tests for shared container with 8 and 12 segments Two new SegmentationTest cases that exercise the shared IrContainer under higher concurrent load than existing tests (max 3 segments): - SharedContainerStress8Segments: linear chain with 7 segment_set boundaries, 8 segments compiling in parallel - SharedContainerStress12ParallelBranches: 4 inputs x 3 independent reductions each, at least 6 segments compiling in parallel Both pass with correct results, validating the shared_ptr + mutex infrastructure under real multi-thread contention. --- tests/cpp/test_segmentation.cpp | 84 +++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/tests/cpp/test_segmentation.cpp b/tests/cpp/test_segmentation.cpp index cb849feb450..884ab97bd44 100644 --- a/tests/cpp/test_segmentation.cpp +++ b/tests/cpp/test_segmentation.cpp @@ -1034,4 +1034,88 @@ TEST_F(SegmentationTest, ReshapeWithCrossSegmentExtent) { executor_cache.fusion(), outputs, {t_a, t_b}, __LINE__, __FILE__); } +// Stress test: 8 segments sharing one IrContainer via segmenter container +// sharing. Exercises parallel compilation with 8 concurrent threads all +// registering vals/exprs into the same shared container. +TEST_F(SegmentationTest, SharedContainerStress8Segments) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + // Build a linear chain with 7 segment_set boundaries → 8 segments. + // Each segment does a different pointwise op to keep them distinct. + auto tv0 = makeContigConcreteTensor({1024, 256}, DataType::Float); + fusion->addInput(tv0); + + auto tv = relu(tv0); + tv = segment_set(tv); + tv = neg(tv); + tv = segment_set(tv); + tv = sin(tv); + tv = segment_set(tv); + tv = relu(tv); + tv = segment_set(tv); + tv = neg(tv); + tv = segment_set(tv); + tv = sin(tv); + tv = segment_set(tv); + tv = relu(tv); + tv = segment_set(tv); + tv = neg(tv); + fusion->addOutput(tv); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({1024, 256}, options); + + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); + + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); + EXPECT_THAT(runtime->fusionSegments()->groups(), SizeIs(8)); + + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); +} + +// Stress test: 12 parallel branches each producing a segment via independent +// reductions. Unlike the linear chain above, this creates 12 segments that +// can all compile simultaneously, maximizing concurrent lock contention on the +// shared IrContainer. +TEST_F(SegmentationTest, SharedContainerStress12ParallelBranches) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + // 4 inputs, each feeds 3 independent reductions on different axes. + // Reductions on different axes cannot be merged → separate segments. + std::vector inputs; + for (int i = 0; i < 4; i++) { + auto tv = makeContigConcreteTensor({64, 128, 32}, DataType::Float); + fusion->addInput(tv); + inputs.push_back(tv); + } + + for (int i = 0; i < 4; i++) { + // Each input → 3 reductions on axes 0, 1, 2 + for (int axis = 0; axis < 3; axis++) { + auto r = sum(inputs[i], {axis}); + fusion->addOutput(r); + } + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector aten_inputs; + for (int i = 0; i < 4; i++) { + aten_inputs.push_back(at::randn({64, 128, 32}, options)); + } + + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); + + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); + // Expect at least 6 segments (the segmenter may merge some compatible + // reductions, but incompatible reduction axes force separate segments) + EXPECT_GE(runtime->fusionSegments()->groups().size(), 6); + + testValidate( + executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); +} + } // namespace nvfuser From 4da037889eb73d2d9d9a057719e6dfdea426cd08 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 4 Mar 2026 11:48:34 -0800 Subject: [PATCH 3/5] Segmenter Shared Container Fix (#6025) Statements cleaned up by statement guard need to be popped from the specific fusion only, not the entire IrContainer. --- csrc/fusion.cpp | 138 +++++++++++++++++++++++++-------------- csrc/fusion.h | 7 -- csrc/statement_guard.cpp | 2 +- 3 files changed, 89 insertions(+), 58 deletions(-) diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index 43e59619509..b95644d843f 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -181,18 +181,19 @@ struct Fusion::ContainerMutator { } } - static int64_t numValsExcludingShortcuts(const Fusion* self) noexcept { - auto* c = self->ir_container(); - // Use direct field access. Avoids re-entering valsOwnedBy() which acquires - // shared_lock. - const auto it = c->per_fusion_vals_.find(self); - int64_t count = it != c->per_fusion_vals_.end() - ? static_cast(it->second.size()) - : 0; - count -= (self->zero_val_ != nullptr) + (self->one_val_ != nullptr) + - (self->true_val_ != nullptr) + (self->false_val_ != nullptr) + - (self->magic_zero_val_ != nullptr); - return count; + // Null out self's shortcut-val pointer cache if v is one of them. + static void nullOutShortcutIfNeeded(Fusion* self, Val* v) { + if (v == self->zero_val_) { + self->zero_val_ = nullptr; + } else if (v == self->one_val_) { + self->one_val_ = nullptr; + } else if (v == self->true_val_) { + self->true_val_ = nullptr; + } else if (v == self->false_val_) { + self->false_val_ = nullptr; + } else if (v == self->magic_zero_val_) { + self->magic_zero_val_ = nullptr; + } } static void removeStatementsCreatedAfter( @@ -201,42 +202,83 @@ struct Fusion::ContainerMutator { int64_t num_vals_before) { auto* c = self->ir_container(); - // Remove expressions before values because we need to change Val::uses_. - while (std::ssize(c->per_fusion_exprs_[self]) > num_exprs_before) { - // Pop from global deque back — statements created by this Fusion during - // the guard scope are at the tail (LIFO invariant). - Expr* e = c->exprs_up_.back().get(); - NVF_ERROR( - c->per_fusion_exprs_[self].count(e) > 0, - "removeStatementsCreatedAfter: tail expr belongs to another Fusion"); - for (Val* in : e->inputs()) { - in->removeUse(e); + // Use direct field access — hasMultipleFusions() acquires shared_lock which + // deadlocks since the caller already holds unique_lock on mutex_. + if (c->sharing_fusions_.size() <= 1) { + // Fast path: single Fusion owns this container, so the LIFO invariant + // holds — self's newest statements are always at the global deque tail. + // Remove expressions before values because we need to change Val::uses_. + while (std::ssize(c->per_fusion_exprs_[self]) > num_exprs_before) { + Expr* e = c->exprs_up_.back().get(); + NVF_ERROR( + c->per_fusion_exprs_[self].count(e) > 0, + "removeStatementsCreatedAfter: tail expr belongs to another Fusion"); + for (Val* out : e->outputs()) { + out->setDefinition(nullptr); + } + for (Val* in : e->inputs()) { + in->removeUse(e); + } + c->per_fusion_exprs_[self].erase(e); + c->exprs_.erase(e); + c->exprs_up_.pop_back(); } - c->per_fusion_exprs_[self].erase(e); - c->exprs_.erase(e); - c->exprs_up_.pop_back(); - } - - while (numValsExcludingShortcuts(self) > num_vals_before) { - Val* v = c->vals_up_.back().get(); - NVF_ERROR( - c->per_fusion_vals_[self].count(v) > 0, - "removeStatementsCreatedAfter: tail val belongs to another Fusion"); - // Null out shortcut caches if they point to vals about to be destroyed - if (v == self->zero_val_) { - self->zero_val_ = nullptr; - } else if (v == self->one_val_) { - self->one_val_ = nullptr; - } else if (v == self->true_val_) { - self->true_val_ = nullptr; - } else if (v == self->false_val_) { - self->false_val_ = nullptr; - } else if (v == self->magic_zero_val_) { - self->magic_zero_val_ = nullptr; + while (std::ssize(c->per_fusion_vals_[self]) > num_vals_before) { + Val* v = c->vals_up_.back().get(); + NVF_ERROR( + c->per_fusion_vals_[self].count(v) > 0, + "removeStatementsCreatedAfter: tail val belongs to another Fusion"); + nullOutShortcutIfNeeded(self, v); + c->per_fusion_vals_[self].erase(v); + c->vals_.erase(v); + c->vals_up_.pop_back(); } - c->per_fusion_vals_[self].erase(v); - c->vals_.erase(v); - c->vals_up_.pop_back(); + } else { + // Slow path: shared container — other Fusions' statements may be + // interleaved at the tail of the global deques. Use std::erase_if + // (C++20) to scan forward: skip the first num_before of self's + // statements (old, to keep), then erase the remainder (added during + // the guard scope). Entered whenever the container is shared, + // regardless of success or failure; if no new statements were added + // the scan completes trivially. O(total statements in container). + int64_t exprs_kept = 0; + std::erase_if(c->exprs_up_, [&](const std::unique_ptr& e_up) { + Expr* e = e_up.get(); + if (c->per_fusion_exprs_[self].count(e) == 0) { + return false; // belongs to another Fusion — keep + } + if (exprs_kept < num_exprs_before) { + ++exprs_kept; + return false; // self's old expr — keep + } + // self's new expr — remove (clean up uses and index maps first) + for (Val* out : e->outputs()) { + out->setDefinition(nullptr); + } + for (Val* in : e->inputs()) { + in->removeUse(e); + } + c->per_fusion_exprs_[self].erase(e); + c->exprs_.erase(e); + return true; + }); + + int64_t vals_kept = 0; + std::erase_if(c->vals_up_, [&](const std::unique_ptr& v_up) { + Val* v = v_up.get(); + if (c->per_fusion_vals_[self].count(v) == 0) { + return false; // belongs to another Fusion — keep + } + if (vals_kept < num_vals_before) { + ++vals_kept; + return false; // self's old val — keep + } + // self's new val — remove (null shortcut cache pointer if applicable) + nullOutShortcutIfNeeded(self, v); + c->per_fusion_vals_[self].erase(v); + c->vals_.erase(v); + return true; + }); } } }; @@ -626,10 +668,6 @@ void Fusion::removeStatementsCreatedAfter( this, num_exprs_before, num_vals_before); } -int64_t Fusion::numValsExcludingShortcuts() const noexcept { - return ContainerMutator::numValsExcludingShortcuts(this); -} - void Fusion::addInput(Val* input) { assertInContainer(input, "Cannot register input "); diff --git a/csrc/fusion.h b/csrc/fusion.h index 695e31f154f..c4436e11747 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -565,13 +565,6 @@ class NVF_API Fusion : public PolymorphicBase { return std::ssize(ir_container()->valsOwnedBy(this)); } - //! Return per-Fusion val count excluding shortcut vals (zero_val_, etc.). - //! Shortcut vals are registered in both per_fusion_vals_ and vals_up_, but - //! since they're singletons that should persist across StatementGuard scopes, - //! this count excludes them so the LIFO pop-back in - //! removeStatementsCreatedAfter correctly skips over them. - int64_t numValsExcludingShortcuts() const noexcept; - // Shortcut values (frequently used constants) Val* zeroVal(); Val* oneVal(); diff --git a/csrc/statement_guard.cpp b/csrc/statement_guard.cpp index 15a3b4159c3..4575bb59076 100644 --- a/csrc/statement_guard.cpp +++ b/csrc/statement_guard.cpp @@ -20,7 +20,7 @@ StatementGuard::StatementGuard(Fusion* fusion) return fusion; }()), prev_num_exprs_(fusion_->numExprs()), - prev_num_vals_(fusion_->numValsExcludingShortcuts()) {} + prev_num_vals_(fusion_->numVals()) {} StatementGuard::~StatementGuard() { fusion_->removeStatementsCreatedAfter(prev_num_exprs_, prev_num_vals_); From 3d03155c3234f380951d287f16c13561c0cfc8a5 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 4 Mar 2026 16:45:39 -0800 Subject: [PATCH 4/5] GT comment suggestion Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- csrc/ir/container.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/ir/container.cpp b/csrc/ir/container.cpp index 9114e6d7cb7..d33a8af4eef 100644 --- a/csrc/ir/container.cpp +++ b/csrc/ir/container.cpp @@ -153,7 +153,7 @@ int64_t IrContainer::numVals() const noexcept { void IrContainer::addFusion(Fusion* fusion) { std::unique_lock lock(mutex_); sharing_fusions_.insert(fusion); - per_fusion_vals_[fusion]; // Pre-allocate to prevent rehash during concurrent access + per_fusion_vals_[fusion]; // Pre-insert key so no outer-map rehash occurs during concurrent val/expr registration per_fusion_exprs_[fusion]; } From deb0a38911cc9bdb77fd98dbd1de299929d658e9 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 4 Mar 2026 17:36:36 -0800 Subject: [PATCH 5/5] lint --- csrc/fusion.cpp | 3 ++- csrc/fusion_segmenter.cpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index b95644d843f..6d295d930c8 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -212,7 +212,8 @@ struct Fusion::ContainerMutator { Expr* e = c->exprs_up_.back().get(); NVF_ERROR( c->per_fusion_exprs_[self].count(e) > 0, - "removeStatementsCreatedAfter: tail expr belongs to another Fusion"); + "removeStatementsCreatedAfter: tail expr belongs to another " + "Fusion"); for (Val* out : e->outputs()) { out->setDefinition(nullptr); } diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 0736b560078..fca03e6d1fd 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -1801,8 +1801,8 @@ std::pair> SegmentedFusion::makeFusion( SegmentedGroup* sg) const { // TODO Optimize cloning step by only copying values and expressions between // the fusion segment's inputs and outputs. - auto fusion_segment = std::unique_ptr( - new Fusion(completeFusion()->ir_container_ptr())); + auto fusion_segment = + std::unique_ptr(new Fusion(completeFusion()->ir_container_ptr())); IrCloner complete_to_segment_map = Fusion::copy(completeFusion(), fusion_segment.get());