From 226a6f4d8c466285907580d938e529460e4abaf2 Mon Sep 17 00:00:00 2001 From: walterzhao <519362600@qq.com> Date: Mon, 16 Mar 2026 00:49:03 +0800 Subject: [PATCH 1/3] Fix IOBuf TLS block pool self-loop caused by double-return (#3243) release_tls_block() and release_tls_block_chain() do not guard against a block being returned to TLS when it is already the list head. The assignment `b->portal_next = block_head` becomes `b->portal_next = b`, forming a self-loop that causes remove_tls_block_chain() or share_tls_block() to spin forever, silently hanging the thread at exit. Fix: - release_tls_block(): add early return when b == block_head, skipping the duplicate insertion. - release_tls_block_chain(): during the existing chain walk, check each node against block_head before linking. Return early if overlap is detected so that num_blocks stays consistent with the actual list length (remove_tls_block_chain verifies this via CHECK_EQ). Add three unit tests that reproduce the self-loop through: 1. Direct double release_tls_block() of the same block. 2. release_tls_block_chain() with a chain overlapping the TLS head. 3. IOBufAsZeroCopyOutputStream::BackUp() followed by a second release. --- src/butil/iobuf.cpp | 16 +++- src/butil/iobuf_inl.h | 8 ++ test/iobuf_unittest.cpp | 157 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 1 deletion(-) diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp index 26046e3745..fd9528333e 100644 --- a/src/butil/iobuf.cpp +++ b/src/butil/iobuf.cpp @@ -334,18 +334,32 @@ void release_tls_block_chain(IOBuf::Block* b) { g_num_hit_tls_threshold.fetch_add(n, butil::memory_order_relaxed); return; } + IOBuf::Block* const old_head = tls_data.block_head; IOBuf::Block* first_b = b; IOBuf::Block* last_b = NULL; do { ++n; CHECK(!b->full()); + // If any block in the incoming chain is already the TLS block_head, + // linking last_b->portal_next = block_head would create a cycle: + // - Single-block chain [B] where B == block_head: + // last_b->portal_next = B, i.e. B->portal_next = B (self-loop) + // - Multi-block chain [A -> B] where A == block_head: + // last_b(B)->portal_next = A, creating A -> B -> A (cycle) + // Return early before any state is modified so that num_blocks stays + // consistent with the actual list length (remove_tls_block_chain + // verifies this with CHECK_EQ at thread exit). + // See https://github.com/apache/brpc/issues/3243 + if (b == old_head) { + return; + } if (b->u.portal_next == NULL) { last_b = b; break; } b = b->u.portal_next; } while (true); - last_b->u.portal_next = tls_data.block_head; + last_b->u.portal_next = old_head; tls_data.block_head = first_b; tls_data.num_blocks += n; if (!tls_data.registered) { diff --git a/src/butil/iobuf_inl.h b/src/butil/iobuf_inl.h index 6b1f875145..9794c0e7ce 100644 --- a/src/butil/iobuf_inl.h +++ b/src/butil/iobuf_inl.h @@ -615,6 +615,14 @@ inline void release_tls_block(IOBuf::Block* b) { b->dec_ref(); // g_num_hit_tls_threshold.fetch_add(1, butil::memory_order_relaxed); inc_g_num_hit_tls_threshold(); + } else if (b == tls_data->block_head) { + // b is already at the head of the TLS free list. Re-inserting it + // would execute `b->portal_next = block_head`, i.e. `b->portal_next = b`, + // creating a single-node self-loop. Any later traversal of the TLS + // chain (remove_tls_block_chain at thread exit, share_tls_block, etc.) + // would spin forever. Skip the duplicate return. + // See https://github.com/apache/brpc/issues/3243 + return; } else { b->u.portal_next = tls_data->block_head; tls_data->block_head = b; diff --git a/test/iobuf_unittest.cpp b/test/iobuf_unittest.cpp index 679cdfe799..ca864a0c7b 100644 --- a/test/iobuf_unittest.cpp +++ b/test/iobuf_unittest.cpp @@ -1946,4 +1946,161 @@ TEST_F(IOBufTest, single_iobuf) { ASSERT_TRUE(p != nullptr); } +// Reproduction test for https://github.com/apache/brpc/issues/3243 +// +// release_tls_block() does not guard against a block being returned while it is +// already the TLS list head. The assignment `b->portal_next = block_head` +// becomes `b->portal_next = b` (self-loop). Any later traversal of the TLS +// chain — remove_tls_block_chain() at thread exit, share_tls_block(), etc. — +// spins forever, silently hanging the thread. +// +// The tests below run in a dedicated thread so the main thread's TLS is +// unaffected and a self-loop can be detected and cleaned up safely. + +static void* double_return_release_tls_block_thread(void* arg) { + bool* self_loop = static_cast(arg); + butil::iobuf::remove_tls_block_chain(); + + // Step 1: acquire a fresh block (removed from TLS or newly created). + butil::IOBuf::Block* b = butil::iobuf::acquire_tls_block(); + if (!b) return NULL; + + // Step 2: return it to TLS — b is now block_head. + butil::iobuf::release_tls_block(b); + + // Step 3: return the *same* block again while it is still block_head. + // BUG: release_tls_block executes + // b->portal_next = block_head // == b → self-loop! + // block_head = b // no-op + butil::iobuf::release_tls_block(b); + + // Detect the self-loop. + butil::IOBuf::Block* head = butil::iobuf::get_tls_block_head(); + if (head && butil::iobuf::get_portal_next(head) == head) { + *self_loop = true; + // Break the cycle so thread_atexit(remove_tls_block_chain) won't hang. + // acquire_tls_block() sets portal_next = NULL, breaking the loop. + butil::iobuf::acquire_tls_block(); + // block_head is still b (was self-referencing), acquire again to drain. + butil::iobuf::acquire_tls_block(); + // TLS is now empty — thread can exit without hanging. + } else { + butil::iobuf::remove_tls_block_chain(); + } + return NULL; +} + +TEST_F(IOBufTest, release_tls_block_double_return_creates_self_loop) { + bool self_loop = false; + pthread_t tid; + ASSERT_EQ(0, pthread_create(&tid, NULL, + double_return_release_tls_block_thread, + &self_loop)); + ASSERT_EQ(0, pthread_join(tid, NULL)); + + EXPECT_FALSE(self_loop) + << "release_tls_block() created a self-loop (b->portal_next == b) " + "when the same block was double-returned while already being the " + "TLS block_head. This causes remove_tls_block_chain() to loop " + "infinitely at thread exit. (GitHub issue #3243)"; +} + +static void* double_return_release_tls_block_chain_thread(void* arg) { + bool* self_loop = static_cast(arg); + butil::iobuf::remove_tls_block_chain(); + + // Acquire a block and put it back as TLS head. + butil::IOBuf::Block* b = butil::iobuf::acquire_tls_block(); + if (!b) return NULL; + butil::iobuf::release_tls_block(b); + // Now: block_head -> b -> NULL + + // Return the same block through release_tls_block_chain. + // The chain is: b -> NULL (b->portal_next was set to NULL by acquire). + // Internally: last_b = b, then last_b->portal_next = block_head = b + // → self-loop! + b->u.portal_next = NULL; // simulate a single-block chain + butil::iobuf::release_tls_block_chain(b); + + butil::IOBuf::Block* head = butil::iobuf::get_tls_block_head(); + if (head && butil::iobuf::get_portal_next(head) == head) { + *self_loop = true; + butil::iobuf::acquire_tls_block(); + butil::iobuf::acquire_tls_block(); + } else { + butil::iobuf::remove_tls_block_chain(); + } + return NULL; +} + +TEST_F(IOBufTest, release_tls_block_chain_overlap_creates_self_loop) { + bool self_loop = false; + pthread_t tid; + ASSERT_EQ(0, pthread_create(&tid, NULL, + double_return_release_tls_block_chain_thread, + &self_loop)); + ASSERT_EQ(0, pthread_join(tid, NULL)); + + EXPECT_FALSE(self_loop) + << "release_tls_block_chain() created a self-loop when the returned " + "chain overlaps with the existing TLS block_head. " + "last_b->portal_next = block_head creates a cycle when first_b == " + "block_head. (GitHub issue #3243)"; +} + +// Reproduce the self-loop through the IOBufAsZeroCopyOutputStream::BackUp() +// code path described in the issue. BackUp() eagerly returns _cur_block to +// TLS. If the same block pointer is subsequently released again (e.g. via +// _release_block()), the double-return triggers the self-loop. +static void* backup_double_return_thread(void* arg) { + bool* self_loop = static_cast(arg); + butil::iobuf::remove_tls_block_chain(); + + butil::IOBuf buf; + { + // _block_size == 0 → uses TLS blocks. + butil::IOBufAsZeroCopyOutputStream stream(&buf); + void* data = NULL; + int size = 0; + EXPECT_TRUE(stream.Next(&data, &size)); + + // BackUp releases _cur_block to TLS and sets _cur_block = NULL. + stream.BackUp(size); + + // At this point the block is the TLS head. + butil::IOBuf::Block* head = butil::iobuf::get_tls_block_head(); + EXPECT_TRUE(head != NULL); + + // Simulate a second release of the same block, which can happen + // when the block pointer is obtained from a still-live BlockRef. + butil::iobuf::release_tls_block(head); + + butil::IOBuf::Block* new_head = butil::iobuf::get_tls_block_head(); + if (new_head && butil::iobuf::get_portal_next(new_head) == new_head) { + *self_loop = true; + butil::iobuf::acquire_tls_block(); + butil::iobuf::acquire_tls_block(); + } + } + if (!*self_loop) { + butil::iobuf::remove_tls_block_chain(); + } + return NULL; +} + +TEST_F(IOBufTest, backup_then_double_release_creates_self_loop) { + bool self_loop = false; + pthread_t tid; + ASSERT_EQ(0, pthread_create(&tid, NULL, + backup_double_return_thread, &self_loop)); + ASSERT_EQ(0, pthread_join(tid, NULL)); + + EXPECT_FALSE(self_loop) + << "After IOBufAsZeroCopyOutputStream::BackUp() returned a block to " + "TLS, a second release_tls_block() with the same pointer created a " + "self-loop. This is the primary scenario described in GitHub issue " + "#3243 — the thread will hang forever in remove_tls_block_chain() " + "at exit."; +} + } // namespace From 328c6b745f2a8a33cd1c28ea3240c46363bc9aa6 Mon Sep 17 00:00:00 2001 From: walterzhao <519362600@qq.com> Date: Mon, 16 Mar 2026 11:34:46 +0800 Subject: [PATCH 2/3] fix review fix review comment add new ut --- src/butil/iobuf.cpp | 29 +++-- src/butil/iobuf_inl.h | 24 ++-- test/iobuf_unittest.cpp | 269 ++++++++++++++++++++++++++-------------- 3 files changed, 209 insertions(+), 113 deletions(-) diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp index fd9528333e..4e97ebf54f 100644 --- a/src/butil/iobuf.cpp +++ b/src/butil/iobuf.cpp @@ -324,33 +324,32 @@ IOBuf::Block* share_tls_block() { void release_tls_block_chain(IOBuf::Block* b) { TLSData& tls_data = g_tls_data; size_t n = 0; + size_t n_hit_threshold = 0; if (tls_data.num_blocks >= max_blocks_per_thread()) { do { ++n; IOBuf::Block* const saved_next = b->u.portal_next; - b->dec_ref(); + // If b is already cached in TLS, dec_ref() would drop a reference + // still owned by the TLS list and leave a dangling pointer behind. + if (!is_in_tls_block_chain(tls_data.block_head, b)) { + b->dec_ref(); + ++n_hit_threshold; + } b = saved_next; } while (b); - g_num_hit_tls_threshold.fetch_add(n, butil::memory_order_relaxed); + g_num_hit_tls_threshold.fetch_add( + n_hit_threshold, butil::memory_order_relaxed); return; } - IOBuf::Block* const old_head = tls_data.block_head; IOBuf::Block* first_b = b; IOBuf::Block* last_b = NULL; do { ++n; CHECK(!b->full()); - // If any block in the incoming chain is already the TLS block_head, - // linking last_b->portal_next = block_head would create a cycle: - // - Single-block chain [B] where B == block_head: - // last_b->portal_next = B, i.e. B->portal_next = B (self-loop) - // - Multi-block chain [A -> B] where A == block_head: - // last_b(B)->portal_next = A, creating A -> B -> A (cycle) - // Return early before any state is modified so that num_blocks stays - // consistent with the actual list length (remove_tls_block_chain - // verifies this with CHECK_EQ at thread exit). - // See https://github.com/apache/brpc/issues/3243 - if (b == old_head) { + // Guard against overlap with any node already cached in TLS, not just + // block_head. Otherwise returning X into H -> X would create a + // 2-node cycle X -> H -> X and hang later list traversal. + if (is_in_tls_block_chain(tls_data.block_head, b)) { return; } if (b->u.portal_next == NULL) { @@ -359,7 +358,7 @@ void release_tls_block_chain(IOBuf::Block* b) { } b = b->u.portal_next; } while (true); - last_b->u.portal_next = old_head; + last_b->u.portal_next = tls_data.block_head; tls_data.block_head = first_b; tls_data.num_blocks += n; if (!tls_data.registered) { diff --git a/src/butil/iobuf_inl.h b/src/butil/iobuf_inl.h index 9794c0e7ce..3199f202fa 100644 --- a/src/butil/iobuf_inl.h +++ b/src/butil/iobuf_inl.h @@ -603,26 +603,34 @@ void remove_tls_block_chain(); IOBuf::Block* acquire_tls_block(); +inline bool is_in_tls_block_chain(IOBuf::Block* head, IOBuf::Block* b) { + for (IOBuf::Block* p = head; p != NULL; p = p->u.portal_next) { + if (p == b) { + return true; + } + } + return false; +} + // Return one block to TLS. inline void release_tls_block(IOBuf::Block* b) { if (!b) { return; } TLSData *tls_data = get_g_tls_data(); + // Guard against duplicate return anywhere in the TLS list. Checking only + // block_head misses cases like H -> X where returning X again would create + // a 2-node cycle X -> H -> X. The TLS list is short (soft-limited), so a + // linear scan is cheap here. + if (is_in_tls_block_chain(tls_data->block_head, b)) { + return; + } if (b->full()) { b->dec_ref(); } else if (tls_data->num_blocks >= max_blocks_per_thread()) { b->dec_ref(); // g_num_hit_tls_threshold.fetch_add(1, butil::memory_order_relaxed); inc_g_num_hit_tls_threshold(); - } else if (b == tls_data->block_head) { - // b is already at the head of the TLS free list. Re-inserting it - // would execute `b->portal_next = block_head`, i.e. `b->portal_next = b`, - // creating a single-node self-loop. Any later traversal of the TLS - // chain (remove_tls_block_chain at thread exit, share_tls_block, etc.) - // would spin forever. Skip the duplicate return. - // See https://github.com/apache/brpc/issues/3243 - return; } else { b->u.portal_next = tls_data->block_head; tls_data->block_head = b; diff --git a/test/iobuf_unittest.cpp b/test/iobuf_unittest.cpp index ca864a0c7b..104ff5aa6d 100644 --- a/test/iobuf_unittest.cpp +++ b/test/iobuf_unittest.cpp @@ -1946,161 +1946,250 @@ TEST_F(IOBufTest, single_iobuf) { ASSERT_TRUE(p != nullptr); } -// Reproduction test for https://github.com/apache/brpc/issues/3243 +// Regression tests for https://github.com/apache/brpc/issues/3243 // -// release_tls_block() does not guard against a block being returned while it is -// already the TLS list head. The assignment `b->portal_next = block_head` -// becomes `b->portal_next = b` (self-loop). Any later traversal of the TLS -// chain — remove_tls_block_chain() at thread exit, share_tls_block(), etc. — -// spins forever, silently hanging the thread. -// -// The tests below run in a dedicated thread so the main thread's TLS is -// unaffected and a self-loop can be detected and cleaned up safely. +// A duplicate return of a block that's already in the TLS free list can create +// either a self-loop (when the block is block_head) or a longer cycle (when +// the block already exists deeper in the list). Any later traversal then +// spins forever. Run these tests in a dedicated pthread so the main thread's +// TLS remains untouched. + +static bool tls_block_chain_has_cycle() { + butil::IOBuf::Block* slow = butil::iobuf::get_tls_block_head(); + butil::IOBuf::Block* fast = slow; + while (fast != NULL) { + fast = butil::iobuf::get_portal_next(fast); + if (fast == NULL) { + return false; + } + slow = butil::iobuf::get_portal_next(slow); + fast = butil::iobuf::get_portal_next(fast); + if (slow == fast) { + return true; + } + } + return false; +} + +static void dec_ref_distinct_blocks(butil::IOBuf::Block* b1, + butil::IOBuf::Block* b2, + butil::IOBuf::Block* b3 = NULL) { + if (b1 != NULL) { + b1->dec_ref(); + } + if (b2 != NULL && b2 != b1) { + b2->dec_ref(); + } + if (b3 != NULL && b3 != b1 && b3 != b2) { + b3->dec_ref(); + } +} + +static void cleanup_corrupted_tls_chain(int drain_times) { + butil::IOBuf::Block* b1 = NULL; + butil::IOBuf::Block* b2 = NULL; + butil::IOBuf::Block* b3 = NULL; + if (drain_times >= 1) { + b1 = butil::iobuf::acquire_tls_block(); + } + if (drain_times >= 2) { + b2 = butil::iobuf::acquire_tls_block(); + } + if (drain_times >= 3) { + b3 = butil::iobuf::acquire_tls_block(); + } + dec_ref_distinct_blocks(b1, b2, b3); +} -static void* double_return_release_tls_block_thread(void* arg) { - bool* self_loop = static_cast(arg); +static void* double_return_release_tls_block_head_thread(void* arg) { + bool* has_cycle = static_cast(arg); butil::iobuf::remove_tls_block_chain(); - // Step 1: acquire a fresh block (removed from TLS or newly created). butil::IOBuf::Block* b = butil::iobuf::acquire_tls_block(); - if (!b) return NULL; - - // Step 2: return it to TLS — b is now block_head. + if (!b) { + return NULL; + } butil::iobuf::release_tls_block(b); - - // Step 3: return the *same* block again while it is still block_head. - // BUG: release_tls_block executes - // b->portal_next = block_head // == b → self-loop! - // block_head = b // no-op butil::iobuf::release_tls_block(b); - // Detect the self-loop. - butil::IOBuf::Block* head = butil::iobuf::get_tls_block_head(); - if (head && butil::iobuf::get_portal_next(head) == head) { - *self_loop = true; - // Break the cycle so thread_atexit(remove_tls_block_chain) won't hang. - // acquire_tls_block() sets portal_next = NULL, breaking the loop. - butil::iobuf::acquire_tls_block(); - // block_head is still b (was self-referencing), acquire again to drain. - butil::iobuf::acquire_tls_block(); - // TLS is now empty — thread can exit without hanging. + if (tls_block_chain_has_cycle()) { + *has_cycle = true; + cleanup_corrupted_tls_chain(2); + } else { + butil::iobuf::remove_tls_block_chain(); + } + return NULL; +} + +TEST_F(IOBufTest, regression_3243_release_tls_block_head_no_cycle) { + bool has_cycle = false; + pthread_t tid; + ASSERT_EQ(0, pthread_create(&tid, NULL, + double_return_release_tls_block_head_thread, + &has_cycle)); + ASSERT_EQ(0, pthread_join(tid, NULL)); + + EXPECT_FALSE(has_cycle) + << "release_tls_block() created a cycle when the same block was " + "returned twice while already cached in TLS. (GitHub issue #3243)"; +} + +static void* double_return_release_tls_block_non_head_thread(void* arg) { + bool* has_cycle = static_cast(arg); + butil::iobuf::remove_tls_block_chain(); + + butil::IOBuf::Block* tail = butil::iobuf::acquire_tls_block(); + butil::IOBuf::Block* head = butil::iobuf::acquire_tls_block(); + if (!tail || !head) { + dec_ref_distinct_blocks(tail, head); + return NULL; + } + + butil::iobuf::release_tls_block(tail); + butil::iobuf::release_tls_block(head); + // TLS: head -> tail -> NULL. Returning tail again used to create + // tail -> head -> tail. + butil::iobuf::release_tls_block(tail); + + if (tls_block_chain_has_cycle()) { + *has_cycle = true; + cleanup_corrupted_tls_chain(3); } else { butil::iobuf::remove_tls_block_chain(); } return NULL; } -TEST_F(IOBufTest, release_tls_block_double_return_creates_self_loop) { - bool self_loop = false; +TEST_F(IOBufTest, regression_3243_release_tls_block_non_head_no_cycle) { + bool has_cycle = false; pthread_t tid; ASSERT_EQ(0, pthread_create(&tid, NULL, - double_return_release_tls_block_thread, - &self_loop)); + double_return_release_tls_block_non_head_thread, + &has_cycle)); ASSERT_EQ(0, pthread_join(tid, NULL)); - EXPECT_FALSE(self_loop) - << "release_tls_block() created a self-loop (b->portal_next == b) " - "when the same block was double-returned while already being the " - "TLS block_head. This causes remove_tls_block_chain() to loop " - "infinitely at thread exit. (GitHub issue #3243)"; + EXPECT_FALSE(has_cycle) + << "release_tls_block() created a cycle when a block already present " + "deeper in the TLS list was returned again. " + "(GitHub issue #3243)"; } -static void* double_return_release_tls_block_chain_thread(void* arg) { - bool* self_loop = static_cast(arg); +static void* double_return_release_tls_block_chain_head_thread(void* arg) { + bool* has_cycle = static_cast(arg); butil::iobuf::remove_tls_block_chain(); - // Acquire a block and put it back as TLS head. butil::IOBuf::Block* b = butil::iobuf::acquire_tls_block(); - if (!b) return NULL; + if (!b) { + return NULL; + } butil::iobuf::release_tls_block(b); - // Now: block_head -> b -> NULL - - // Return the same block through release_tls_block_chain. - // The chain is: b -> NULL (b->portal_next was set to NULL by acquire). - // Internally: last_b = b, then last_b->portal_next = block_head = b - // → self-loop! - b->u.portal_next = NULL; // simulate a single-block chain + b->u.portal_next = NULL; butil::iobuf::release_tls_block_chain(b); - butil::IOBuf::Block* head = butil::iobuf::get_tls_block_head(); - if (head && butil::iobuf::get_portal_next(head) == head) { - *self_loop = true; - butil::iobuf::acquire_tls_block(); - butil::iobuf::acquire_tls_block(); + if (tls_block_chain_has_cycle()) { + *has_cycle = true; + cleanup_corrupted_tls_chain(2); } else { butil::iobuf::remove_tls_block_chain(); } return NULL; } -TEST_F(IOBufTest, release_tls_block_chain_overlap_creates_self_loop) { - bool self_loop = false; +TEST_F(IOBufTest, regression_3243_release_tls_block_chain_head_no_cycle) { + bool has_cycle = false; pthread_t tid; ASSERT_EQ(0, pthread_create(&tid, NULL, - double_return_release_tls_block_chain_thread, - &self_loop)); + double_return_release_tls_block_chain_head_thread, + &has_cycle)); ASSERT_EQ(0, pthread_join(tid, NULL)); - EXPECT_FALSE(self_loop) - << "release_tls_block_chain() created a self-loop when the returned " - "chain overlaps with the existing TLS block_head. " - "last_b->portal_next = block_head creates a cycle when first_b == " - "block_head. (GitHub issue #3243)"; + EXPECT_FALSE(has_cycle) + << "release_tls_block_chain() created a cycle when the returned chain " + "overlapped the TLS head. (GitHub issue #3243)"; } -// Reproduce the self-loop through the IOBufAsZeroCopyOutputStream::BackUp() -// code path described in the issue. BackUp() eagerly returns _cur_block to -// TLS. If the same block pointer is subsequently released again (e.g. via -// _release_block()), the double-return triggers the self-loop. +static void* double_return_release_tls_block_chain_non_head_thread(void* arg) { + bool* has_cycle = static_cast(arg); + butil::iobuf::remove_tls_block_chain(); + + butil::IOBuf::Block* tail = butil::iobuf::acquire_tls_block(); + butil::IOBuf::Block* head = butil::iobuf::acquire_tls_block(); + if (!tail || !head) { + dec_ref_distinct_blocks(tail, head); + return NULL; + } + + butil::iobuf::release_tls_block(tail); + butil::iobuf::release_tls_block(head); + // TLS: head -> tail -> NULL. Returning the single-node chain [tail] + // again used to create tail -> head -> tail. + tail->u.portal_next = NULL; + butil::iobuf::release_tls_block_chain(tail); + + if (tls_block_chain_has_cycle()) { + *has_cycle = true; + cleanup_corrupted_tls_chain(3); + } else { + butil::iobuf::remove_tls_block_chain(); + } + return NULL; +} + +TEST_F(IOBufTest, regression_3243_release_tls_block_chain_non_head_no_cycle) { + bool has_cycle = false; + pthread_t tid; + ASSERT_EQ(0, pthread_create(&tid, NULL, + double_return_release_tls_block_chain_non_head_thread, + &has_cycle)); + ASSERT_EQ(0, pthread_join(tid, NULL)); + + EXPECT_FALSE(has_cycle) + << "release_tls_block_chain() created a cycle when the returned chain " + "contained a block already present deeper in TLS. " + "(GitHub issue #3243)"; +} + +// Reproduce the issue through IOBufAsZeroCopyOutputStream::BackUp(). BackUp() +// eagerly returns _cur_block to TLS. A subsequent release of the same pointer +// used to create a self-loop at the head. static void* backup_double_return_thread(void* arg) { - bool* self_loop = static_cast(arg); + bool* has_cycle = static_cast(arg); butil::iobuf::remove_tls_block_chain(); butil::IOBuf buf; { - // _block_size == 0 → uses TLS blocks. butil::IOBufAsZeroCopyOutputStream stream(&buf); void* data = NULL; int size = 0; EXPECT_TRUE(stream.Next(&data, &size)); - - // BackUp releases _cur_block to TLS and sets _cur_block = NULL. stream.BackUp(size); - // At this point the block is the TLS head. butil::IOBuf::Block* head = butil::iobuf::get_tls_block_head(); EXPECT_TRUE(head != NULL); - - // Simulate a second release of the same block, which can happen - // when the block pointer is obtained from a still-live BlockRef. butil::iobuf::release_tls_block(head); - butil::IOBuf::Block* new_head = butil::iobuf::get_tls_block_head(); - if (new_head && butil::iobuf::get_portal_next(new_head) == new_head) { - *self_loop = true; - butil::iobuf::acquire_tls_block(); - butil::iobuf::acquire_tls_block(); + if (tls_block_chain_has_cycle()) { + *has_cycle = true; + cleanup_corrupted_tls_chain(2); } } - if (!*self_loop) { + if (!*has_cycle) { butil::iobuf::remove_tls_block_chain(); } return NULL; } -TEST_F(IOBufTest, backup_then_double_release_creates_self_loop) { - bool self_loop = false; +TEST_F(IOBufTest, regression_3243_backup_then_double_release_no_cycle) { + bool has_cycle = false; pthread_t tid; ASSERT_EQ(0, pthread_create(&tid, NULL, - backup_double_return_thread, &self_loop)); + backup_double_return_thread, &has_cycle)); ASSERT_EQ(0, pthread_join(tid, NULL)); - EXPECT_FALSE(self_loop) + EXPECT_FALSE(has_cycle) << "After IOBufAsZeroCopyOutputStream::BackUp() returned a block to " - "TLS, a second release_tls_block() with the same pointer created a " - "self-loop. This is the primary scenario described in GitHub issue " - "#3243 — the thread will hang forever in remove_tls_block_chain() " - "at exit."; + "TLS, a second release_tls_block() created a cycle. " + "(GitHub issue #3243)"; } } // namespace From b615fa4c4f486cb53689a9a98741737fe26daaaf Mon Sep 17 00:00:00 2001 From: walterzhao <519362600@qq.com> Date: Mon, 16 Mar 2026 11:51:55 +0800 Subject: [PATCH 3/3] fix review comment#2 add more ut --- src/butil/iobuf.cpp | 19 ++++++++------ test/iobuf_unittest.cpp | 58 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/src/butil/iobuf.cpp b/src/butil/iobuf.cpp index 4e97ebf54f..029654371e 100644 --- a/src/butil/iobuf.cpp +++ b/src/butil/iobuf.cpp @@ -323,11 +323,9 @@ IOBuf::Block* share_tls_block() { // NOTE: b MUST be non-NULL and all blocks linked SHOULD not be full. void release_tls_block_chain(IOBuf::Block* b) { TLSData& tls_data = g_tls_data; - size_t n = 0; size_t n_hit_threshold = 0; if (tls_data.num_blocks >= max_blocks_per_thread()) { do { - ++n; IOBuf::Block* const saved_next = b->u.portal_next; // If b is already cached in TLS, dec_ref() would drop a reference // still owned by the TLS list and leave a dangling pointer behind. @@ -343,24 +341,29 @@ void release_tls_block_chain(IOBuf::Block* b) { } IOBuf::Block* first_b = b; IOBuf::Block* last_b = NULL; + size_t n_to_tls = 0; do { - ++n; CHECK(!b->full()); // Guard against overlap with any node already cached in TLS, not just - // block_head. Otherwise returning X into H -> X would create a - // 2-node cycle X -> H -> X and hang later list traversal. + // block_head. Returning X into H -> X would create a 2-node cycle + // X -> H -> X. If the overlap happens after a unique prefix such as + // A -> X, keep the prefix by linking A directly to the old TLS head. if (is_in_tls_block_chain(tls_data.block_head, b)) { - return; + break; } + ++n_to_tls; + last_b = b; if (b->u.portal_next == NULL) { - last_b = b; break; } b = b->u.portal_next; } while (true); + if (last_b == NULL) { + return; + } last_b->u.portal_next = tls_data.block_head; tls_data.block_head = first_b; - tls_data.num_blocks += n; + tls_data.num_blocks += n_to_tls; if (!tls_data.registered) { tls_data.registered = true; butil::thread_atexit(remove_tls_block_chain); diff --git a/test/iobuf_unittest.cpp b/test/iobuf_unittest.cpp index 104ff5aa6d..29450fb9ca 100644 --- a/test/iobuf_unittest.cpp +++ b/test/iobuf_unittest.cpp @@ -2149,6 +2149,64 @@ TEST_F(IOBufTest, regression_3243_release_tls_block_chain_non_head_no_cycle) { "(GitHub issue #3243)"; } +struct PartialOverlapResult { + bool has_cycle; + int tls_block_count; +}; + +static void* partial_overlap_release_tls_block_chain_thread(void* arg) { + PartialOverlapResult* result = static_cast(arg); + result->has_cycle = false; + result->tls_block_count = -1; + butil::iobuf::remove_tls_block_chain(); + + butil::IOBuf::Block* tail = butil::iobuf::acquire_tls_block(); + butil::IOBuf::Block* head = butil::iobuf::acquire_tls_block(); + butil::IOBuf::Block* prefix = butil::iobuf::acquire_tls_block(); + if (!tail || !head || !prefix) { + dec_ref_distinct_blocks(tail, head, prefix); + return NULL; + } + + butil::iobuf::release_tls_block(tail); + butil::iobuf::release_tls_block(head); + // TLS: head -> tail -> NULL. Returning prefix -> tail should preserve + // prefix and avoid both the tail->head->tail cycle and leaking prefix. + prefix->u.portal_next = tail; + butil::iobuf::release_tls_block_chain(prefix); + + result->has_cycle = tls_block_chain_has_cycle(); + result->tls_block_count = butil::iobuf::get_tls_block_count(); + if (result->has_cycle) { + cleanup_corrupted_tls_chain(3); + } else { + butil::iobuf::remove_tls_block_chain(); + } + return NULL; +} + +TEST_F(IOBufTest, regression_3243_release_tls_block_chain_partial_overlap_keeps_prefix) { + install_debug_allocator(); + + PartialOverlapResult result; + result.has_cycle = false; + result.tls_block_count = -1; + + pthread_t tid; + ASSERT_EQ(0, pthread_create(&tid, NULL, + partial_overlap_release_tls_block_chain_thread, + &result)); + ASSERT_EQ(0, pthread_join(tid, NULL)); + + EXPECT_FALSE(result.has_cycle) + << "release_tls_block_chain() created a cycle when a unique prefix " + "was returned ahead of a block already cached in TLS. " + "(GitHub issue #3243)"; + EXPECT_EQ(3, result.tls_block_count) + << "release_tls_block_chain() dropped the unique prefix when the " + "returned chain overlapped the TLS list. (GitHub issue #3243)"; +} + // Reproduce the issue through IOBufAsZeroCopyOutputStream::BackUp(). BackUp() // eagerly returns _cur_block to TLS. A subsequent release of the same pointer // used to create a self-loop at the head.