From ceaba2c44d23ac0bd81e05eaa7f9ed4a427cfa35 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Fri, 15 May 2026 15:41:51 +0800 Subject: [PATCH] SDSTOR-21863: Revert AppendBlkAllocator Reset logic to prevent race condition between upper layer reset and self operation on it (cp_flush and free) Fix https://github.com/eBay/HomeObject/issues/401 Co-Authored-By: Claude Sonnet 4.6 --- conanfile.py | 2 +- src/lib/blkalloc/append_blk_allocator.cpp | 25 +++++++--- src/lib/device/virtual_dev.cpp | 12 ++++- src/tests/test_data_service.cpp | 60 ++++++++++++++++++++--- 4 files changed, 83 insertions(+), 16 deletions(-) diff --git a/conanfile.py b/conanfile.py index 66e884eb9..0d20a477d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "7.5.8" + version = "7.5.9" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 50b6e81cb..40a718b92 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -155,17 +155,30 @@ bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { } void AppendBlkAllocator::reset() { + // Reset in-place: restore all in-memory counters to the initial empty state and mark dirty + // so the next cp_flush persists commit_offset=0 to the existing superblock. + // + // We intentionally do NOT destroy the superblock or deregister the handler here, unlike + // BitmapBlkAllocator::reset(). AppendBlkAllocator is managed by the upper layer (HomeObject + // GC), which reuses the same chunk across multiple GC cycles. Keeping the same allocator + // instance alive avoids the race described in https://github.com/eBay/HomeObject/issues/401: + // + // T1: cp_flush holds a shared_ptr to the old allocator + // T2: reset() destroys m_sb [old behavior] + // T3: gc_repl_reqs calls free() on old allocator -> m_is_dirty=true + // T4: cp_flush writes to the destroyed m_sb -> crash + // + // With in-place reset the superblock is always valid, so T4 is safe regardless of T3. std::lock_guard lg(m_sb_mtx); - m_is_dirty.store(false); - m_sb.destroy(); - meta_service().deregister_handler(get_name()); + m_last_append_offset.store(0); + m_freeable_nblks.store(0); + m_commit_offset.store(0); + m_is_dirty.store(true); } bool AppendBlkAllocator::is_blk_alloced_on_disk(BlkId const& bid, bool use_lock) const { std::lock_guard lg(m_sb_mtx); - if (!m_sb.is_empty()) { return bid.blk_num() < m_sb->commit_offset; } - // if allocator is reset, the sb will be destroyed, and all the blks will be treated as free; - return false; + return bid.blk_num() < m_sb->commit_offset; } std::string AppendBlkAllocator::get_name() const { return "AppendBlkAlloc_chunk_" + std::to_string(m_chunk_id); } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 73d39fc55..b3c4bb3ac 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -129,6 +129,16 @@ void VirtualDev::add_chunk(cshared< Chunk >& chunk, bool is_fresh_chunk) { } void VirtualDev::reset_chunk_blk_allocator(Chunk* chunk) { + if (m_allocator_type == blk_allocator_type_t::append) { + // AppendBlkAllocator resets in-place via its reset() method and no new allocator instance is created. + // Creating a new instance would require deregistering the old handler and registering a + // new one, which introduces a window where a concurrent cp_flush holding the old + // allocator's shared_ptr could write to an already-destroyed superblock. + // See https://github.com/eBay/HomeObject/issues/401. + + LOGINFO("AppendBlkAllocator resets in-place, skip creating new instance for chunk {}", chunk->chunk_id()); + return; + } auto ba = create_blk_allocator(m_allocator_type, block_size(), chunk->physical_dev()->optimal_page_size(), chunk->physical_dev()->align_size(), chunk->size(), m_auto_recovery, chunk->chunk_id(), true /* is_init */, m_use_slab_in_blk_allocator); @@ -743,8 +753,6 @@ void VirtualDev::cp_flush(VDevCPContext* v_cp_ctx) { // pass down cp so that underlying components can get their customized CP context if needed; m_chunk_selector->foreach_chunks([this, cp](cshared< Chunk >& chunk) { HS_LOG(TRACE, device, "Flushing chunk: {}, vdev: {}", chunk->chunk_id(), m_vdev_info.name); - // Hold shared_ptr to prevent allocator from being reset during cp_flush. - // This fixes race with GC's purge_reserved_chunk() which calls reset_block_allocator(). auto alloc_ptr = chunk->blk_allocator_shared(); #ifdef _PRERELEASE diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 53b5fe19d..8efad5ba6 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -1123,8 +1123,8 @@ TEST_F(BlkDataServiceAppendTest, TestResetCompletedBeforeCpFlush) { LOGINFO("Scenario 1: PASSED - Order: get_ptr -> reset_block_allocator -> cp_flush"); } -// Scenario 2: Order: cp_flush holds shared_ptr -> old allocator reset -> cp_flush on old -> create new allocator -// Test that cp_flush executes after old allocator reset, then new allocator is created +// Scenario 2: Order: cp_flush holds shared_ptr -> allocator reset (in-place) -> cp_flush completes on same allocator +// Test that cp_flush completes safely after in-place reset of the same allocator TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) { vdev_info vinfo; auto data_vdev = inst().open_vdev(vinfo, true); @@ -1185,12 +1185,12 @@ TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) { LOGINFO("Scenario 2: Old allocator reset() completed"); old_allocator_reset.store(true); - // Wait for cp_flush to complete on old allocator before creating new one + // Wait for cp_flush to complete before reset thread continues for (int i = 0; i < 200 && !cp_flush_completed.load(); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - RELEASE_ASSERT(cp_flush_completed.load(), "cp_flush should complete before creating new allocator"); - LOGINFO("Scenario 2: cp_flush completed, proceeding to create new allocator"); + RELEASE_ASSERT(cp_flush_completed.load(), "cp_flush should complete before reset thread continues"); + LOGINFO("Scenario 2: cp_flush completed, reset thread proceeding"); })); std::thread flush_thread([&]() { @@ -1205,7 +1205,7 @@ TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) { std::thread reset_thread([&]() { chunk->reset_block_allocator(); reset_completed.store(true); - LOGINFO("Scenario 2: reset_block_allocator completed (new allocator created)"); + LOGINFO("Scenario 2: reset_block_allocator completed (in-place reset)"); }); flush_thread.join(); @@ -1219,7 +1219,7 @@ TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) { ASSERT_TRUE(cp_flush_completed.load()); ASSERT_TRUE(reset_completed.load()); ASSERT_TRUE(old_allocator_reset.load()); - LOGINFO("Scenario 2: PASSED - Order: get_ptr -> old_allocator_reset -> cp_flush -> create new allocator"); + LOGINFO("Scenario 2: PASSED - Order: get_ptr -> allocator_reset (in-place) -> cp_flush on same allocator"); } // Scenario 3: Order: cp_flush acquires mutex first -> reset_block_allocator blocks on mutex @@ -1299,6 +1299,52 @@ TEST_F(BlkDataServiceAppendTest, TestCpFlushBlocksResetWithMutex) { LOGINFO("Scenario 3: PASSED - mutex protected m_sb from concurrent reset_block_allocator during cp_flush"); } +// Scenario 4: Order: cp_flush holds shared_ptr -> reset() completes -> free() re-sets dirty on old allocator +// -> cp_flush runs on old allocator +// Test the race from https://github.com/eBay/HomeObject/issues/401: +// with the old AppendBlkAllocator::reset() (destroy + recreate): +// - reset() destroys m_sb of the old allocator +// - free() sets m_is_dirty=true on the old allocator (whose m_sb is already destroyed) +// - cp_flush writes to the destroyed m_sb -> crash / undefined behavior +TEST_F(BlkDataServiceAppendTest, TestFreeAfterResetBeforeCpFlushSafe) { + vdev_info vinfo; + auto data_vdev = inst().open_vdev(vinfo, true); + auto chunks = data_vdev->get_chunks(); + + auto it = std::find_if(chunks.begin(), chunks.end(), [](const auto& pair) { + return pair.second && pair.second->blk_allocator() && + pair.second->blk_allocator()->get_name().find("append_chunk_") == 0; + }); + ASSERT_NE(it, chunks.end()) << "No AppendBlkAllocator chunks found"; + auto chunk = it->second; + auto chunk_id = chunk->chunk_id(); + + LOGINFO("Scenario 4: Testing chunk_id={}", chunk_id); + + // Simulate cp_flush holding a reference to the old allocator before reset begins. + LOGINFO("Scenario 4: Simulate cp_flush acquiring shared_ptr to old allocator before reset"); + auto old_alloc = chunk->blk_allocator_shared(); + + // Simulate GC's purge_reserved_chunk calling reset_block_allocator(). + LOGINFO("Scenario 4: Simulate GC calling reset_block_allocator on the chunk"); + chunk->reset_block_allocator(); + auto alloc_after_reset = chunk->blk_allocator_shared(); + ASSERT_FALSE(old_alloc.owner_before(alloc_after_reset) || alloc_after_reset.owner_before(old_alloc)); + + // Simulate gc_repl_reqs calling free() on the old allocator reference after reset + LOGINFO("Scenario 4: Simulate gc_repl_reqs calling free() on old allocator reference after reset"); + BlkId fake_bid{0, 1, chunk_id}; + old_alloc->free(fake_bid); + + // Simulate cp_flush running on the old allocator reference. + // With fix: safe - m_sb is still valid, writes commit_offset=0 correctly. + // Without fix: UB - m_sb was destroyed in reset(), writing to it crashes. + LOGINFO("Scenario 4: Simulate cp_flush running on old allocator reference after reset and free"); + old_alloc->cp_flush(nullptr); + + LOGINFO("Scenario 4: PASSED - free() + cp_flush on old allocator after reset is safe"); +} + #endif // _PRERELEASE // Stream related test