From ceaba2c44d23ac0bd81e05eaa7f9ed4a427cfa35 Mon Sep 17 00:00:00 2001
From: yawzhang <yawzhang@ebay.com>
Date: Fri, 15 May 2026 15:41:51 +0800
Subject: [PATCH] SDSTOR-21863: Revert AppendBlkAllocator Reset logic to
 prevent race condition between upper layer reset and self operation on it
 (cp_flush and free)

Fix https://github.com/eBay/HomeObject/issues/401

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 conanfile.py                              |  2 +-
 src/lib/blkalloc/append_blk_allocator.cpp | 25 +++++++---
 src/lib/device/virtual_dev.cpp            | 12 ++++-
 src/tests/test_data_service.cpp           | 60 ++++++++++++++++++++---
 4 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 66e884eb9..0d20a477d 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@
 
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "7.5.8"
+    version = "7.5.9"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp
index 50b6e81cb..40a718b92 100644
--- a/src/lib/blkalloc/append_blk_allocator.cpp
+++ b/src/lib/blkalloc/append_blk_allocator.cpp
@@ -155,17 +155,30 @@ bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const {
 }
 
 void AppendBlkAllocator::reset() {
+    // Reset in-place: restore all in-memory counters to the initial empty state and mark dirty
+    // so the next cp_flush persists commit_offset=0 to the existing superblock.
+    //
+    // We intentionally do NOT destroy the superblock or deregister the handler here, unlike
+    // BitmapBlkAllocator::reset(). AppendBlkAllocator is managed by the upper layer (HomeObject
+    // GC), which reuses the same chunk across multiple GC cycles. Keeping the same allocator
+    // instance alive avoids the race described in https://github.com/eBay/HomeObject/issues/401:
+    //
+    //   T1: cp_flush holds a shared_ptr to the old allocator
+    //   T2: reset() destroys m_sb [old behavior]
+    //   T3: gc_repl_reqs calls free() on old allocator -> m_is_dirty=true
+    //   T4: cp_flush writes to the destroyed m_sb -> crash
+    //
+    // With in-place reset the superblock is always valid, so T4 is safe regardless of T3.
     std::lock_guard lg(m_sb_mtx);
-    m_is_dirty.store(false);
-    m_sb.destroy();
-    meta_service().deregister_handler(get_name());
+    m_last_append_offset.store(0);
+    m_freeable_nblks.store(0);
+    m_commit_offset.store(0);
+    m_is_dirty.store(true);
 }
 
 bool AppendBlkAllocator::is_blk_alloced_on_disk(BlkId const& bid, bool use_lock) const {
     std::lock_guard lg(m_sb_mtx);
-    if (!m_sb.is_empty()) { return bid.blk_num() < m_sb->commit_offset; }
-    // if allocator is reset, the sb will be destroyed, and all the blks will be treated as free;
-    return false;
+    return bid.blk_num() < m_sb->commit_offset;
 }
 
 std::string AppendBlkAllocator::get_name() const { return "AppendBlkAlloc_chunk_" + std::to_string(m_chunk_id); }
diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp
index 73d39fc55..b3c4bb3ac 100644
--- a/src/lib/device/virtual_dev.cpp
+++ b/src/lib/device/virtual_dev.cpp
@@ -129,6 +129,16 @@ void VirtualDev::add_chunk(cshared< Chunk >& chunk, bool is_fresh_chunk) {
 }
 
 void VirtualDev::reset_chunk_blk_allocator(Chunk* chunk) {
+    if (m_allocator_type == blk_allocator_type_t::append) {
+        // AppendBlkAllocator resets in-place via its reset() method and no new allocator instance is created.
+        // Creating a new instance would require deregistering the old handler and registering a
+        // new one, which introduces a window where a concurrent cp_flush holding the old
+        // allocator's shared_ptr could write to an already-destroyed superblock.
+        // See https://github.com/eBay/HomeObject/issues/401.
+
+        LOGINFO("AppendBlkAllocator resets in-place, skip creating new instance for chunk {}", chunk->chunk_id());
+        return;
+    }
     auto ba = create_blk_allocator(m_allocator_type, block_size(), chunk->physical_dev()->optimal_page_size(),
                                    chunk->physical_dev()->align_size(), chunk->size(), m_auto_recovery,
                                    chunk->chunk_id(), true /* is_init */, m_use_slab_in_blk_allocator);
@@ -743,8 +753,6 @@ void VirtualDev::cp_flush(VDevCPContext* v_cp_ctx) {
     // pass down cp so that underlying components can get their customized CP context if needed;
     m_chunk_selector->foreach_chunks([this, cp](cshared< Chunk >& chunk) {
         HS_LOG(TRACE, device, "Flushing chunk: {}, vdev: {}", chunk->chunk_id(), m_vdev_info.name);
-        // Hold shared_ptr to prevent allocator from being reset during cp_flush.
-        // This fixes race with GC's purge_reserved_chunk() which calls reset_block_allocator().
         auto alloc_ptr = chunk->blk_allocator_shared();
 
 #ifdef _PRERELEASE
diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp
index 53b5fe19d..8efad5ba6 100644
--- a/src/tests/test_data_service.cpp
+++ b/src/tests/test_data_service.cpp
@@ -1123,8 +1123,8 @@ TEST_F(BlkDataServiceAppendTest, TestResetCompletedBeforeCpFlush) {
     LOGINFO("Scenario 1: PASSED - Order: get_ptr -> reset_block_allocator -> cp_flush");
 }
 
-// Scenario 2: Order: cp_flush holds shared_ptr -> old allocator reset -> cp_flush on old -> create new allocator
-// Test that cp_flush executes after old allocator reset, then new allocator is created
+// Scenario 2: Order: cp_flush holds shared_ptr -> allocator reset (in-place) -> cp_flush completes on same allocator
+// Test that cp_flush completes safely after in-place reset of the same allocator
 TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) {
     vdev_info vinfo;
     auto data_vdev = inst().open_vdev(vinfo, true);
@@ -1185,12 +1185,12 @@ TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) {
             LOGINFO("Scenario 2: Old allocator reset() completed");
             old_allocator_reset.store(true);
 
-            // Wait for cp_flush to complete on old allocator before creating new one
+            // Wait for cp_flush to complete before reset thread continues
             for (int i = 0; i < 200 && !cp_flush_completed.load(); ++i) {
                 std::this_thread::sleep_for(std::chrono::milliseconds(10));
             }
-            RELEASE_ASSERT(cp_flush_completed.load(), "cp_flush should complete before creating new allocator");
-            LOGINFO("Scenario 2: cp_flush completed, proceeding to create new allocator");
+            RELEASE_ASSERT(cp_flush_completed.load(), "cp_flush should complete before reset thread continues");
+            LOGINFO("Scenario 2: cp_flush completed, reset thread proceeding");
         }));
 
     std::thread flush_thread([&]() {
@@ -1205,7 +1205,7 @@ TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) {
     std::thread reset_thread([&]() {
         chunk->reset_block_allocator();
         reset_completed.store(true);
-        LOGINFO("Scenario 2: reset_block_allocator completed (new allocator created)");
+        LOGINFO("Scenario 2: reset_block_allocator completed (in-place reset)");
     });
 
     flush_thread.join();
@@ -1219,7 +1219,7 @@ TEST_F(BlkDataServiceAppendTest, TestResetDuringCpFlushHoldsSharedPtr) {
     ASSERT_TRUE(cp_flush_completed.load());
     ASSERT_TRUE(reset_completed.load());
     ASSERT_TRUE(old_allocator_reset.load());
-    LOGINFO("Scenario 2: PASSED - Order: get_ptr -> old_allocator_reset -> cp_flush -> create new allocator");
+    LOGINFO("Scenario 2: PASSED - Order: get_ptr -> allocator_reset (in-place) -> cp_flush on same allocator");
 }
 
 // Scenario 3: Order: cp_flush acquires mutex first -> reset_block_allocator blocks on mutex
@@ -1299,6 +1299,52 @@ TEST_F(BlkDataServiceAppendTest, TestCpFlushBlocksResetWithMutex) {
     LOGINFO("Scenario 3: PASSED - mutex protected m_sb from concurrent reset_block_allocator during cp_flush");
 }
 
+// Scenario 4: Order: cp_flush holds shared_ptr -> reset() completes -> free() re-sets dirty on old allocator
+//        -> cp_flush runs on old allocator
+// Test the race from https://github.com/eBay/HomeObject/issues/401:
+// with the old AppendBlkAllocator::reset() (destroy + recreate):
+//   - reset() destroys m_sb of the old allocator
+//   - free() sets m_is_dirty=true on the old allocator (whose m_sb is already destroyed)
+//   - cp_flush writes to the destroyed m_sb -> crash / undefined behavior
+TEST_F(BlkDataServiceAppendTest, TestFreeAfterResetBeforeCpFlushSafe) {
+    vdev_info vinfo;
+    auto data_vdev = inst().open_vdev(vinfo, true);
+    auto chunks = data_vdev->get_chunks();
+
+    auto it = std::find_if(chunks.begin(), chunks.end(), [](const auto& pair) {
+        return pair.second && pair.second->blk_allocator() &&
+            pair.second->blk_allocator()->get_name().find("append_chunk_") == 0;
+    });
+    ASSERT_NE(it, chunks.end()) << "No AppendBlkAllocator chunks found";
+    auto chunk = it->second;
+    auto chunk_id = chunk->chunk_id();
+
+    LOGINFO("Scenario 4: Testing chunk_id={}", chunk_id);
+
+    // Simulate cp_flush holding a reference to the old allocator before reset begins.
+    LOGINFO("Scenario 4: Simulate cp_flush acquiring shared_ptr to old allocator before reset");
+    auto old_alloc = chunk->blk_allocator_shared();
+
+    // Simulate GC's purge_reserved_chunk calling reset_block_allocator().
+    LOGINFO("Scenario 4: Simulate GC calling reset_block_allocator on the chunk");
+    chunk->reset_block_allocator();
+    auto alloc_after_reset = chunk->blk_allocator_shared();
+    ASSERT_FALSE(old_alloc.owner_before(alloc_after_reset) || alloc_after_reset.owner_before(old_alloc));
+
+    // Simulate gc_repl_reqs calling free() on the old allocator reference after reset
+    LOGINFO("Scenario 4: Simulate gc_repl_reqs calling free() on old allocator reference after reset");
+    BlkId fake_bid{0, 1, chunk_id};
+    old_alloc->free(fake_bid);
+
+    // Simulate cp_flush running on the old allocator reference.
+    // With fix: safe - m_sb is still valid, writes commit_offset=0 correctly.
+    // Without fix: UB - m_sb was destroyed in reset(), writing to it crashes.
+    LOGINFO("Scenario 4: Simulate cp_flush running on old allocator reference after reset and free");
+    old_alloc->cp_flush(nullptr);
+
+    LOGINFO("Scenario 4: PASSED - free() + cp_flush on old allocator after reset is safe");
+}
+
 #endif // _PRERELEASE
 
 // Stream related test