diff --git a/docs/knowledges/homestore_checkpoint_system.md b/docs/knowledges/homestore_checkpoint_system.md
new file mode 100644
index 000000000..b80e456dd
--- /dev/null
+++ b/docs/knowledges/homestore_checkpoint_system.md
@@ -0,0 +1,381 @@
+# HomeStore Checkpoint (CP) System Architecture
+
+## Overview
+HomeStore's checkpoint system provides transactional consistency by coordinating flush operations across multiple consumers (INDEX_SVC, REPLICATION_SVC, BLK_DATA_SVC, LOG_SVC, SEALER).
+
+## Key Components
+
+### 1. CP (Checkpoint) Object
+**Location**: `HomeStore/src/include/homestore/checkpoint/cp.hpp`
+
+**Structure**:
+```cpp
+class CP {
+    cp_id_t m_cp_id;
+    std::atomic<cp_status_t> m_cp_status;
+    CPManager* m_cp_mgr;
+    std::array<std::unique_ptr<CPContext>, (size_t)cp_consumer_t::SENTINEL> m_contexts;
+    folly::SharedPromise<bool> m_comp_promise;
+    Clock::time_point m_cp_start_time;
+};
+```
+
+**CP States**:
+- `cp_trigger`: CP flush triggered
+- `cp_flush_prepare`: Switchover completed, preparing to flush
+- `cp_io_ready`: New CP ready for IO operations
+- `cp_flushing`: Currently flushing
+- `cp_flush_done`: Flush completed
+- `cp_cleaning`: Cleanup in progress
+
+### 2. CPContext Base Class
+**Location**: `HomeStore/src/include/homestore/checkpoint/cp_mgr.hpp`
+
+```cpp
+class CPContext {
+protected:
+    CP* m_cp;
+    folly::Promise<bool> m_flush_comp;  // Used by collectAll in cp_flush
+public:
+    CPContext(CP* cp) : m_cp{cp} {}
+    void complete(bool status) { m_flush_comp.setValue(status); }
+    folly::Future<bool> get_future() { return m_flush_comp.getFuture(); }
+    virtual ~CPContext() = default;
+};
+```
+
+**Important**: Each consumer creates its own CPContext subclass to hold consumer-specific CP state.
+
+### 3. CPManager
+**Location**: `HomeStore/src/lib/checkpoint/cp_mgr.cpp`
+
+**Key Members**:
+```cpp
+class CPManager {
+    CP* m_cur_cp;  // RCU-protected pointer
+    std::mutex m_trigger_cp_mtx;  // Serializes CP trigger operations
+    std::atomic<bool> m_in_flush_phase;  // Prevents concurrent flushes
+    bool m_pending_trigger_cp;  // Back-to-back CP flag
+    folly::SharedPromise<bool> m_pending_trigger_cp_comp;  // For queued CP
+    std::vector<iomgr::io_fiber_t> m_cp_io_fibers;  // 2 blocking IO fibers
+    std::array<std::unique_ptr<CPCallbacks>, SENTINEL> m_cp_cb_table;
+};
+```
+
+## CP Lifecycle
+
+### Phase 1: Trigger (do_trigger_cp_flush)
+**File**: `cp_mgr.cpp:201-270`
+
+```cpp
+folly::Future<bool> CPManager::do_trigger_cp_flush(bool force, bool flush_on_shutdown) {
+    std::unique_lock<std::mutex> lk(m_trigger_cp_mtx);
+
+    // Back-to-back CP handling
+    if (m_in_flush_phase) {
+        if (force && (!m_cp_shutdown_initiated || flush_on_shutdown)) {
+            if (!m_pending_trigger_cp) {
+                m_pending_trigger_cp = true;
+                m_pending_trigger_cp_comp = std::move(folly::SharedPromise<bool>{});
+            }
+            return m_pending_trigger_cp_comp.getFuture();
+        }
+        return folly::makeFuture<bool>(false);
+    }
+    m_in_flush_phase = true;
+
+    auto cur_cp = cp_guard();  // Get current CP
+    auto new_cp = new CP(this);
+    new_cp->m_cp_id = cur_cp->m_cp_id + 1;
+
+    // Phase 1a: Switchover SEALER first
+    auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER];
+    if (sealer_cp) {
+        new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] =
+            std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp));
+    }
+
+    // Phase 1b: Switchover other consumers
+    for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) {
+        if (svcid == (size_t)cp_consumer_t::SEALER) continue;
+        auto& consumer = m_cp_cb_table[svcid];
+        if (consumer) {
+            new_cp->m_contexts[svcid] = std::move(consumer->on_switchover_cp(cur_cp.get(), new_cp));
+        }
+    }
+
+    // Phase 1c: Setup promise and switch m_cur_cp pointer
+    if (m_pending_trigger_cp) {
+        cur_cp->m_comp_promise = std::move(m_pending_trigger_cp_comp);
+        m_pending_trigger_cp = false;
+    } else {
+        cur_cp->m_comp_promise = std::move(folly::SharedPromise<bool>{});
+    }
+    ret_fut = cur_cp->m_comp_promise.getFuture();
+
+    cur_cp->m_cp_status = cp_status_t::cp_flush_prepare;
+    new_cp->m_cp_status = cp_status_t::cp_io_ready;
+
+    // RCU pointer swap - guarantees all readers see new CP
+    rcu_xchg_pointer(&m_cur_cp, new_cp);
+    synchronize_rcu();  // Wait for all RCU readers to finish
+
+    lk.unlock();
+    return ret_fut;
+}
+```
+
+**Key Synchronization**:
+- `m_trigger_cp_mtx`: Serializes all CP trigger operations
+- `m_in_flush_phase`: Set to `true` during entire CP flush, prevents concurrent triggers
+- `rcu_xchg_pointer` + `synchronize_rcu()`: Safe pointer switching without locking read path
+
+### Phase 2: Flush (cp_start_flush)
+**File**: `cp_mgr.cpp:272-293`
+
+```cpp
+void CPManager::cp_start_flush(CP* cp) {
+    std::vector<folly::Future<bool>> futs;
+    cp->m_cp_status = cp_status_t::cp_flushing;
+
+    // Collect futures from all consumers (except SEALER)
+    for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) {
+        if (svcid == (size_t)cp_consumer_t::SEALER) continue;
+        auto& consumer = m_cp_cb_table[svcid];
+        bool participated = (cp->m_contexts[svcid] != nullptr);
+        if (consumer && participated) {
+            futs.emplace_back(std::move(consumer->cp_flush(cp)));
+        }
+    }
+
+    // Wait for all consumers to flush
+    folly::collectAllUnsafe(futs).thenValue([this, cp](auto) {
+        // SEALER (REPLICATION_SVC) flushes last synchronously
+        auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER];
+        bool participated = (cp->m_contexts[(size_t)cp_consumer_t::SEALER] != nullptr);
+        if (sealer_cp && participated) {
+            sealer_cp->cp_flush(cp).wait();
+        }
+        on_cp_flush_done(cp);
+    });
+}
+```
+
+**Important**:
+- `collectAllUnsafe` holds references to futures from `cp->m_contexts[]->get_future()`
+- SEALER flushes last because it updates cp_lsn (other components must flush up to this LSN)
+
+### Phase 3: Completion (on_cp_flush_done)
+**File**: `cp_mgr.cpp:295-331`
+
+```cpp
+void CPManager::on_cp_flush_done(CP* cp) {
+    cp->m_cp_status = cp_status_t::cp_flush_done;
+
+    iomanager.run_on_forget(pick_blocking_io_fiber(), [this, cp]() {
+        ++(m_sb->m_last_flushed_cp);
+        m_sb.write();
+
+        cleanup_cp(cp);  // Call all consumers' cp_cleanup()
+
+        auto promise = std::move(cp->m_comp_promise);
+        delete cp;  // Destructs m_contexts[] in array order: [0]→[1]→[2]→[3]
+
+        bool trigger_back_2_back_cp{false};
+        {
+            std::unique_lock<std::mutex> lk(m_trigger_cp_mtx);
+            m_in_flush_phase = false;  // Release flush lock
+            trigger_back_2_back_cp = m_pending_trigger_cp;
+        }
+
+        promise.setValue(true);  // Notify waiters
+
+        if (trigger_back_2_back_cp) {
+            trigger_cp_flush(false);  // Start queued CP
+        }
+    });
+}
+```
+
+**Critical Details**:
+- Runs on `pick_blocking_io_fiber()` - randomly picks 1 of 2 cp_io fibers
+- `delete cp` destructs `m_contexts[]` in array index order
+- `m_in_flush_phase = false` allows next CP to proceed
+- Promise is set AFTER releasing lock to avoid race
+
+### Phase 4: Cleanup
+**File**: `cp_mgr.cpp:333-338`
+
+```cpp
+void CPManager::cleanup_cp(CP* cp) {
+    cp->m_cp_status = cp_status_t::cp_cleaning;
+    for (auto& consumer : m_cp_cb_table) {
+        if (consumer) { consumer->cp_cleanup(cp); }
+    }
+}
+```
+
+## Back-to-Back CP Mechanism
+
+**Scenario**: Multiple `trigger_cp_flush(force=true)` calls while CP is flushing
+
+**Behavior**:
+1. First call starts CP=N, sets `m_in_flush_phase = true`
+2. Second call (while CP=N flushing) sets `m_pending_trigger_cp = true`, queues CP=N+1
+3. Third call (while still flushing) reuses same `m_pending_trigger_cp_comp` promise
+4. When CP=N completes, `on_cp_flush_done` checks `m_pending_trigger_cp` and auto-triggers CP=N+1
+5. Only the LAST queued CP is triggered (earlier ones are merged)
+
+**Code Flow**:
+```
+T1: trigger_cp_flush(force=true)  → Start CP=443
+T2: trigger_cp_flush(force=true)  → Queue CP=444 (m_pending_trigger_cp = true)
+T3: trigger_cp_flush(force=true)  → Reuse CP=444 promise (still m_pending_trigger_cp = true)
+T4: CP=443 completes → on_cp_flush_done → trigger_cp_flush(false) → Start CP=444
+```
+
+## CP IO Fibers
+
+**Creation**: `cp_mgr.cpp:340-372`
+
+```cpp
+void CPManager::start_cp_thread() {
+    auto const num_fibers = HS_DYNAMIC_CONFIG(generic.cp_io_fibers); // default: 2
+    iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, num_fibers, [this, ctx](bool is_started) {
+        if (is_started) {
+            auto v = iomanager.sync_io_capable_fibers();
+            m_cp_io_fibers.insert(m_cp_io_fibers.end(), v.begin(), v.end());
+        }
+    });
+}
+
+iomgr::io_fiber_t CPManager::pick_blocking_io_fiber() const {
+    static thread_local std::random_device s_rd{};
+    static thread_local std::default_random_engine s_re{s_rd()};
+    static auto rand_fiber = std::uniform_int_distribution<size_t>(0, m_cp_io_fibers.size() - 1);
+    return m_cp_io_fibers[rand_fiber(s_re)];  // Randomly pick one of 2 fibers
+}
+```
+
+**Key Points**:
+- 2 fibers by default (configurable via `generic.cp_io_fibers`)
+- Created with `INTERRUPT_LOOP` mode for blocking IO operations
+- `pick_blocking_io_fiber()` randomly distributes tasks across fibers
+- **Implication**: Multiple CPs' `on_cp_flush_done` can execute concurrently on different fibers
+
+## RCU (Read-Copy-Update) for m_cur_cp
+
+**Purpose**: Lock-free read access to current CP pointer
+
+**Read Path** (`cp_mgr.cpp:160-171`):
+```cpp
+CP* CPManager::cp_io_enter() {
+    rcu_read_lock();
+    auto cp = get_cur_cp();  // Read m_cur_cp
+    if (!cp) {
+        rcu_read_unlock();
+        return nullptr;
+    }
+    cp_ref(cp);  // Increment refcount
+    rcu_read_unlock();
+    return cp;
+}
+```
+
+**Write Path** (`cp_mgr.cpp:260-261`):
+```cpp
+rcu_xchg_pointer(&m_cur_cp, new_cp);
+synchronize_rcu();  // Wait for all readers in RCU critical section to exit
+```
+
+**Guarantee**: After `synchronize_rcu()` returns, no thread is accessing old CP without holding a reference (via `cp_ref()`).
+
+## Consumer Order
+
+### Switchover Order (on_switchover_cp)
+1. **SEALER** (REPLICATION_SVC) - First
+2. Other consumers: INDEX_SVC, BLK_DATA_SVC, LOG_SVC
+
+### Flush Order (cp_flush)
+1. INDEX_SVC, BLK_DATA_SVC, LOG_SVC - Parallel (via `collectAllUnsafe`)
+2. **SEALER** (REPLICATION_SVC) - Last, synchronous
+
+### Cleanup Order (cp_cleanup)
+- All consumers in `m_cp_cb_table` order
+
+### Destruction Order (CP::~CP)
+- `m_contexts[]` destructs in array index order:
+  - [0] INDEX_SVC
+  - [1] BLK_DATA_SVC
+  - [2] LOG_SVC
+  - [3] REPLICATION_SVC (SEALER)
+
+## Important Invariants
+
+1. **No Concurrent Flushes**: `m_trigger_cp_mtx` + `m_in_flush_phase` ensure only one CP flushes at a time
+2. **No Concurrent Triggers**: `m_trigger_cp_mtx` serializes all `do_trigger_cp_flush` calls
+3. **Safe CP Pointer Switch**: RCU + refcounting ensure old CP not accessed after switchover
+4. **Back-to-Back Merging**: Only last queued force-flush CP is triggered
+5. **SEALER Last**: REPLICATION_SVC always flushes last to establish cp_lsn watermark
+
+## Common Issues
+
+### Issue 1: Returning Wrong Future in cp_flush
+**Symptom**: CP completes immediately even if work is pending
+**Cause**: Returning `folly::makeFuture<bool>(true)` instead of `CPContext::get_future()`
+**Impact**: `collectAllUnsafe` completes prematurely, may trigger cleanup while work ongoing
+
+### Issue 2: Concurrent Access to CPContext
+**Symptom**: Use-after-free, memory corruption
+**Cause**: `pick_blocking_io_fiber()` schedules cleanup on different fiber than switchover
+**Solution**: Ensure synchronization between `on_switchover_cp` and `cp_cleanup`
+
+### Issue 3: Deadlock in Signal Handler
+**Symptom**: Process hangs instead of exiting on crash
+**Cause**: Signal handler calls malloc-dependent logging (spdlog) while malloc holds arena lock
+**Solution**: Use async-signal-safe functions only in signal handlers
+
+## Related Configuration
+
+- `generic.cp_io_fibers`: Number of CP IO fibers (default: 2)
+- `generic.repl_dev_cleanup_interval_sec`: Interval for repl_dev GC timer
+- Various consumer-specific CP flush intervals
+
+## Example Timeline (Back-to-Back CP)
+
+```
+T0: CP=443 working
+T1: destroy_pg(1) → trigger_cp_flush(force=true)
+    → m_in_flush_phase=true, queue CP=444
+
+T2: CP=443 flush done
+    → on_cp_flush_done(443) on Fiber A
+       → cleanup_cp(443)
+       → delete CP=443
+       → m_in_flush_phase = false
+       → trigger_cp_flush(false) → Start CP=444
+
+T3: do_trigger_cp_flush(444)
+    → on_switchover_cp(444, 445)  // CRITICAL: Happens BEFORE CP=444 starts flushing!
+       → Creates CP=445 contexts
+       → RaftReplServiceCPHandler::on_switchover_cp(444, 445)
+          → Accesses CP=444's m_cp_ctx_map to save state
+    → cp_start_flush(444)
+
+T4: destroy_pg(2) → trigger_cp_flush(force=true)
+    → m_in_flush_phase=true, queue CP=445
+
+T5: CP=444 flush done (all consumers skip)
+    → on_cp_flush_done(444) on Fiber B  // May be different fiber!
+       → cleanup_cp(444)
+       → delete CP=444
+       → m_in_flush_phase = false
+       → trigger CP=445
+
+T6: CP=445 starts, completes, cleanup
+    → delete CP=445
+       → ~m_contexts[0] (INDEX_SVC)
+       → ~m_contexts[1] (BLK_DATA_SVC)  // May corrupt memory here
+       → ~m_contexts[2] (LOG_SVC)
+       → ~m_contexts[3] (REPLICATION_SVC)  // CRASH: m_cp_ctx_map corrupted
+```
diff --git a/docs/knowledges/replication_service_cp.md b/docs/knowledges/replication_service_cp.md
new file mode 100644
index 000000000..6c6021cf5
--- /dev/null
+++ b/docs/knowledges/replication_service_cp.md
@@ -0,0 +1,379 @@
+# Replication Service Checkpoint Implementation
+
+## Overview
+RaftReplService implements checkpoint (CP) callbacks to persist replication state across all ReplDev instances during HomeStore checkpoint operations.
+
+## Key Classes
+
+### 1. ReplSvcCPContext
+**Location**: `HomeStore/src/lib/replication/service/raft_repl_service.h:132-141`
+
+```cpp
+class ReplSvcCPContext : public CPContext {
+    std::shared_mutex m_cp_map_mtx;  // ⚠️ DECLARED BUT NEVER USED!
+    std::map<ReplDev*, cshared<ReplDevCPContext>> m_cp_ctx_map;  // Raw pointer as key
+
+public:
+    ReplSvcCPContext(CP* cp) : CPContext(cp){};
+    virtual ~ReplSvcCPContext() = default;
+
+    int add_repl_dev_ctx(ReplDev* dev, cshared<ReplDevCPContext> dev_ctx);
+    cshared<ReplDevCPContext> get_repl_dev_ctx(ReplDev* dev);
+};
+```
+
+**Critical Bug**: `m_cp_map_mtx` is declared but never locked in `add_repl_dev_ctx()` or `get_repl_dev_ctx()`!
+
+**Implementation** (`raft_repl_service.cpp:760-770`):
+```cpp
+int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared<ReplDevCPContext> dev_ctx) {
+    m_cp_ctx_map.emplace(dev, dev_ctx);  // ⚠️ NO LOCK PROTECTION!
+    return 0;
+}
+
+cshared<ReplDevCPContext> ReplSvcCPContext::get_repl_dev_ctx(ReplDev* dev) {
+    if (m_cp_ctx_map.count(dev) == 0) {
+        return std::make_shared<ReplDevCPContext>();
+    }
+    return m_cp_ctx_map[dev];  // ⚠️ NO LOCK PROTECTION!
+}
+```
+
+**Data Structure**: `m_cp_ctx_map` is a `std::map` (red-black tree) with:
+- **Key**: Raw `ReplDev*` pointer
+- **Value**: `shared_ptr<ReplDevCPContext>`
+
+### 2. ReplDevCPContext
+**Purpose**: Holds per-ReplDev checkpoint state (LSN, etc.)
+
+```cpp
+struct ReplDevCPContext {
+    repl_lsn_t lsn;  // Last LSN to checkpoint
+    // Other repl_dev specific CP state
+};
+```
+
+### 3. RaftReplServiceCPHandler
+**Location**: `HomeStore/src/lib/replication/service/raft_repl_service.h:143-153`
+
+Implements the CPCallbacks interface for REPLICATION_SVC consumer.
+
+## CP Callback Implementations
+
+### on_switchover_cp()
+**Location**: `raft_repl_service.cpp:773-790`
+
+```cpp
+std::unique_ptr<CPContext> RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) {
+    // checking if cur_cp == nullptr as on_switchover_cp will be called when registering the cp handler
+    if (cur_cp != nullptr) {
+        // Add cp info from all devices to current cp.
+        // We dont need taking cp_guard as cp_mgr already taken it in do_trigger_cp_flush
+        auto cur_cp_ctx = s_cast<ReplSvcCPContext*>(cur_cp->context(cp_consumer_t::REPLICATION_SVC));
+
+        repl_service().iterate_repl_devs([cur_cp, cur_cp_ctx](cshared<ReplDev>& repl_dev) {
+            // Collect the LSN of each repl dev and put it into current CP
+            auto dev_ctx = std::static_pointer_cast<RaftReplDev>(repl_dev)->get_cp_ctx(cur_cp);
+            cur_cp_ctx->add_repl_dev_ctx(repl_dev.get(), std::move(dev_ctx));  // Line 784
+        });
+    }
+    // create new ctx for new_cp
+    auto ctx = std::make_unique<ReplSvcCPContext>(new_cp);
+    return ctx;
+}
+```
+
+**Key Points**:
+- Called during `do_trigger_cp_flush` while holding `m_trigger_cp_mtx`
+- Populates **cur_cp's** m_cp_ctx_map, not new_cp's
+- Iterates all repl_devs and collects their CP state
+- Comment says "We dont need taking cp_guard" - assumes CPManager lock provides protection
+
+**Execution Context**: Runs on thread calling `trigger_cp_flush`, protected by `m_trigger_cp_mtx`
+
+### cp_flush()
+**Location**: `raft_repl_service.cpp:792-799`
+
+```cpp
+folly::Future<bool> RaftReplServiceCPHandler::cp_flush(CP* cp) {
+    auto cp_ctx = s_cast<ReplSvcCPContext*>(cp->context(cp_consumer_t::REPLICATION_SVC));
+
+    repl_service().iterate_repl_devs([cp, cp_ctx](cshared<ReplDev>& repl_dev) {
+        auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get());  // Line 795
+        std::static_pointer_cast<RaftReplDev>(repl_dev)->cp_flush(cp, dev_ctx);
+    });
+
+    return folly::makeFuture<bool>(true);  // ⚠️ BUG: Should return cp_ctx->get_future()
+}
+```
+
+**Known Bug**: Returns immediate future instead of `cp_ctx->get_future()`. However:
+- All work is actually completed synchronously before return
+- `RaftReplDev::cp_flush()` is synchronous
+- Returning `true` is semantically correct for this implementation
+
+**Why It Works**: Unlike async implementations, this handler completes all flush work before returning, so immediate `true` is safe.
+
+### cp_cleanup()
+**Location**: `raft_repl_service.cpp:801-804`
+
+```cpp
+void RaftReplServiceCPHandler::cp_cleanup(CP* cp) {
+    repl_service().iterate_repl_devs(
+        [cp](cshared<ReplDev>& repl_dev) {
+            std::static_pointer_cast<RaftReplDev>(repl_dev)->cp_cleanup(cp);
+        });
+}
+```
+
+**Execution Context**: Called from `CPManager::cleanup_cp()` which runs on `pick_blocking_io_fiber()`.
+
+## Critical Race Condition (Unconfirmed)
+
+### Hypothetical Scenario
+```
+Thread A (do_trigger_cp_flush):
+  ├─ Acquires m_trigger_cp_mtx
+  ├─ on_switchover_cp(CP=444, CP=445)
+  │   └─ cur_cp_ctx->add_repl_dev_ctx(...)  // Modifies CP=444's m_cp_ctx_map
+  ├─ Releases m_trigger_cp_mtx
+  └─ cp_start_flush(444)
+
+Thread B (on_cp_flush_done for CP=443):
+  ├─ run_on_forget(pick_blocking_io_fiber(), [...])
+  │   ├─ cleanup_cp(443)
+  │   ├─ delete CP=443
+  │   ├─ Sets m_in_flush_phase = false
+  │   └─ trigger_cp_flush(false) → Starts CP=444
+  │       └─ Acquires m_trigger_cp_mtx
+  │           └─ on_switchover_cp(444, 445)
+  │               └─ add_repl_dev_ctx(...)  // ⚠️ Concurrent access?
+```
+
+**Why This May Not Be a Race**:
+- `m_trigger_cp_mtx` serializes all `do_trigger_cp_flush` calls
+- `on_switchover_cp` only runs while holding this lock
+- `cleanup_cp` runs AFTER CP flush completes, when switchover for next CP already done
+- **However**: With 2 cp_io fibers, cleanup can run concurrently with next CP's switchover
+
+### Actual Issue: Memory Corruption During Destruction
+
+**Observed Evidence** (from GDB):
+```
+CP=445 destruction:
+  ~m_contexts[0] (INDEX_SVC)     → OK
+  ~m_contexts[1] (BLK_DATA_SVC)  → Triggers folly::collectAll::Context cleanup
+                                 → Corrupts memory (ASan markers appear)
+  ~m_contexts[2] (LOG_SVC)       → OK
+  ~m_contexts[3] (REPLICATION_SVC) → m_cp_ctx_map already corrupted
+                                   → CRASH in ~Promise()
+```
+
+**Red-Black Tree Corruption**:
+- `m_cp_ctx_map` uses `std::map` (red-black tree)
+- Tree nodes found with ASan markers: `0x0000610000000000` (heap-left-redzone)
+- Parent pointers corrupted: `0x2252feb743377eb5` (invalid)
+
+## ReplDev Lifecycle vs CP
+
+### ReplDev Destruction
+**Location**: `HomeObject/src/lib/homestore_backend/hs_pg_manager.cpp:589-613`
+
+```cpp
+bool HSHomeObject::pg_destroy(pg_id_t pg_id, bool need_to_pause_pg_state_machine) {
+    mark_pg_destroyed(pg_id);
+    destroy_shards(pg_id);
+    destroy_hs_resources(pg_id);
+    destroy_pg_index_table(pg_id);
+    destroy_pg_superblk(pg_id);  // Triggers CP flush with force=true
+
+    // ⚠️ NOTE: Does NOT destroy ReplDev or remove from m_rd_map!
+    return true;
+}
+```
+
+**Key Point**: PG destroy does NOT destroy ReplDev. The ReplDev pointer remains valid and in `m_rd_map`.
+
+**Implication**: Using `ReplDev*` as map key is safe during CP lifecycle, no dangling pointers.
+
+### When ReplDev is Actually Destroyed
+**Location**: `HomeStore/src/lib/replication/service/raft_repl_service.cpp:699-735`
+
+```cpp
+void RaftReplService::gc_repl_devs() {
+    std::vector<group_id_t> groups_to_leave;
+    {
+        std::shared_lock lg(m_rd_map_mtx);
+        for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) {
+            auto rdev = std::dynamic_pointer_cast<RaftReplDev>(it->second);
+            if (rdev->is_destroy_pending() &&
+                (get_elapsed_time_sec(rdev->destroyed_time()) >=
+                 HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) {
+                groups_to_leave.push_back(rdev->group_id());
+            }
+        }
+    }
+
+    for (const auto& group_id : groups_to_leave) {
+        m_msg_mgr->leave_group(group_id);
+        m_repl_app->destroy_repl_dev_listener(group_id);
+        {
+            std::unique_lock lg(m_rd_map_mtx);
+            m_rd_map.erase(group_id);  // Finally remove from map
+        }
+    }
+}
+```
+
+**Timing**: ReplDev destroyed after configurable delay (default: seconds), long after PG destroy completes.
+
+## Synchronization Analysis
+
+### Protection Mechanisms
+
+1. **m_trigger_cp_mtx**: Serializes CP trigger operations
+   - Held during entire `on_switchover_cp` execution
+   - Prevents concurrent switchover calls
+
+2. **m_in_flush_phase**: Prevents concurrent CP flushes
+   - Set to `true` at start of `do_trigger_cp_flush`
+   - Set to `false` in `on_cp_flush_done`
+
+3. **RCU for m_cur_cp**: Lock-free read access
+   - `rcu_xchg_pointer` + `synchronize_rcu` for safe pointer swap
+   - Guarantees no readers accessing old CP without reference
+
+4. **NOT Protected**: `m_cp_ctx_map` access
+   - `add_repl_dev_ctx()`: No lock
+   - `get_repl_dev_ctx()`: No lock
+   - `m_cp_map_mtx` declared but never used
+
+### Why It (Mostly) Works
+
+**Assumption**: `m_trigger_cp_mtx` provides implicit protection because:
+- `on_switchover_cp` only modifies **cur_cp's** context, not new_cp's
+- `cp_flush` reads **same CP's** context that was populated in switchover
+- `cleanup_cp` only called after CP flush completes
+
+**Single-Threaded Access Pattern**:
+```
+do_trigger_cp_flush(N):
+  [Hold m_trigger_cp_mtx]
+  ├─ on_switchover_cp(N, N+1)
+  │   └─ Populate CP=N's m_cp_ctx_map
+  [Release m_trigger_cp_mtx]
+  └─ cp_start_flush(N)
+      └─ cp_flush(N)
+          └─ Read CP=N's m_cp_ctx_map
+
+on_cp_flush_done(N):
+  └─ cleanup_cp(N)
+      └─ Read CP=N's m_cp_ctx_map
+```
+
+**Why No Race**: Each CP's context is only accessed by operations on that specific CP, and CP operations are serialized.
+
+## Potential Issues
+
+### Issue 1: Unused Mutex
+**Code**: `std::shared_mutex m_cp_map_mtx` declared but never locked
+
+**Risk**: Low - current access pattern is single-threaded per CP
+
+**Recommendation**: Either remove unused mutex or add proper locking for defense-in-depth
+
+### Issue 2: Raw Pointer as Map Key
+**Code**: `std::map<ReplDev*, shared_ptr<ReplDevCPContext>>`
+
+**Risk**: Low - ReplDev destroyed long after CP completes
+
+**Benefit**: Avoids shared_ptr overhead for key comparisons
+
+### Issue 3: Returning Immediate Future
+**Code**: `return folly::makeFuture<bool>(true)` instead of `cp_ctx->get_future()`
+
+**Risk**: Low - work completes synchronously before return
+
+**Confusion**: Other consumers may expect to return the context's future
+
+## Interaction with Other Services
+
+### During Baseline Resync
+
+**Scenario**: Destroy old PG → Create new PG
+
+```
+Timeline:
+T1: destroy_pg(pg_id=1)
+    ├─ destroy_pg_superblk(1)
+    │   └─ trigger_cp_flush(force=true) → Queues CP=444
+    ├─ ReplDev for pg=1 still exists in m_rd_map
+    └─ Returns
+
+T2: CP=443 completes
+    └─ Auto-triggers CP=444
+
+T3: CP=444 switchover
+    ├─ on_switchover_cp(444, 445)
+    │   └─ iterate_repl_devs([...])  // Includes ReplDev for pg=1
+    │       └─ add_repl_dev_ctx(repl_dev.get(), ...)
+    └─ cp_start_flush(444)
+
+T4: destroy_pg(pg_id=2)
+    └─ trigger_cp_flush(force=true) → Queues CP=445
+
+T5: CP=444 completes (all rdevs skip flush)
+    └─ Auto-triggers CP=445
+
+T6: CP=445 runs, completes, destructs
+    └─ CRASH during m_cp_ctx_map destruction
+```
+
+**Key**: ReplDevs persist across PG destroy, remain in CP context.
+
+## Related Configuration
+
+- `generic.repl_dev_cleanup_interval_sec`: Delay before destroying ReplDev after marking for destruction
+- `consensus.flush_durable_commit_interval_ms`: Interval for flushing durable commit LSN
+- `consensus.replace_member_sync_check_interval_ms`: Interval for monitoring replace_member status
+
+## Common Misconceptions
+
+1. **"ReplDev is destroyed when PG is destroyed"**: FALSE - ReplDev destroyed later by GC thread
+2. **"m_cp_map_mtx protects the map"**: FALSE - mutex is never locked
+3. **"Returning wrong future causes bug"**: UNCLEAR - work completes synchronously
+4. **"Raw pointers cause dangling"**: FALSE - ReplDev lifetime exceeds CP lifetime
+
+## Debug Tips
+
+### Check m_cp_ctx_map State
+```gdb
+# Get CP context
+set $cp = (homestore::CP*)0x...
+set $ctx = (homestore::ReplSvcCPContext*)$cp->m_contexts[3].get()
+
+# Examine map
+p $ctx->m_cp_ctx_map
+p $ctx->m_cp_ctx_map._M_t._M_impl._M_node_count
+
+# Check red-black tree integrity
+set $root = $ctx->m_cp_ctx_map._M_t._M_impl._M_header._M_parent
+x/32gx $root
+```
+
+### Check ReplDev Validity
+```gdb
+# Check if ReplDev still in map
+set $rdev_ptr = (homestore::ReplDev*)0x...
+p homestore::repl_service().get_repl_dev(...)
+
+# Verify refcount
+p $rdev_ptr.use_count()
+```
+
+### Monitor CP Lifecycle
+Add breakpoints:
+- `RaftReplServiceCPHandler::on_switchover_cp`
+- `RaftReplServiceCPHandler::cp_flush`
+- `RaftReplServiceCPHandler::cp_cleanup`
+- `ReplSvcCPContext::~ReplSvcCPContext`