From f922a27fec0e6bd447bccfa32f00e3df59b88835 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 9 Mar 2026 14:56:29 +0800 Subject: [PATCH 1/4] [scrubber] phase1: add scrub manager Add comprehensive scrub infrastructure to detect data corruption and inconsistencies across replicas in HomeObject. This is phase 1 of the scrubber implementation. - Implements deep and shallow scrubbing for PG metadata, shards, and blobs - Supports periodic and manual scrub triggering modes - Uses priority queue (MPMCPriorityQueue) for scrub task scheduling - Persists scrub metadata using superblocks to track last scrub times - Coordinates scrub operations across all replicas in a PG 1. **Deep Scrub**: Full data integrity verification - PG metadata validation - Shard existence and consistency checks - Blob hash verification (reads data and computes checksums) - Detects corrupted, missing, and inconsistent data across replicas 2. **Shallow Scrub**: Lightweight metadata-only verification - Shard existence checks - Blob index validation (no data reads) - Faster execution for routine checks - FlatBuffer-based serialization for scrub requests and responses - Leader sends scrub requests to all replicas - Followers return scrub maps with their local state - Retry logic with configurable timeouts for reliability - **ShallowScrubReport**: Tracks missing shards and blobs per peer - **DeepScrubReport**: Extends shallow report with: - Corrupted blobs/shards with error details - Inconsistent blobs (different hashes across replicas) - Corrupted PG metadata - Scrubs data in configurable ranges to avoid timeouts - Shard range: 2M shards per request - Blob range: Based on HDD IOPS for deep scrub, 2M for shallow - Early cancellation support for graceful shutdown 1. **DeepScrubTest**: Verifies detection of: - Missing blobs on followers - Missing shards on followers - Corrupted blob data (IO errors) - Inconsistent blob hashes across replicas 2. 
**MPMCPriorityQueue Tests**: Thread-safe (mutex-based) queue validation - Concurrent push/pop operations - Priority ordering verification - Thread safety under contention --- CHANGELOG.md | 13 - conanfile.py | 2 +- src/include/homeobject/common.hpp | 2 +- src/lib/homeobject_impl.hpp | 2 + src/lib/homestore_backend/CMakeLists.txt | 27 +- .../homestore_backend/MPMCPriorityQueue.hpp | 188 ++ src/lib/homestore_backend/gc_manager.cpp | 16 +- src/lib/homestore_backend/gc_manager.hpp | 12 +- src/lib/homestore_backend/hs_homeobject.cpp | 30 +- src/lib/homestore_backend/hs_homeobject.hpp | 43 +- .../hs_homeobject_fbs/blob_scrub_req.fbs | 13 + .../hs_homeobject_fbs/deep_blob_scrub_map.fbs | 31 + .../deep_shard_scrub_map.fbs | 16 + .../hs_backend_config.fbs | 4 + .../hs_homeobject_fbs/meta_scrub_req.fbs | 13 + .../hs_homeobject_fbs/pg_meta_scrub_map.fbs | 11 + .../hs_homeobject_fbs/pg_meta_scrub_req.fbs | 10 + .../resync_blob_data.fbs | 0 .../resync_pg_data.fbs | 0 .../resync_shard_data.fbs | 0 .../hs_homeobject_fbs/scrub_common.fbs | 23 + .../shallow_blob_scrub_map.fbs | 13 + .../shallow_shard_scrub_map.fbs | 13 + .../hs_homeobject_fbs/shard_scrub_req.fbs | 13 + src/lib/homestore_backend/hs_pg_manager.cpp | 283 ++- .../homestore_backend/hs_shard_manager.cpp | 26 +- .../replication_state_machine.cpp | 37 +- .../replication_state_machine.hpp | 8 + src/lib/homestore_backend/scrub_manager.cpp | 1878 +++++++++++++++++ src/lib/homestore_backend/scrub_manager.hpp | 471 +++++ .../homestore_backend/tests/CMakeLists.txt | 9 + .../tests/hs_scrubber_tests.cpp | 569 +++++ .../tests/test_mpmc_priority_queue.cpp | 417 ++++ 33 files changed, 4111 insertions(+), 82 deletions(-) delete mode 100644 CHANGELOG.md create mode 100644 src/lib/homestore_backend/MPMCPriorityQueue.hpp create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/blob_scrub_req.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/deep_blob_scrub_map.fbs create mode 100644
src/lib/homestore_backend/hs_homeobject_fbs/deep_shard_scrub_map.fbs rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/hs_backend_config.fbs (93%) create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/meta_scrub_req.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_map.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_req.fbs rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/resync_blob_data.fbs (100%) rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/resync_pg_data.fbs (100%) rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/resync_shard_data.fbs (100%) create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/shallow_blob_scrub_map.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/shallow_shard_scrub_map.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/shard_scrub_req.fbs create mode 100644 src/lib/homestore_backend/scrub_manager.cpp create mode 100644 src/lib/homestore_backend/scrub_manager.hpp create mode 100644 src/lib/homestore_backend/tests/hs_scrubber_tests.cpp create mode 100644 src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 51f00cd2e..000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,13 +0,0 @@ -# Changelog -All notable changes to this project will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
- -## [Unreleased] - -### Added - -- Created repository - -[Unreleased]: https://github.com/eBay/HomeObject/compare/...HEAD diff --git a/conanfile.py b/conanfile.py index 565d84535..73fd2f159 100644 --- a/conanfile.py +++ b/conanfile.py @@ -10,7 +10,7 @@ class HomeObjectConan(ConanFile): name = "homeobject" - version = "4.1.5" + version = "4.2.0" homepage = "https://github.com/eBay/HomeObject" description = "Blob Store built on HomeStore" diff --git a/src/include/homeobject/common.hpp b/src/include/homeobject/common.hpp index 29a0589a9..63eff1305 100644 --- a/src/include/homeobject/common.hpp +++ b/src/include/homeobject/common.hpp @@ -14,7 +14,7 @@ SISL_LOGGING_DECL(homeobject); -#define HOMEOBJECT_LOG_MODS homeobject, blobmgr, shardmgr, gcmgr +#define HOMEOBJECT_LOG_MODS homeobject, blobmgr, shardmgr, gcmgr, scrubmgr #ifndef Ki constexpr uint64_t Ki = 1024ul; diff --git a/src/lib/homeobject_impl.hpp b/src/lib/homeobject_impl.hpp index 4eb2af48f..4a0841fdd 100644 --- a/src/lib/homeobject_impl.hpp +++ b/src/lib/homeobject_impl.hpp @@ -70,6 +70,8 @@ struct PG { std::atomic< bool > is_dirty_{false}; ShardPtrList shards_; + blob_id_t get_last_blob_id() const { return durable_entities_.blob_sequence_num; } + void durable_entities_update(auto&& cb, bool dirty = true) { cb(durable_entities_); if (dirty) { is_dirty_.store(true, std::memory_order_relaxed); } diff --git a/src/lib/homestore_backend/CMakeLists.txt b/src/lib/homestore_backend/CMakeLists.txt index 441492a6b..f71daae72 100644 --- a/src/lib/homestore_backend/CMakeLists.txt +++ b/src/lib/homestore_backend/CMakeLists.txt @@ -30,6 +30,8 @@ target_sources("${PROJECT_NAME}_homestore" PRIVATE hs_cp_callbacks.cpp hs_http_manager.cpp gc_manager.cpp + scrub_manager.cpp + MPMCPriorityQueue.hpp $ ) target_link_libraries("${PROJECT_NAME}_homestore" PUBLIC @@ -42,10 +44,19 @@ settings_gen_cpp( ${FLATBUFFERS_FLATC_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/generated/ "${PROJECT_NAME}_homestore" - hs_backend_config.fbs - 
resync_pg_data.fbs - resync_shard_data.fbs - resync_blob_data.fbs + hs_homeobject_fbs/hs_backend_config.fbs + hs_homeobject_fbs/resync_pg_data.fbs + hs_homeobject_fbs/resync_shard_data.fbs + hs_homeobject_fbs/resync_blob_data.fbs + hs_homeobject_fbs/deep_blob_scrub_map.fbs + hs_homeobject_fbs/shallow_blob_scrub_map.fbs + hs_homeobject_fbs/blob_scrub_req.fbs + hs_homeobject_fbs/shard_scrub_req.fbs + hs_homeobject_fbs/deep_shard_scrub_map.fbs + hs_homeobject_fbs/shallow_shard_scrub_map.fbs + hs_homeobject_fbs/pg_meta_scrub_req.fbs + hs_homeobject_fbs/pg_meta_scrub_map.fbs + hs_homeobject_fbs/scrub_common.fbs ) # Unit test objects @@ -155,3 +166,11 @@ add_test(NAME HomestoreTestGC COMMAND homestore_test_gc -csv error --executor im --override_config hs_backend_config.gc_garbage_rate_threshold=0 --override_config hs_backend_config.gc_scan_interval_sec=5) +add_executable(homestore_test_scrubber) +target_sources(homestore_test_scrubber PRIVATE $) +target_link_libraries(homestore_test_scrubber PUBLIC homeobject_homestore ${COMMON_TEST_DEPS}) +add_test(NAME HomestoreTestScrubber COMMAND homestore_test_scrubber -csv error --executor immediate --config_path ./ + --override_config hs_backend_config.enable_scrubber=true + --override_config nuraft_mesg_config.mesg_factory_config.data_request_deadline_secs:10) + + diff --git a/src/lib/homestore_backend/MPMCPriorityQueue.hpp b/src/lib/homestore_backend/MPMCPriorityQueue.hpp new file mode 100644 index 000000000..585e2925e --- /dev/null +++ b/src/lib/homestore_backend/MPMCPriorityQueue.hpp @@ -0,0 +1,188 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace homeobject { + +/** + * @brief Multi-Producer Multi-Consumer Priority Queue (C++20) + * + * Thread-safe priority queue that supports: + * - Concurrent push operations from multiple producers + * - Concurrent pop operations from multiple consumers + * - Blocking pop when queue is empty + * - Graceful shutdown via 
close() method + * + * @tparam T Element type (must be comparable) + * @tparam Compare Comparison function (default: std::less for max-heap) + */ +template < typename T, typename Compare = std::less< T > > + requires std::regular< T > && std::predicate< Compare, T, T > +class MPMCPriorityQueue { +public: + using value_type = T; + using size_type = std::size_t; + using comparator_type = Compare; + + /** + * @brief Status codes returned by pop operations + */ + enum class Status : uint8_t { + Ok, ///< Successfully popped an element + Closed ///< Queue is closed, no more elements available + }; + + /** + * @brief Result of a pop operation + */ + struct PopResult { + Status status; + std::optional< T > value; ///< Has value only if status == Ok + + // Convenience methods + [[nodiscard]] constexpr bool is_ok() const noexcept { return status == Status::Ok; } + [[nodiscard]] constexpr bool is_closed() const noexcept { return status == Status::Closed; } + }; + + /** + * @brief Construct an empty priority queue + */ + constexpr MPMCPriorityQueue() noexcept(std::is_nothrow_default_constructible_v< Compare >) = default; + + /** + * @brief Destructor - automatically closes the queue + */ + ~MPMCPriorityQueue() { close(); } + + // Disable copy and move to prevent issues with condition variables + MPMCPriorityQueue(const MPMCPriorityQueue&) = delete; + MPMCPriorityQueue& operator=(const MPMCPriorityQueue&) = delete; + MPMCPriorityQueue(MPMCPriorityQueue&&) = delete; + MPMCPriorityQueue& operator=(MPMCPriorityQueue&&) = delete; + + /** + * @brief Thread-safe push operation (copy) + * + * @param value Element to insert + * @note No-op if queue is closed + */ + void push(const T& value) { + { + std::scoped_lock lock(mutex_); + if (closed_) [[unlikely]] { + return; // Silently ignore pushes to closed queue + } + pq_.push(value); + } + cv_.notify_one(); // Wake one waiting consumer + } + + /** + * @brief Thread-safe push operation (move) + * + * @param value Element to insert (will 
be moved) + * @note No-op if queue is closed + */ + void push(T&& value) { + { + std::scoped_lock lock(mutex_); + if (closed_) [[unlikely]] { return; } + pq_.push(std::move(value)); + } + cv_.notify_one(); + } + + /** + * @brief Thread-safe pop operation + * + * Blocks if queue is empty and not closed. + * Returns immediately if queue is closed. + * + * @return PopResult containing status and optional value + * @note Thread-safe for multiple concurrent consumers + */ + [[nodiscard]] PopResult pop() { + std::unique_lock lock(mutex_); + + // Wait until queue has elements or is closed + cv_.wait(lock, [this] { return closed_ || !pq_.empty(); }); + + // Try to pop an element + if (!pq_.empty()) { + T top = std::move(const_cast< T& >(pq_.top())); + pq_.pop(); + return PopResult{.status = Status::Ok, .value = std::move(top)}; + } + + // Queue is empty and closed + return PopResult{.status = Status::Closed, .value = std::nullopt}; + } + + /** + * @brief Close the queue + * + * After calling close(): + * - All blocked pop() calls will wake up + * - Existing elements can still be popped + * - New push() calls will be ignored + * - pop() returns Status::Closed when queue becomes empty + * + * @note Thread-safe and idempotent + */ + void close() noexcept { + { + std::scoped_lock lock(mutex_); + closed_ = true; + } + cv_.notify_all(); // Wake all waiting consumers + } + + /** + * @brief Get current number of elements + * + * @return Number of elements in the queue + * @note Thread-safe + */ + [[nodiscard]] size_type size() const { + std::scoped_lock lock(mutex_); + return pq_.size(); + } + + /** + * @brief Check if queue is empty + * + * @return true if queue has no elements + * @note Thread-safe + */ + [[nodiscard]] bool empty() const { + std::scoped_lock lock(mutex_); + return pq_.empty(); + } + + /** + * @brief Check if queue is closed + * + * @return true if close() has been called + * @note Thread-safe + */ + [[nodiscard]] bool is_closed() const { + std::scoped_lock 
lock(mutex_); + return closed_; + } + +private: + mutable std::mutex mutex_; + std::condition_variable cv_; + bool closed_{false}; + std::priority_queue< T, std::vector< T >, Compare > pq_; +}; + +} // namespace homeobject diff --git a/src/lib/homestore_backend/gc_manager.cpp b/src/lib/homestore_backend/gc_manager.cpp index 8076d92f3..83fcaf1df 100644 --- a/src/lib/homestore_backend/gc_manager.cpp +++ b/src/lib/homestore_backend/gc_manager.cpp @@ -25,14 +25,14 @@ SISL_LOGGING_DECL(gcmgr) GCManager::GCManager(HSHomeObject* homeobject) : m_chunk_selector{homeobject->chunk_selector()}, m_hs_home_object{homeobject} { homestore::meta_service().register_handler( - _gc_actor_meta_name, + gc_actor_meta_name, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { on_gc_actor_meta_blk_found(std::move(buf), voidptr_cast(mblk)); }, nullptr, true); homestore::meta_service().register_handler( - _gc_reserved_chunk_meta_name, + gc_reserved_chunk_meta_name, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { on_reserved_chunk_meta_blk_found(std::move(buf), voidptr_cast(mblk)); }, @@ -44,7 +44,7 @@ GCManager::GCManager(HSHomeObject* homeobject) : true); homestore::meta_service().register_handler( - _gc_task_meta_name, + gc_task_meta_name, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { on_gc_task_meta_blk_found(std::move(buf), voidptr_cast(mblk)); }, @@ -64,7 +64,7 @@ void GCManager::on_gc_task_meta_blk_found(sisl::byte_view const& buf, void* meta // here, we are under the protection of the lock of metaservice. however, we will also try to update pg and shard // metablk and then destroy the gc_task_sb, which will also try to acquire the lock of metaservice, as a result, a // dead lock will happen. 
so here we will handle all the gc tasks after read all the metablks - m_recovered_gc_tasks.emplace_back(_gc_task_meta_name); + m_recovered_gc_tasks.emplace_back(gc_task_meta_name); m_recovered_gc_tasks.back().load(buf, meta_cookie); } @@ -89,7 +89,7 @@ void GCManager::handle_all_recovered_gc_tasks() { } void GCManager::on_gc_actor_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - m_gc_actor_sbs.emplace_back(_gc_actor_meta_name); + m_gc_actor_sbs.emplace_back(gc_actor_meta_name); auto& gc_actor_sb = m_gc_actor_sbs.back(); gc_actor_sb.load(buf, meta_cookie); auto pdev_id = gc_actor_sb->pdev_id; @@ -100,7 +100,7 @@ void GCManager::on_gc_actor_meta_blk_found(sisl::byte_view const& buf, void* met } void GCManager::on_reserved_chunk_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - homestore::superblk< gc_reserved_chunk_superblk > reserved_chunk_sb(_gc_reserved_chunk_meta_name); + homestore::superblk< gc_reserved_chunk_superblk > reserved_chunk_sb(gc_reserved_chunk_meta_name); auto chunk_id = reserved_chunk_sb.load(buf, meta_cookie)->chunk_id; auto EXVchunk = m_chunk_selector->get_extend_vchunk(chunk_id); if (EXVchunk == nullptr) { @@ -976,7 +976,7 @@ bool GCManager::pdev_gc_actor::copy_valid_data( if (err) { // we will come here if: - // 1 any blob copy fails, then err is operation_canceled + // 1 any blob copy fails, then err is operation_cancelled // 2 write footer fails, then err is the error code of write footer GCLOGE(task_id, pg_id, shard_id, "Failed to copy some blos or failed to write shard footer for move_to_chunk={}, " @@ -1271,7 +1271,7 @@ void GCManager::pdev_gc_actor::process_gc_task(chunk_id_t move_from_chunk, uint8 // after data copy, we persist the gc task meta blk. now, we can make sure all the valid blobs are successfully // copyed and new blob indexes have be written to gc index table before gc task superblk is persisted. 
- homestore::superblk< GCManager::gc_task_superblk > gc_task_sb{GCManager::_gc_task_meta_name}; + homestore::superblk< GCManager::gc_task_superblk > gc_task_sb{GCManager::gc_task_meta_name}; gc_task_sb.create(sizeof(GCManager::gc_task_superblk)); gc_task_sb->move_from_chunk = move_from_chunk; gc_task_sb->move_to_chunk = move_to_chunk; diff --git a/src/lib/homestore_backend/gc_manager.hpp b/src/lib/homestore_backend/gc_manager.hpp index 7fd2a46be..6a0415023 100644 --- a/src/lib/homestore_backend/gc_manager.hpp +++ b/src/lib/homestore_backend/gc_manager.hpp @@ -46,9 +46,9 @@ class GCManager { GCManager& operator=(GCManager&&) = delete; public: - inline static auto const _gc_actor_meta_name = std::string("GCActor"); - inline static auto const _gc_task_meta_name = std::string("GCTask"); - inline static auto const _gc_reserved_chunk_meta_name = std::string("GCReservedChunk"); + inline static auto const gc_actor_meta_name = std::string("GCActor"); + inline static auto const gc_task_meta_name = std::string("GCTask"); + inline static auto const gc_reserved_chunk_meta_name = std::string("GCReservedChunk"); inline static atomic_uint64_t _gc_task_id{1}; // 0 is used for crash recovery #pragma pack(1) @@ -61,7 +61,7 @@ class GCManager { uint64_t failed_egc_task_count{0ull}; uint64_t total_reclaimed_blk_count_by_gc{0ull}; uint64_t total_reclaimed_blk_count_by_egc{0ull}; - static std::string name() { return _gc_actor_meta_name; } + static std::string name() { return gc_actor_meta_name; } }; struct gc_task_superblk { @@ -70,12 +70,12 @@ class GCManager { chunk_id_t vchunk_id; pg_id_t pg_id; uint8_t priority; - static std::string name() { return _gc_task_meta_name; } + static std::string name() { return gc_task_meta_name; } }; struct gc_reserved_chunk_superblk { chunk_id_t chunk_id; - static std::string name() { return _gc_reserved_chunk_meta_name; } + static std::string name() { return gc_reserved_chunk_meta_name; } }; #pragma pack() diff --git 
a/src/lib/homestore_backend/hs_homeobject.cpp b/src/lib/homestore_backend/hs_homeobject.cpp index ef84a4c27..b030815cd 100644 --- a/src/lib/homestore_backend/hs_homeobject.cpp +++ b/src/lib/homestore_backend/hs_homeobject.cpp @@ -259,6 +259,14 @@ void HSHomeObject::init_homestore() { } else { LOGI("GC is disabled"); } + + // start scrubber + if (HS_BACKEND_DYNAMIC_CONFIG(enable_scrubber)) { + LOGI("Starting scrub manager"); + scrub_mgr_->start(); + } else { + LOGI("scrub manager is disabled"); + } } void HSHomeObject::on_replica_restart() { @@ -309,7 +317,6 @@ void HSHomeObject::on_replica_restart() { // gc_manager will be created only once here. we need make sure gc manager is created after all the pg meta blk // are replayed since we build pdev chunk heap in the constructor of gc manager , which depends on the pg meta. - // gc metablk handlers are registered in the constructor of gc manager gc_mgr_ = std::make_shared< GCManager >(this); @@ -326,7 +333,7 @@ void HSHomeObject::on_replica_restart() { gc_index_table_map.emplace(boost::uuids::to_string(uuid), gc_index_table); // 2 create gc actor superblk for each pdev, which contains the pdev_id and index table uuid. 
- homestore::superblk< GCManager::gc_actor_superblk > gc_actor_sb{GCManager::_gc_actor_meta_name}; + homestore::superblk< GCManager::gc_actor_superblk > gc_actor_sb{GCManager::gc_actor_meta_name}; gc_actor_sb.create(sizeof(GCManager::gc_actor_superblk)); gc_actor_sb->pdev_id = pdev_id; gc_actor_sb->index_table_uuid = uuid; @@ -340,7 +347,7 @@ void HSHomeObject::on_replica_restart() { for (size_t i = 0; i < reserved_chunk_num_per_pdev; ++i) { auto chunk = chunks[i]; homestore::superblk< GCManager::gc_reserved_chunk_superblk > reserved_chunk_sb{ - GCManager::_gc_reserved_chunk_meta_name}; + GCManager::gc_reserved_chunk_meta_name}; reserved_chunk_sb.create(sizeof(GCManager::gc_reserved_chunk_superblk)); reserved_chunk_sb->chunk_id = chunk; reserved_chunk_sb.write(); @@ -356,9 +363,9 @@ void HSHomeObject::on_replica_restart() { // when initializing, there is not gc task. we need to recover reserved chunks here, so that the reserved chunks // will not be put into pdev heap when built - homestore::meta_service().read_sub_sb(GCManager::_gc_actor_meta_name); - homestore::meta_service().read_sub_sb(GCManager::_gc_reserved_chunk_meta_name); - homestore::meta_service().read_sub_sb(GCManager::_gc_task_meta_name); + homestore::meta_service().read_sub_sb(GCManager::gc_actor_meta_name); + homestore::meta_service().read_sub_sb(GCManager::gc_reserved_chunk_meta_name); + homestore::meta_service().read_sub_sb(GCManager::gc_task_meta_name); // At this point, log replay has not started yet. We must process all recovered GC tasks before replay begins. 
// After log replay completes, ReplicationStateMachine::on_log_replay_done() calls select_specific_chunk() for @@ -377,6 +384,9 @@ void HSHomeObject::on_replica_restart() { gc_mgr_->handle_all_recovered_gc_tasks(); }); + + // initialize scrub manager + scrub_mgr_ = std::make_shared< ScrubManager >(this); } #if 0 @@ -446,16 +456,20 @@ void HSHomeObject::shutdown() { LOGI("waiting for {} pending requests to complete", pending_reqs); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); }; - LOGI("start stopping GC"); + LOGI("stopping GC"); // we need stop gc before shutting down homestore(where metaservice is shutdown), because gc mgr needs metaservice // to persist gc task metablk if there is any ongoing gc task. after stopping gc manager, there is no gc task // anymore, and thus now new gc task will be written to metaservice during homestore shutdown. - gc_mgr_->stop(); + if (gc_mgr_) gc_mgr_->stop(); + + LOGI("stopping scrubbing"); + if (scrub_mgr_) scrub_mgr_->stop(); LOGI("start shutting down HomeStore"); homestore::HomeStore::instance()->shutdown(); homestore::HomeStore::reset_instance(); gc_mgr_.reset(); + scrub_mgr_.reset(); iomanager.stop(); LOGI("complete shutting down HomeStore"); } diff --git a/src/lib/homestore_backend/hs_homeobject.hpp b/src/lib/homestore_backend/hs_homeobject.hpp index d4a1d25f4..d5a11c78d 100644 --- a/src/lib/homestore_backend/hs_homeobject.hpp +++ b/src/lib/homestore_backend/hs_homeobject.hpp @@ -14,10 +14,27 @@ #include "homeobject/common.hpp" #include "index_kv.hpp" #include "gc_manager.hpp" +#include "scrub_manager.hpp" #include "hs_backend_config.hpp" #include "generated/resync_pg_data_generated.h" #include "generated/resync_shard_data_generated.h" #include "generated/resync_blob_data_generated.h" +#include "generated/blob_scrub_req_generated.h" +#include "generated/deep_blob_scrub_map_generated.h" +#include "generated/shallow_blob_scrub_map_generated.h" +#include "generated/shard_scrub_req_generated.h" +#include 
"generated/shallow_shard_scrub_map_generated.h" +#include "generated/deep_shard_scrub_map_generated.h" +#include "generated/pg_meta_scrub_req_generated.h" +#include "generated/pg_meta_scrub_map_generated.h" +#include "generated/scrub_common_generated.h" + +#define SCRUB_RESULT_STRING(type_) \ + ((type_) == ScrubResult::NONE ? "NONE" \ + : (type_) == ScrubResult::IO_ERROR ? "IO_ERROR" \ + : (type_) == ScrubResult::MISMATCH ? "MISMATCH" \ + : (type_) == ScrubResult::NOT_FOUND ? "NOT_FOUND" \ + : "UNKNOWN") namespace homestore { struct meta_blk; @@ -364,6 +381,7 @@ class HSHomeObject : public HomeObjectImpl { shared< homestore::ReplDev > repl_dev_; std::shared_ptr< BlobIndexTable > index_table_; PGMetrics metrics_; + HSHomeObject& home_obj_; mutable pg_state pg_state_{0}; // Snapshot receiver progress info, used as a checkpoint for recovery @@ -372,8 +390,8 @@ class HSHomeObject : public HomeObjectImpl { mutable homestore::superblk< snapshot_rcvr_shard_list_superblk > snp_rcvr_shard_list_sb_; HS_PG(PGInfo info, shared< homestore::ReplDev > rdev, shared< BlobIndexTable > index_table, - std::shared_ptr< const std::vector< homestore::chunk_num_t > > pg_chunk_ids); - HS_PG(homestore::superblk< pg_info_superblk >&& sb, shared< homestore::ReplDev > rdev); + std::shared_ptr< const std::vector< homestore::chunk_num_t > > pg_chunk_ids, HSHomeObject& home_obj); + HS_PG(homestore::superblk< pg_info_superblk >&& sb, shared< homestore::ReplDev > rdev, HSHomeObject& home_obj); ~HS_PG() override = default; static PGInfo pg_info_from_sb(homestore::superblk< pg_info_superblk > const& sb); @@ -416,6 +434,19 @@ class HSHomeObject : public HomeObjectImpl { * Update membership in pg's superblock. */ void update_membership(const MemberSet& members); + + /* + * RPC handlers for scrub: + * 1. on_scrub_req_received: receive the scrub req from leader + * 2. 
on_scrub_map_received: receive the scrub map from followers + */ + void on_scrub_req_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data); + void on_scrub_map_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data); + + /** + * Register data RPC handlers for this PG + */ + void register_data_rpc_handlers(); }; struct HS_Shard : public Shard { @@ -537,6 +568,11 @@ class HSHomeObject : public HomeObjectImpl { inline const static homestore::MultiBlkId tombstone_pbas{0, 0, 0}; inline const static std::string delete_marker_blob_data{"HOMEOBJECT_BLOB_DELETE_MARKER"}; + // ask followers to scrub + inline const static std::string PUSH_SCRUB_REQ{"scrub_req"}; + // return scrub map to leader + inline const static std::string PUSH_SCRUB_MAP{"push_scrub_map"}; + class PGBlobIterator { public: struct blob_read_result { @@ -732,6 +768,7 @@ class HSHomeObject : public HomeObjectImpl { mutable std::shared_mutex snp_sbs_lock_; shared< HeapChunkSelector > chunk_selector_; shared< GCManager > gc_mgr_; + shared< ScrubManager > scrub_mgr_; unique< HttpManager > http_mgr_; static constexpr size_t max_zpad_bufs = _data_block_size / io_align; @@ -767,6 +804,7 @@ class HSHomeObject : public HomeObjectImpl { void local_create_shard(ShardInfo shard_info, homestore::chunk_num_t v_chunk_id, homestore::chunk_num_t p_chunk_id, homestore::blk_count_t blk_count, trace_id_t tid = 0); void add_new_shard_to_map(std::unique_ptr< HS_Shard > shard); + void delete_shard_from_map(shard_id_t shard_id); void update_shard_in_map(const ShardInfo& shard_info); // recover part @@ -986,6 +1024,7 @@ class HSHomeObject : public HomeObjectImpl { cshared< HeapChunkSelector > chunk_selector() const { return chunk_selector_; } cshared< GCManager > gc_manager() const { return gc_mgr_; } + cshared< ScrubManager > scrub_manager() const { return scrub_mgr_; } /** * @brief Reconciles the leaders for all PGs or a specific PG identified by pg_id. 
diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/blob_scrub_req.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/blob_scrub_req.fbs new file mode 100644 index 000000000..d276c3587 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/blob_scrub_req.fbs @@ -0,0 +1,13 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table BlobScrubReq { + scrub_info: ScrubInfo; + start: uint64; + end: uint64; + isdeepscrub: bool; +} + +// BlobScrubReq is used for requesting blob range scrub +root_type BlobScrubReq; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/deep_blob_scrub_map.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/deep_blob_scrub_map.fbs new file mode 100644 index 000000000..aaffe25a8 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/deep_blob_scrub_map.fbs @@ -0,0 +1,31 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table HashValue { + hash:[ubyte]; +} + +table ScrubResultValue { + result:ScrubResult; +} + +union ScrubValue { + HashValue, + ScrubResultValue +} + +table DeepBlobScrubResultEntry { + blob_key: BlobKey; + scrub_result: ScrubValue; +} + +table DeepBlobScrubMap { + scrub_info: ScrubInfo; + start: uint64; + end: uint64; + deep_blob_scrub_results: [DeepBlobScrubResultEntry]; +} + +// this is used for sending deep scrub result +root_type DeepBlobScrubMap; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/deep_shard_scrub_map.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/deep_shard_scrub_map.fbs new file mode 100644 index 000000000..5d2b42487 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/deep_shard_scrub_map.fbs @@ -0,0 +1,16 @@ +include "shallow_shard_scrub_map.fbs"; + +namespace homeobject; + +table DeepShardScrubResultEntry { + shard_id: uint64; + result: ScrubResult; +} + +table DeepShardScrubMap { + shallow_map: ShallowShardScrubMap; + problematic_shards: [DeepShardScrubResultEntry]; +} + +// this is used 
for sending deep shard scrub map +root_type DeepShardScrubMap; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_backend_config.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/hs_backend_config.fbs similarity index 93% rename from src/lib/homestore_backend/hs_backend_config.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/hs_backend_config.fbs index bd6991db9..983d19208 100644 --- a/src/lib/homestore_backend/hs_backend_config.fbs +++ b/src/lib/homestore_backend/hs_homeobject_fbs/hs_backend_config.fbs @@ -23,6 +23,10 @@ table HSBackendSettings { //TODO: make this hotswap after gc is well tested enable_gc: bool = true; + //Enable scrubber + //TODO: make this hotswap after scrubber is well tested + enable_scrubber: bool = false; + //Total reserved chunk num (dedicated for gc/egc) per pdev reserved_chunk_num_per_pdev: uint8 = 6; diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/meta_scrub_req.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/meta_scrub_req.fbs new file mode 100644 index 000000000..5892b3e7e --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/meta_scrub_req.fbs @@ -0,0 +1,13 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table MetaScrubReq { + scrub_type: ScrubType; + issuer_uuid : [ubyte]; + scrub_lsn: int64; + scrub_task_id: int64; +} + +// MetaScrubReq is used for requesting metadata scrub (PG/shard metadata) +// root_type MetaScrubReq; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_map.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_map.fbs new file mode 100644 index 000000000..d372e0fde --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_map.fbs @@ -0,0 +1,11 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table PGMetaScrubMap { + scrub_info: ScrubInfo; + pg_meta_scrub_result: ScrubResult; +} + +// this is used for sending pg meta scrub result +root_type PGMetaScrubMap; diff --git 
a/src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_req.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_req.fbs new file mode 100644 index 000000000..db0cc0400 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/pg_meta_scrub_req.fbs @@ -0,0 +1,10 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table PgMetaScrubReq { + scrub_info: ScrubInfo; +} + +// PgMetaScrubReq is used for requesting pg meta scrub map +root_type PgMetaScrubReq; \ No newline at end of file diff --git a/src/lib/homestore_backend/resync_blob_data.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/resync_blob_data.fbs similarity index 100% rename from src/lib/homestore_backend/resync_blob_data.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/resync_blob_data.fbs diff --git a/src/lib/homestore_backend/resync_pg_data.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/resync_pg_data.fbs similarity index 100% rename from src/lib/homestore_backend/resync_pg_data.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/resync_pg_data.fbs diff --git a/src/lib/homestore_backend/resync_shard_data.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/resync_shard_data.fbs similarity index 100% rename from src/lib/homestore_backend/resync_shard_data.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/resync_shard_data.fbs diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs new file mode 100644 index 000000000..24800ada5 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs @@ -0,0 +1,23 @@ +native_include "sisl/utility/non_null_ptr.hpp"; + +namespace homeobject; + +enum ScrubResult : uint8 { + NONE = 0, + IO_ERROR = 1, + MISMATCH = 2, + NOT_FOUND = 3 +} + +table BlobKey { + shard_id: uint64; + blob_id: uint64; +} + +table ScrubInfo { + pg_id: uint16; + task_id: uint64; + req_id: uint64; + scrub_lsn: int64; + issuer_uuid : [ubyte]; +} \ No 
newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/shallow_blob_scrub_map.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/shallow_blob_scrub_map.fbs new file mode 100644 index 000000000..c3bb19b40 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/shallow_blob_scrub_map.fbs @@ -0,0 +1,13 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table ShallowBlobScrubMap { + scrub_info: ScrubInfo; + start: uint64; + end: uint64; + blobs: [BlobKey]; +} + +// this is used for sending shallow blob scrub map +root_type ShallowBlobScrubMap; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/shallow_shard_scrub_map.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/shallow_shard_scrub_map.fbs new file mode 100644 index 000000000..f591f7565 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/shallow_shard_scrub_map.fbs @@ -0,0 +1,13 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table ShallowShardScrubMap { + scrub_info: ScrubInfo; + start: uint64; + end: uint64; + shards: [uint64]; +} + +// this is used for sending shallow shard scrub map +root_type ShallowShardScrubMap; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/shard_scrub_req.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/shard_scrub_req.fbs new file mode 100644 index 000000000..981807eb0 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/shard_scrub_req.fbs @@ -0,0 +1,13 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table ShardScrubReq { + scrub_info: ScrubInfo; + start: uint64; + end: uint64; + isdeepscrub: bool; +} + +// ShardScrubReq is used for requesting shard range scrub +root_type ShardScrubReq; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_pg_manager.cpp b/src/lib/homestore_backend/hs_pg_manager.cpp index e12cbf672..cb6ea952b 100644 --- a/src/lib/homestore_backend/hs_pg_manager.cpp +++ 
b/src/lib/homestore_backend/hs_pg_manager.cpp @@ -223,7 +223,7 @@ folly::Expected< HSHomeObject::HS_PG*, PGError > HSHomeObject::local_create_pg(s auto uuid_str = boost::uuids::to_string(index_table->uuid()); repl_dev->set_custom_rdev_name(fmt::format("rdev{}", pg_info.id)); - auto hs_pg = std::make_unique< HS_PG >(std::move(pg_info), std::move(repl_dev), index_table, chunk_ids); + auto hs_pg = std::make_unique< HS_PG >(std::move(pg_info), std::move(repl_dev), index_table, chunk_ids, *this); auto ret = hs_pg.get(); { scoped_lock lck(index_lock_); @@ -236,6 +236,9 @@ folly::Expected< HSHomeObject::HS_PG*, PGError > HSHomeObject::local_create_pg(s // Add to index service, so that it gets cleaned up when index service is shutdown. hs()->index_service().add_index_table(index_table); add_pg_to_map(std::move(hs_pg)); + + // when local_create_pg is called by BR, the pg scrub superblk will not be overwritten if it already exists + scrub_mgr_->add_pg(ret->pg_info_.id); } return ret; } @@ -350,7 +353,6 @@ void HSHomeObject::on_pg_start_replace_member(group_id_t group_id, const std::st auto hs_pg = static_cast< HSHomeObject::HS_PG* >(pg.get()); pg->pg_info_.members.emplace(std::move(to_pg_member(member_in))); pg->pg_info_.members.emplace(std::move(to_pg_member(member_out))); - uint32_t i{0}; pg_members* sb_members = hs_pg->pg_sb_->get_pg_members_mutable(); for (auto const& m : pg->pg_info_.members) { @@ -815,7 +817,6 @@ void HSHomeObject::destroy_hs_resources(pg_id_t pg_id) { chunk_selector_->reset_ void HSHomeObject::destroy_pg_index_table(pg_id_t pg_id) { std::shared_ptr< BlobIndexTable > index_table; - { // index_table->destroy() will trigger a cp_flush, which will call homeobject#cp_flush and try to acquire // `_pg_lock`, so we need to release the lock here to avoid a dead lock @@ -935,7 +936,7 @@ void HSHomeObject::on_pg_meta_blk_found(sisl::byte_view const& buf, void* meta_c std::vector< chunk_num_t > p_chunk_ids(pg_sb->get_chunk_ids(), pg_sb->get_chunk_ids() + pg_sb->num_chunks); 
bool set_pg_chunks_res = chunk_selector_->recover_pg_chunks(pg_id, std::move(p_chunk_ids)); auto uuid_str = boost::uuids::to_string(pg_sb->index_table_uuid); - auto hs_pg = std::make_unique< HS_PG >(std::move(pg_sb), std::move(v.value())); + auto hs_pg = std::make_unique< HS_PG >(std::move(pg_sb), std::move(v.value()), *this); if (!set_pg_chunks_res) { hs_pg->pg_state_.set_state(PGStateMask::DISK_DOWN); hs_pg->repl_dev_->set_stage(homestore::repl_dev_stage_t::UNREADY); @@ -971,12 +972,13 @@ PGInfo HSHomeObject::HS_PG::pg_info_from_sb(homestore::superblk< pg_info_superbl } HSHomeObject::HS_PG::HS_PG(PGInfo info, shared< homestore::ReplDev > rdev, shared< BlobIndexTable > index_table, - std::shared_ptr< const std::vector< chunk_num_t > > pg_chunk_ids) : + std::shared_ptr< const std::vector< chunk_num_t > > pg_chunk_ids, HSHomeObject& home_obj) : PG{std::move(info)}, pg_sb_{_pg_meta_name}, repl_dev_{std::move(rdev)}, index_table_{std::move(index_table)}, metrics_{*this}, + home_obj_{home_obj}, snp_rcvr_info_sb_{_snp_rcvr_meta_name}, snp_rcvr_shard_list_sb_{_snp_rcvr_shard_list_meta_name} { RELEASE_ASSERT(pg_chunk_ids != nullptr, "PG chunks null, pg={}", pg_info_.id); @@ -1011,15 +1013,23 @@ HSHomeObject::HS_PG::HS_PG(PGInfo info, shared< homestore::ReplDev > rdev, share pg_sb_chunk_ids[i] = pg_chunk_ids->at(i); } pg_sb_.write(); + + register_data_rpc_handlers(); } -HSHomeObject::HS_PG::HS_PG(superblk< pg_info_superblk >&& sb, shared< ReplDev > rdev) : - PG{pg_info_from_sb(sb)}, pg_sb_{std::move(sb)}, repl_dev_{std::move(rdev)}, metrics_{*this} { +HSHomeObject::HS_PG::HS_PG(superblk< pg_info_superblk >&& sb, shared< ReplDev > rdev, HSHomeObject& home_obj) : + PG{pg_info_from_sb(sb)}, + pg_sb_{std::move(sb)}, + repl_dev_{std::move(rdev)}, + metrics_{*this}, + home_obj_{home_obj} { durable_entities_.blob_sequence_num = pg_sb_->blob_sequence_num; durable_entities_.active_blob_count = pg_sb_->active_blob_count; durable_entities_.tombstone_blob_count = 
pg_sb_->tombstone_blob_count; durable_entities_.total_occupied_blk_count = pg_sb_->total_occupied_blk_count; durable_entities_.total_reclaimed_blk_count = pg_sb_->total_reclaimed_blk_count; + + register_data_rpc_handlers(); } uint32_t HSHomeObject::HS_PG::total_shards() const { return shards_.size(); } @@ -1115,6 +1125,212 @@ void HSHomeObject::HS_PG::update_membership(const MemberSet& members) { LOGI("PG membership updated, member_nums={}", pg_sb_->num_dynamic_members); } +void HSHomeObject::HS_PG::register_data_rpc_handlers() { + const auto& pg_id = pg_info_.id; + bool success; + + success = repl_dev_->add_data_rpc_service(PUSH_SCRUB_REQ, bind_this(HS_PG::on_scrub_req_received, 1)); + if (success) { + LOGI("Successfully registered PUSH_SCRUB_REQ RPC handler for pg={}", pg_id); + } else { + LOGW("PUSH_SCRUB_REQ RPC handler already registered for pg={}", pg_id); + } + + success = repl_dev_->add_data_rpc_service(PUSH_SCRUB_MAP, bind_this(HS_PG::on_scrub_map_received, 1)); + if (success) { + LOGI("Successfully registered PUSH_SCRUB_MAP RPC handler for pg={}", pg_id); + } else { + LOGW("PUSH_SCRUB_MAP RPC handler already registered for pg={}", pg_id); + } +} + +void HSHomeObject::HS_PG::on_scrub_req_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data) { + const auto pg_id = pg_info_.id; + LOGD("Received scrub_blob request for pg={}", pg_id); + + struct rpc_cleanup { + boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data_; + ~rpc_cleanup() { + if (rpc_data_) { rpc_data_->send_response(); } + } + } rpc_cleanup{rpc_data}; + + auto const& incoming_buf = rpc_data->request_blob(); + const auto buf_size = incoming_buf.size(); + const auto buf_ptr = incoming_buf.cbytes(); + + if (!buf_ptr || !buf_size) { + LOGW("SCRUB_BLOB received with empty buffer for pg={}", pg_id); + return; + } + + const auto scrub_type = *reinterpret_cast< const SCRUB_TYPE* >(buf_ptr); + const auto flatbuf_ptr = buf_ptr + sizeof(SCRUB_TYPE); + const auto flatbuf_size = buf_size - 
sizeof(SCRUB_TYPE); + flatbuffers::Verifier verifier(flatbuf_ptr, flatbuf_size); + + std::shared_ptr< ScrubManager::base_scrub_req > scrub_req; + bool success_to_load{false}; + switch (scrub_type) { + case SCRUB_TYPE::PG_META: { + if (!VerifySizePrefixedPgMetaScrubReqBuffer(verifier)) { + LOGW("SCRUB_BLOB received with invalid flatbuffer for pg={}", pg_id); + return; + } + scrub_req = std::make_shared< ScrubManager::base_scrub_req >(); + success_to_load = scrub_req->load(flatbuf_ptr, flatbuf_size); + break; + } + case SCRUB_TYPE::DEEP_BLOB: + case SCRUB_TYPE::SHALLOW_BLOB: { + if (!VerifySizePrefixedBlobScrubReqBuffer(verifier)) { + LOGW("SCRUB_BLOB received with invalid flatbuffer for pg={}", pg_id); + return; + } + scrub_req = std::make_shared< ScrubManager::blob_scrub_req >(); + success_to_load = scrub_req->load(flatbuf_ptr, flatbuf_size); + break; + } + case SCRUB_TYPE::DEEP_SHARD: + case SCRUB_TYPE::SHALLOW_SHARD: { + if (!VerifySizePrefixedShardScrubReqBuffer(verifier)) { + LOGW("SCRUB_SHARD received with invalid flatbuffer for pg={}", pg_id); + return; + } + scrub_req = std::make_shared< ScrubManager::shard_scrub_req >(); + success_to_load = scrub_req->load(flatbuf_ptr, flatbuf_size); + break; + } + default: + RELEASE_ASSERT(false, "Received unknown scrub type {} for pg={}", scrub_type, pg_id); + } + + if (!success_to_load) { + LOGW("Failed to load scrub_blob request from flatbuffer for pg={}", pg_id); + return; + } + + if (scrub_type != scrub_req->get_scrub_type()) { + LOGW("Scrub type in the request {} does not match with the scrub type in the buffer {}, pg={}", scrub_type, + scrub_req->get_scrub_type(), pg_id); + return; + } + + auto scrub_mgr = home_obj_.scrub_manager(); + if (!scrub_mgr) { + LOGW("ScrubManager is not initialized in HS_PG::on_scrub_req_received for pg={}", pg_id); + return; + } + scrub_mgr->add_scrub_req(scrub_req); +} + +void HSHomeObject::HS_PG::on_scrub_map_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data) { + const 
auto pg_id = pg_info_.id; + + struct rpc_cleanup { + boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data_; + ~rpc_cleanup() { + if (rpc_data_) { rpc_data_->send_response(); } + } + } rpc_cleanup{rpc_data}; + + auto const& incoming_buf = rpc_data->request_blob(); + const auto buf_size = incoming_buf.size(); + const auto buf_ptr = incoming_buf.cbytes(); + + if (!buf_ptr || !buf_size) { + LOGW("PUSH_DEEP_BLOB_SM received with empty buffer for pg={}, buffer_size={}", pg_id, buf_size); + return; + } + + const auto scrub_type = *reinterpret_cast< const SCRUB_TYPE* >(buf_ptr); + const auto flatbuf_ptr = buf_ptr + sizeof(SCRUB_TYPE); + const auto flatbuf_size = buf_size - sizeof(SCRUB_TYPE); + flatbuffers::Verifier verifier(flatbuf_ptr, flatbuf_size); + + /* + auto fnv1a64 = [](const void* data, std::size_t len) -> std::uint64_t { + const std::uint8_t* p = static_cast< const std::uint8_t* >(data); + std::uint64_t h = 14695981039346656037ull; // offset basis + for (std::size_t i = 0; i < len; ++i) { + h ^= p[i]; + h *= 1099511628211ull; // FNV prime + } + return h; + }; + */ + + std::shared_ptr< ScrubManager::BaseScrubMap > scrub_map; + bool success_to_load{false}; + switch (scrub_type) { + case SCRUB_TYPE::SHALLOW_BLOB: { + if (!VerifySizePrefixedShallowBlobScrubMapBuffer(verifier)) { + LOGW("SHALLOW_BLOB scrub map received with invalid flatbuffer for pg={}, buffer_size={}", pg_id, buf_size); + return; + } + scrub_map = std::make_shared< ScrubManager::ShallowBlobScrubMap >(); + success_to_load = scrub_map->load(flatbuf_ptr, flatbuf_size); + break; + } + case SCRUB_TYPE::DEEP_BLOB: { + if (!VerifySizePrefixedDeepBlobScrubMapBuffer(verifier)) { + LOGW("DEEP_BLOB scrub map received with invalid flatbuffer for pg={}, buffer_size={}", pg_id, buf_size); + return; + } + scrub_map = std::make_shared< ScrubManager::DeepBlobScrubMap >(); + success_to_load = scrub_map->load(flatbuf_ptr, flatbuf_size); + break; + } + case SCRUB_TYPE::DEEP_SHARD: { + if 
(!VerifySizePrefixedDeepShardScrubMapBuffer(verifier)) { + LOGW("DEEP_SHARD scrub map received with invalid flatbuffer for pg={}, buffer_size={}", pg_id, buf_size); + return; + } + scrub_map = std::make_shared< ScrubManager::DeepShardScrubMap >(); + success_to_load = scrub_map->load(flatbuf_ptr, flatbuf_size); + break; + } + case SCRUB_TYPE::SHALLOW_SHARD: { + if (!VerifySizePrefixedShallowShardScrubMapBuffer(verifier)) { + LOGW("SHALLOW_SHARD scrub map received with invalid flatbuffer for pg={}, buffer_size={}", pg_id, buf_size); + return; + } + scrub_map = std::make_shared< ScrubManager::ShallowShardScrubMap >(); + success_to_load = scrub_map->load(flatbuf_ptr, flatbuf_size); + break; + } + case SCRUB_TYPE::PG_META: { + if (!VerifySizePrefixedPGMetaScrubMapBuffer(verifier)) { + LOGW("PG_META scrub map received with invalid flatbuffer for pg={}, buffer_size={}", pg_id, buf_size); + return; + } + scrub_map = std::make_shared< ScrubManager::PGMetaScrubMap >(); + success_to_load = scrub_map->load(flatbuf_ptr, flatbuf_size); + break; + } + default: + RELEASE_ASSERT(false, "Received unknown scrub map type {} for pg={}", scrub_type, pg_id); + } + + if (!success_to_load) { + LOGW("Failed to load scrub map from flatbuffer for pg={}, scrub_type:{}", pg_id, scrub_type); + return; + } + + if (scrub_type != scrub_map->get_scrub_type()) { + LOGW("Scrub type in the request {} does not match with the scrub type in the buffer {}, pg={}", scrub_type, + scrub_map->get_scrub_type(), pg_id); + return; + } + + auto scrub_mgr = home_obj_.scrub_manager(); + if (!scrub_mgr) { + LOGW("ScrubManager is not initialized in HS_PG::on_scrub_map_received for pg={}", pg_id); + return; + } + scrub_mgr->add_scrub_map(pg_id, scrub_map); +} + // NOTE: caller should hold the _pg_lock const HSHomeObject::HS_PG* HSHomeObject::_get_hs_pg_unlocked(pg_id_t pg_id) const { auto iter = _pg_map.find(pg_id); @@ -1324,9 +1540,9 @@ void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore: 
auto hs_pg = dynamic_cast< HS_PG* >(iter->second.get()); auto move_from_v_chunk = chunk_selector()->get_extend_vchunk(move_from_chunk); - // TODO:: for now, when updating pchunk for a vchunk, we have to update the whole pg super blk. we can optimize this - // by persist a single superblk for each vchunk in the pg, so that we only need to update the vchunk superblk - // itself. + // TODO:: for now, when updating pchunk for a vchunk, we have to update the whole pg super blk. we can optimize + // this by persist a single superblk for each vchunk in the pg, so that we only need to update the vchunk + // superblk itself. auto pg_chunks = hs_pg->pg_sb_->get_chunk_ids_mutable(); @@ -1338,7 +1554,7 @@ void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore: if (sisl_unlikely(pg_chunks[v_chunk_id] == move_to_chunk)) { // this might happens when crash recovery. the crash happens after pg metablk is updated but before gc task // metablk is destroyed. - LOGD("gc task_id={}, the pchunk_id for vchunk={} for pg_id={} is already {}, update pg metablk again!", + LOGD("gc task_id={}, the pchunk_id for vchunk={} for pg_id={} is already {}, skip updating pg metablk!", task_id, v_chunk_id, pg_id, move_to_chunk); } else { RELEASE_ASSERT(pg_chunks[v_chunk_id] == move_from_chunk, @@ -1349,35 +1565,36 @@ void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore: LOGD("gc task_id={}, pchunk for vchunk={} of pg_id={} is updated from {} to {}", task_id, v_chunk_id, pg_id, move_from_chunk, move_to_chunk); - // TODO:hs_pg->shards_.size() will be decreased by 1 in delete_shard if gc finds a empty shard, which will be - // implemented later - hs_pg->durable_entities_update([this, move_from_v_chunk, &move_to_chunk, &move_from_chunk, &pg_id, - &task_id](auto& de) { - // active_blob_count is updated by put/delete blob, not change it here. 
+ // TODO:hs_pg->shards_.size() will be decreased by 1 in delete_shard if gc finds a empty shard, which will + // be implemented later + hs_pg->durable_entities_update( + [this, move_from_v_chunk, &move_to_chunk, &move_from_chunk, &pg_id, &task_id](auto& de) { + // active_blob_count is updated by put/delete blob, not change it here. - // considering the complexity of gc crash recovery for tombstone_blob_count, we get it directly from index - // table , which is the most accurate. + // considering the complexity of gc crash recovery for tombstone_blob_count, we get it directly from + // index table , which is the most accurate. - // TODO::do we need this as durable entity? remove it and get all the from pg index in real time. - de.tombstone_blob_count = get_pg_tombstone_blob_count(pg_id); + // TODO::do we need this as durable entity? remove it and get all the from pg index in real time. + de.tombstone_blob_count = get_pg_tombstone_blob_count(pg_id); - auto move_to_v_chunk = chunk_selector()->get_extend_vchunk(move_to_chunk); + auto move_to_v_chunk = chunk_selector()->get_extend_vchunk(move_to_chunk); - auto total_occupied_blk_count_by_move_from_chunk = move_from_v_chunk->get_used_blks(); - auto total_occupied_blk_count_by_move_to_chunk = move_to_v_chunk->get_used_blks(); + auto total_occupied_blk_count_by_move_from_chunk = move_from_v_chunk->get_used_blks(); + auto total_occupied_blk_count_by_move_to_chunk = move_to_v_chunk->get_used_blks(); - // TODO::in recovery case , this might be updated again , fix me later. - const auto reclaimed_blk_count = - total_occupied_blk_count_by_move_from_chunk - total_occupied_blk_count_by_move_to_chunk; + // TODO::in recovery case , this might be updated again , fix me later. 
+ const auto reclaimed_blk_count = + total_occupied_blk_count_by_move_from_chunk - total_occupied_blk_count_by_move_to_chunk; - de.total_occupied_blk_count -= reclaimed_blk_count; - de.total_reclaimed_blk_count += reclaimed_blk_count; + de.total_occupied_blk_count -= reclaimed_blk_count; + de.total_reclaimed_blk_count += reclaimed_blk_count; - LOGD("gc task_id={}, move_from_chunk={}, total_occupied_blk_count_by_move_from_chunk={}, move_to_chunk={}, " - "total_occupied_blk_count_by_move_to_chunk={}, total_occupied_blk_count={}", - task_id, move_from_chunk, total_occupied_blk_count_by_move_from_chunk, move_to_chunk, - total_occupied_blk_count_by_move_to_chunk, de.total_occupied_blk_count.load()); - }); + LOGD("gc task_id={}, move_from_chunk={}, total_occupied_blk_count_by_move_from_chunk={}, " + "move_to_chunk={}, " + "total_occupied_blk_count_by_move_to_chunk={}, total_occupied_blk_count={}", + task_id, move_from_chunk, total_occupied_blk_count_by_move_from_chunk, move_to_chunk, + total_occupied_blk_count_by_move_to_chunk, de.total_occupied_blk_count.load()); + }); hs_pg->pg_sb_->total_occupied_blk_count = hs_pg->durable_entities().total_occupied_blk_count.load(std::memory_order_relaxed); diff --git a/src/lib/homestore_backend/hs_shard_manager.cpp b/src/lib/homestore_backend/hs_shard_manager.cpp index 8c949cb3e..387515b89 100644 --- a/src/lib/homestore_backend/hs_shard_manager.cpp +++ b/src/lib/homestore_backend/hs_shard_manager.cpp @@ -63,15 +63,15 @@ uint64_t ShardManager::max_shard_size() { return Gi; } uint64_t ShardManager::max_shard_num_in_pg() { return ((uint64_t)0x01) << shard_width; } -shard_id_t HSHomeObject::generate_new_shard_id(pg_id_t pgid) { +shard_id_t HSHomeObject::generate_new_shard_id(pg_id_t pg_id) { std::scoped_lock lock_guard(_pg_lock); - auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pgid)); + auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id)); RELEASE_ASSERT(hs_pg, "Missing pg info"); auto new_sequence_num = 
++hs_pg->shard_sequence_num_; RELEASE_ASSERT(new_sequence_num < ShardManager::max_shard_num_in_pg(), "new shard id must be less than ShardManager::max_shard_num_in_pg()"); - return make_new_shard_id(pgid, new_sequence_num); + return make_new_shard_id(pg_id, new_sequence_num); } uint64_t HSHomeObject::get_sequence_num_from_shard_id(uint64_t shard_id) { @@ -737,6 +737,26 @@ void HSHomeObject::add_new_shard_to_map(std::unique_ptr< HS_Shard > shard) { if (sequence_num > hs_pg->shard_sequence_num_) { hs_pg->shard_sequence_num_ = sequence_num; } } +void HSHomeObject::delete_shard_from_map(shard_id_t shard_id) { + std::scoped_lock lock_guard(_pg_lock, _shard_lock); + auto shard_iter = _shard_map.find(shard_id); + RELEASE_ASSERT(shard_iter != _shard_map.end(), + "try to delete shardID=0x{:x}, pg={}, shard=0x{:x}, but shard does not exist", shard_id, + (shard_id >> homeobject::shard_width), (shard_id & homeobject::shard_mask)); + auto hs_shard = d_cast< HS_Shard* >((*shard_iter->second).get()); + const auto pg_id = hs_shard->info.placement_group; + + auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id)); + RELEASE_ASSERT(hs_pg, "Missing pg info, pg={}", pg_id); + auto& shards = hs_pg->shards_; + shards.remove_if([shard_id](auto& shard_it) { return (shard_it->info).id == shard_id; }); + _shard_map.erase(shard_id); + + auto p_chunk_id = hs_shard->p_chunk_id(); + chunk_to_shards_map_[p_chunk_id].erase(shard_id); + // TODO:: delete shard meta blk +} + void HSHomeObject::update_shard_in_map(const ShardInfo& shard_info) { std::scoped_lock lock_guard(_shard_lock); auto shard_iter = _shard_map.find(shard_info.id); diff --git a/src/lib/homestore_backend/replication_state_machine.cpp b/src/lib/homestore_backend/replication_state_machine.cpp index 6ea8a1c07..5d022213b 100644 --- a/src/lib/homestore_backend/replication_state_machine.cpp +++ b/src/lib/homestore_backend/replication_state_machine.cpp @@ -287,9 +287,13 @@ void ReplicationStateMachine::on_destroy(const 
homestore::group_id_t& group_id) LOGW("do not have pg mapped by group_id={}", boost::uuids::to_string(group_id)); return; } - home_object_->pg_destroy(PG_ID.value()); - LOGI("replica destroyed, cleared pg={} resources with group_id={}", PG_ID.value(), - boost::uuids::to_string(group_id)); + + const auto pg_id = PG_ID.value(); + home_object_->pg_destroy(pg_id); + LOGI("replica destroyed, cleared pg={} resources with group_id={}", pg_id, boost::uuids::to_string(group_id)); + // there is a case that after pg is destroyed above and crash happens before scrub_mgr#remove_pg is called, there + // will be a stale pg_scrub_superblk. we will handle this in metablk replay. + home_object_->scrub_manager()->remove_pg(pg_id); } void ReplicationStateMachine::on_remove_member(const homestore::replica_id_t& member, trace_id_t tid) { @@ -1052,4 +1056,31 @@ void ReplicationStateMachine::on_log_replay_done(const homestore::group_id_t& gr home_object_->refresh_pg_statistics(pg_id); } +void ReplicationStateMachine::on_become_leader(const homestore::group_id_t& group_id) { + auto pg_id_opt = home_object_->get_pg_id_with_group_id(group_id); + if (!pg_id_opt.has_value()) { + LOGE("become leader but can not find any pg for group={}!", group_id); + return; + } + const auto pg_id = pg_id_opt.value(); + RELEASE_ASSERT(home_object_->pg_exists(pg_id), "pg={} should exist, but not! fatal error!", pg_id); + // TODO:: add whatever actions need to be taken. +} + +void ReplicationStateMachine::on_become_follower(const homestore::group_id_t& group_id) { + auto pg_id_opt = home_object_->get_pg_id_with_group_id(group_id); + if (!pg_id_opt.has_value()) { + LOGE("become follower but can not find any pg for group={}!", group_id); + return; + } + const auto pg_id = pg_id_opt.value(); + RELEASE_ASSERT(home_object_->pg_exists(pg_id), "pg={} should exist, but not! 
fatal error!", pg_id); + + LOGI("become follower of group {}, cancel scrub task for pg={}", group_id, pg_id); + // TODO:: add whatever actions need to be taken. + + // cancel the scrub task since this node is no longer the leader. + home_object_->scrub_manager()->cancel_scrub_task(pg_id); +} + } // namespace homeobject diff --git a/src/lib/homestore_backend/replication_state_machine.hpp b/src/lib/homestore_backend/replication_state_machine.hpp index 77906c8ae..98849507e 100644 --- a/src/lib/homestore_backend/replication_state_machine.hpp +++ b/src/lib/homestore_backend/replication_state_machine.hpp @@ -240,6 +240,14 @@ class ReplicationStateMachine : public homestore::ReplDevListener { /// void on_log_replay_done(const homestore::group_id_t& group_id) override; + /// @brief this is called when this node becomes leader for the group + /// @param group_id - the group in which this node has become leader + virtual void on_become_leader(const homestore::group_id_t& group_id) override; + + /// @brief this is called when this node becomes follower for the group + /// @param group_id - the group in which this node has become follower + virtual void on_become_follower(const homestore::group_id_t& group_id) override; + private: HSHomeObject* home_object_{nullptr}; diff --git a/src/lib/homestore_backend/scrub_manager.cpp b/src/lib/homestore_backend/scrub_manager.cpp new file mode 100644 index 000000000..5b9452fa7 --- /dev/null +++ b/src/lib/homestore_backend/scrub_manager.cpp @@ -0,0 +1,1878 @@ +#include "hs_homeobject.hpp" +#include +#include +#include +#include + +namespace homeobject { + +SISL_LOGGING_DECL(scrubmgr) +#define NO_TASK_ID 0 +#define HDD_IOPS 200 + +#define SCRUBLOG(level, pg_id, task_id, msg, ...) \ + LOG##level##MOD(scrubmgr, "[pg_id={}, task_id={}] " msg, pg_id, task_id, ##__VA_ARGS__) + +#define SCRUBLOGD(pg_id, task_id, msg, ...) SCRUBLOG(DEBUG, pg_id, task_id, msg, ##__VA_ARGS__) +#define SCRUBLOGI(pg_id, task_id, msg, ...) 
SCRUBLOG(INFO, pg_id, task_id, msg, ##__VA_ARGS__) +#define SCRUBLOGW(pg_id, task_id, msg, ...) SCRUBLOG(WARN, pg_id, task_id, msg, ##__VA_ARGS__) +#define SCRUBLOGE(pg_id, task_id, msg, ...) SCRUBLOG(ERROR, pg_id, task_id, msg, ##__VA_ARGS__) +#define SCRUBLOGC(pg_id, task_id, msg, ...) SCRUBLOG(CRITICAL, pg_id, task_id, msg, ##__VA_ARGS__) + +ScrubManager::ScrubManager(HSHomeObject* homeobject) : m_hs_home_object{homeobject} { + // Register meta_service handlers to recover pg scrub superblocks + std::vector< homestore::superblk< pg_scrub_superblk > > stale_pg_scrub_sbs; + homestore::meta_service().register_handler( + pg_scrub_meta_name, + [this, &stale_pg_scrub_sbs](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { + on_pg_scrub_meta_blk_found(std::move(buf), voidptr_cast(mblk), stale_pg_scrub_sbs); + }, + nullptr, true); + homestore::meta_service().read_sub_sb(pg_scrub_meta_name); + + // remove stale pg scrub superblocks + for (auto& sb : stale_pg_scrub_sbs) + sb.destroy(); +} + +ScrubManager::~ScrubManager() { stop(); } + +void ScrubManager::scan_pg_for_scrub() { + for (auto const& [pg_id, _] : m_pg_scrub_sb_map) { + if (is_eligible_for_deep_scrub(pg_id)) { + LOGINFOMOD(scrubmgr, "pg={} is eligible for deep scrub", pg_id); + submit_scrub_task(pg_id, true) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, pg_id](std::shared_ptr< ShallowScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "deep scrub failed for pg={}", pg_id); + return; + } + LOGINFOMOD(scrubmgr, "deep scrub is completed for pg={}", pg_id); + auto deep_report = std::dynamic_pointer_cast< DeepScrubReport >(report); + if (!deep_report) { + LOGERRORMOD(scrubmgr, "report for deep scrub can not be casted to DeepScrubReport for pg={}", + pg_id); + return; + } + handle_deep_pg_scrub_report(std::move(deep_report)); + }); + } else if (is_eligible_for_shallow_scrub(pg_id)) { + LOGINFOMOD(scrubmgr, "pg={} is eligible for shallow scrub", pg_id); + 
submit_scrub_task(pg_id, false) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, pg_id](std::shared_ptr< ShallowScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "shallow scrub failed for pg={}", pg_id); + return; + } + LOGINFOMOD(scrubmgr, "shallow scrub is completed for pg={}", pg_id); + handle_shallow_pg_scrub_report(std::move(report)); + }); + } else { + LOGINFOMOD(scrubmgr, "pg={} is not eligible for any scrubbing!", pg_id); + } + } +} + +void ScrubManager::handle_shallow_pg_scrub_report(std::shared_ptr< ShallowScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "Shallow scrub report is null!"); + return; + } + + report->print(); + // TODO:: add more logic, log event for notification. +} + +void ScrubManager::handle_deep_pg_scrub_report(std::shared_ptr< DeepScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "Deep scrub report is null!"); + return; + } + + report->print(); + // TODO:: add more logic, log event for notification. +} + +bool ScrubManager::is_eligible_for_deep_scrub(const pg_id_t& pg_id) { + // TODO:: add the real eligibility check logic + return false; +} + +bool ScrubManager::is_eligible_for_shallow_scrub(const pg_id_t& pg_id) { + // TODO:: add the real eligibility check logic + return false; +} + +void ScrubManager::start() { + // TODO :: make thread count configurable, thread number is the most concurrent scrub tasks that can be handled + // concurrently. too many concurrent scrub tasks may bring too much pressure to the node. + const auto most_concurrent_scrub_task_num = 2; + m_scrub_executor = std::make_shared< folly::IOThreadPoolExecutor >(most_concurrent_scrub_task_num); + for (int i = 0; i < most_concurrent_scrub_task_num; ++i) { + m_scrub_executor->add([this]() { + while (true) { + // if no available scrub task, it will be blocked here. 
+ auto pop_result = m_scrub_task_queue.pop(); + if (pop_result.is_closed()) { + LOGINFOMOD(scrubmgr, "scrub task queue is stopped, no need to handle scrub task anymore!"); + break; + } + RELEASE_ASSERT(pop_result.value.has_value() && pop_result.is_ok(), + "pop from scrub task queue should not fail when it is not closed!"); + auto task = pop_result.value.value(); + // we handle pg scrub task in a single thread, so that we can control the concurrent scrub tasks by + // controlling the thread number of m_scrub_executor. + handle_pg_scrub_task(std::move(task)); + } + }); + } + + const auto most_concurrent_scrub_req_num = 2; + // we don't set priority for req as that of task, only control the concurrency to not bring too much pressure to + // this node. + m_scrub_req_executor = std::make_shared< folly::IOThreadPoolExecutor >(most_concurrent_scrub_req_num); + + iomanager.run_on_wait(iomgr::reactor_regex::random_worker, [&]() { + m_scrub_timer_fiber = iomanager.iofiber_self(); + // TODO: make the interval configurable, for now set it to 60 seconds + m_scrub_timer_hdl = iomanager.schedule_thread_timer(60ull * 1000 * 1000 * 1000, true, nullptr /*cookie*/, + [this](void*) { scan_pg_for_scrub(); }); + }); + LOGINFOMOD(scrubmgr, "scrub manager started!"); +} + +void ScrubManager::stop() { + // shutdown timer + if (m_scrub_timer_hdl == iomgr::null_timer_handle) { + LOGINFOMOD(scrubmgr, "scrub scheduler timer is not running, no need to stop it"); + return; + } + RELEASE_ASSERT(m_scrub_timer_fiber, + "m_scrub_timer_hdl is not null_timer_handle, but m_scrub_timer_fiber is null, fatal error!"); + LOGINFOMOD(scrubmgr, "stop scrub scheduler timer"); + iomanager.run_on_wait(m_scrub_timer_fiber, [&]() { + iomanager.cancel_timer(m_scrub_timer_hdl, true); + m_scrub_timer_hdl = iomgr::null_timer_handle; + }); + m_scrub_timer_fiber = nullptr; + + // cancel all the running scrub tasks and clear the scrub task queue. + // TODO:: add a stopped flag to avoid adding new scrub task if stopped. 
+    m_scrub_task_queue.close();
+    for (auto& [_, pg_scrub_ctx] : m_pg_scrub_ctx_map) {
+        pg_scrub_ctx->cancel();
+    }
+
+    m_scrub_executor->stop();
+    m_scrub_executor.reset();
+    m_scrub_req_executor->stop();
+    m_scrub_req_executor.reset();
+    LOGINFOMOD(scrubmgr, "scrub manager stopped!");
+}
+
+// Queue an incoming scrub request; it is processed asynchronously by handle_scrub_req() on m_scrub_req_executor.
+void ScrubManager::add_scrub_req(std::shared_ptr< base_scrub_req > req) {
+    m_scrub_req_executor->add([this, req = std::move(req)]() { handle_scrub_req(req); });
+}
+
+// Forward a scrub map to the scrub context registered for pg_id.
+// Returns false when no context exists for that pg, otherwise whatever PGScrubContext::add_scrub_map() returns.
+bool ScrubManager::add_scrub_map(const pg_id_t pg_id, std::shared_ptr< BaseScrubMap > bsm) {
+    auto pg_scrub_ctx_it = m_pg_scrub_ctx_map.find(pg_id);
+    if (pg_scrub_ctx_it == m_pg_scrub_ctx_map.end()) {
+        LOGERRORMOD(scrubmgr, "can not find scrub context for pg_id={}, fail to add scrub map!", pg_id);
+        return false;
+    }
+
+    auto& pg_scrub_ctx = pg_scrub_ctx_it->second;
+    return pg_scrub_ctx->add_scrub_map(std::move(bsm));
+}
+
+// Handle a scrub request received from another peer: run the requested local scrub and push the resulting
+// scrub map back to the issuer. The request is dropped (with a log) when it is null, when the pg or its repl_dev
+// can not be found, when this node is the leader of the pg, or when the local scrub produces no map.
+void ScrubManager::handle_scrub_req(std::shared_ptr< base_scrub_req > req) {
+    if (!req) {
+        LOGERRORMOD(scrubmgr, "scrub req is null, can not handle it!");
+        return;
+    }
+
+    const auto& pg_id = req->pg_id;
+    const auto& task_id = req->task_id;
+    const auto hs_pg = m_hs_home_object->get_hs_pg(pg_id);
+    if (!hs_pg) {
+        SCRUBLOGD(pg_id, task_id, "can not find hs_pg, fail to handle scrub req!");
+        return;
+    }
+
+    const auto& pg_repl_dev = hs_pg->repl_dev_;
+    if (!pg_repl_dev) {
+        SCRUBLOGD(pg_id, task_id, "repl_dev is null, fail to handle scrub req!");
+        return;
+    }
+
+    if (pg_repl_dev->is_leader()) {
+        SCRUBLOGD(pg_id, task_id, "leader of pg, no need to handle stale scrub req!");
+        return;
+    }
+
+    std::shared_ptr< BaseScrubMap > scrub_map;
+    auto& remote_peer_id = req->issuer_peer_id;
+
+    // 1 do scrub
+    const auto scrub_type = req->get_scrub_type();
+    switch (scrub_type) {
+    case SCRUB_TYPE::PG_META: {
+        SCRUBLOGD(pg_id, task_id, "handling pg meta scrub req, ");
+        scrub_map = scrub_pg_meta(req);
+        break;
+    }
+    case SCRUB_TYPE::DEEP_BLOB:
+    case SCRUB_TYPE::SHALLOW_BLOB: {
+        auto blob_req = std::dynamic_pointer_cast< blob_scrub_req >(req);
+        RELEASE_ASSERT(blob_req, "Failed to cast to blob_scrub_req");
+        SCRUBLOGD(pg_id, task_id, "handling blob scrub req, is_deep_scrub:{}", req->is_deep_scrub());
+        scrub_map = local_scrub_blob(blob_req);
+        break;
+    }
+    case SCRUB_TYPE::DEEP_SHARD:
+    case SCRUB_TYPE::SHALLOW_SHARD: {
+        auto shard_req = std::dynamic_pointer_cast< shard_scrub_req >(req);
+        RELEASE_ASSERT(shard_req, "Failed to cast to shard_scrub_req");
+        SCRUBLOGD(pg_id, task_id, "handling shard scrub req, is_deep_scrub:{}", req->is_deep_scrub());
+        scrub_map = local_scrub_shard(shard_req);
+        break;
+    }
+    default:
+        RELEASE_ASSERT(false, "unknown scrub req type: {}!", scrub_type);
+    }
+
+    if (!scrub_map) {
+        SCRUBLOGD(pg_id, task_id, "fail to handle scrub req, drop it!");
+        return;
+    }
+
+    // 2 send scrub map back to leader
+    // The payload is [scrub_type][flatbuffer]; both blobs are non-owning views.
+    // NOTE(review): blob_list points at `scrub_type` and `flatbuffer`, which are locals destroyed when this function
+    // returns, while the send below is asynchronous. This is only safe if data_request_unidirectional() serializes
+    // the blobs before returning - confirm against the homestore implementation.
+    auto flatbuffer = scrub_map->build_flat_buffer();
+    sisl::io_blob_list_t blob_list;
+    blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false);
+    blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false);
+
+    // no need to retry, leader will handle retries
+    pg_repl_dev->data_request_unidirectional(remote_peer_id, HSHomeObject::PUSH_SCRUB_MAP, blob_list)
+        .via(&folly::InlineExecutor::instance())
+        .thenValue([pg_id, remote_peer_id, scrub_type, task_id](auto&& response) {
+            if (response.hasError()) {
+                SCRUBLOGD(pg_id, task_id, "failed to send scrub map to peer {}, scrub_type:{}, error={}",
+                          remote_peer_id, scrub_type, response.error());
+                return;
+            }
+
+            SCRUBLOGD(pg_id, task_id, "successfully sent scrub map to peer {}, scrub_type:{}", remote_peer_id,
+                      scrub_type);
+        });
+}
+
+// Poll the repl_dev until its last commit lsn reaches scrub_lsn.
+// Checks at most wait_retry_times times, sleeping one second after each failed check.
+// Returns true once commit lsn >= scrub_lsn; false when repl_dev is null or the retries are exhausted.
+bool ScrubManager::wait_for_scrub_lsn_commit(shared< homestore::ReplDev > repl_dev, int64_t scrub_lsn) {
+    if (!repl_dev) {
+        LOGERRORMOD(scrubmgr, "repl_dev is null, can not wait for scrub lsn commit!");
+        return false;
+    }
+
+    // TODO:: make this configurable
+    const auto wait_retry_times = 2;
+    for (auto i = 0; i < wait_retry_times; ++i) {
+        auto commit_lsn = repl_dev->get_last_commit_lsn();
+        if (commit_lsn >= scrub_lsn) {
+            LOGINFOMOD(scrubmgr, "commit lsn {} is greater than or equal to scrub lsn {}, wait successfully",
+                       commit_lsn, scrub_lsn);
+            return true;
+        }
+        LOGINFOMOD(scrubmgr,
+                   "commit lsn {} is less than scrub lsn {}, wait for 1 second before retrying, retry times {}/{}",
+                   commit_lsn, scrub_lsn, i + 1, wait_retry_times);
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+    }
+
+    return false;
+}
+
+// Build the PG meta scrub map for this node. For now this only creates a map carrying the request identity
+// (pg, task, req, scrub_lsn) and our uuid; no on-disk pg meta is read yet (see TODO below).
+std::shared_ptr< ScrubManager::PGMetaScrubMap > ScrubManager::scrub_pg_meta(std::shared_ptr< base_scrub_req > req) {
+    const auto my_uuid = m_hs_home_object->our_uuid();
+    const auto pg_id = req->pg_id;
+    const auto task_id = req->task_id;
+    const auto req_id = req->req_id;
+    const auto scrub_lsn = req->scrub_lsn;
+    auto pg_meta_scrub_map =
+        std::make_shared< ScrubManager::PGMetaScrubMap >(pg_id, task_id, req_id, scrub_lsn, my_uuid);
+
+    SCRUBLOGD(pg_id, task_id, "req_id={}, do pg meta scrub", req_id);
+
+    // TODO:: add support to read the pg meta blk of a specific pg.
+    // read pg metablk and compare with in-memory state, return the real pg meta scrub map after comparison.
+
+    return pg_meta_scrub_map;
+}
+
+// Scrub the blobs of a pg whose blob id lies in [req->start, req->end] on this node.
+// Shallow scrub: returns a ShallowBlobScrubMap listing the (non-tombstoned) blob routes found in the index table.
+// Deep scrub: additionally reads every blob from disk, verifies it and records either its hash or a ScrubResult.
+// Returns nullptr when the pg is not found, the commit lsn does not reach scrub_lsn in time, or the index query
+// fails.
+std::shared_ptr< ScrubManager::BaseScrubMap > ScrubManager::local_scrub_blob(std::shared_ptr< blob_scrub_req > req) {
+    const auto my_uuid = m_hs_home_object->our_uuid();
+    const auto task_id = req->task_id;
+    const auto req_id = req->req_id;
+    const auto scrub_lsn = req->scrub_lsn;
+    const auto& pg_id = req->pg_id;
+    const auto& start_blob_id = req->start;
+    const auto& end_blob_id = req->end;
+
+    SCRUBLOGD(pg_id, task_id, "req_id={}, scrub_blob: range [{}, {}], scrub_lsn={}", req_id, start_blob_id, end_blob_id,
+              scrub_lsn);
+
+    auto hs_pg = m_hs_home_object->get_hs_pg(pg_id);
+    if (!hs_pg) {
+        SCRUBLOGD(pg_id, task_id, "req_id={},can not find hs_pg, fail to do deep blob scrub!", req_id);
+        return nullptr;
+    }
+
+    if (!wait_for_scrub_lsn_commit(hs_pg->repl_dev_, scrub_lsn)) {
+        SCRUBLOGD(pg_id, task_id,
+                  "req_id={}, commit lsn is not advanced to scrub lsn {} after waiting for a while, fail to do deep "
+                  "blob scrub",
+                  req_id, scrub_lsn);
+        return nullptr;
+    }
+
+    // get all the scrub candidate blobs. we only get those blobs in this range and the sealed_lsn of the shard is after
+    // the scrub_lsn.
+    // The key range spans every shard (0 .. max) so the per-blob filter below is what actually enforces the blob id
+    // range.
+    const auto start = BlobRouteKey{BlobRoute{0, start_blob_id}};
+    const auto end = BlobRouteKey{BlobRoute{std::numeric_limits< uint64_t >::max(), end_blob_id}};
+
+    std::vector< std::pair< BlobRouteKey, BlobRouteValue > > scrub_candidate_blobs;
+    auto& pg_index_table = hs_pg->index_table_;
+    homestore::BtreeQueryRequest< BlobRouteKey > query_req{
+        homestore::BtreeKeyRange< BlobRouteKey >{start, true /* inclusive */, end, true /* inclusive */},
+        homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, std::numeric_limits< uint32_t >::max(),
+        [scrub_lsn, start_blob_id, end_blob_id](homestore::BtreeKey const& key,
+                                                homestore::BtreeValue const& value) -> bool {
+            // skip deleted blobs
+            BlobRouteValue existing_value{value};
+            if (existing_value.pbas() == HSHomeObject::tombstone_pbas) { return false; }
+            const auto blob_route_key = BlobRouteKey{key};
+            if (blob_route_key.key().blob < start_blob_id || blob_route_key.key().blob > end_blob_id) { return false; }
+
+            // TODO:: after we have shard seal_lsn, check whether the shard of the blob is sealed after scrub_lsn. If
+            // yes, filter it out as well.
+
+            return true;
+        }};
+
+    auto const status = pg_index_table->query(query_req, scrub_candidate_blobs);
+    if (status != homestore::btree_status_t::success) {
+        SCRUBLOGD(pg_id, task_id, "req_id={}, Failed to query blobs in index table for status={}", req_id, status);
+        return nullptr;
+    }
+
+    const bool is_deep_scrub = req->is_deep_scrub();
+
+    if (!is_deep_scrub) {
+        // shallow scrub: only report which blobs exist, no data is read.
+        auto shallow_srcub_map = std::make_shared< ScrubManager::ShallowBlobScrubMap >(
+            pg_id, task_id, req_id, scrub_lsn, my_uuid, start_blob_id, end_blob_id);
+
+        for (const auto& [k, _] : scrub_candidate_blobs) {
+            shallow_srcub_map->add_blob(k.key());
+        }
+
+        SCRUBLOGD(pg_id, task_id, "req_id={}, shallow blob scrub completed, found {} blobs in range [{},{})", req_id,
+                  shallow_srcub_map->blobs.size(), start, end);
+
+        return shallow_srcub_map;
+    }
+
+    // deep scrub: read and check blobs.
+    auto deep_scrub_map = std::make_shared< ScrubManager::DeepBlobScrubMap >(pg_id, task_id, req_id, scrub_lsn, my_uuid,
+                                                                             start_blob_id, end_blob_id);
+    auto& data_service = homestore::data_service();
+    const auto blk_size = data_service.get_blk_size();
+
+    // Sort scrub_candidate_blobs by PBA (physical block address) for sequential disk access
+    std::sort(scrub_candidate_blobs.begin(), scrub_candidate_blobs.end(), [](const auto& a, const auto& b) {
+        // Order by the block number of the blob's single blkid
+        const auto pba_a = a.second.pbas().to_single_blkid();
+        const auto pba_b = b.second.pbas().to_single_blkid();
+        return pba_a.blk_num() < pba_b.blk_num();
+    });
+
+    // to not bring too much io pressure, we deep scrub blobs one by one.
+    // TODO: scrubbing blobs concurrently if necessary.
+    for (const auto& [k, v] : scrub_candidate_blobs) {
+        auto pba = v.pbas();
+        auto total_size = pba.blk_count() * blk_size;
+        sisl::sg_list data_sgs;
+        data_sgs.size = total_size;
+        data_sgs.iovs.emplace_back(
+            iovec{.iov_base = iomanager.iobuf_alloc(blk_size, total_size), .iov_len = total_size});
+
+        // `k` is captured by reference; this is safe only because .get() below blocks until the continuation ran.
+        data_service.async_read(pba, data_sgs, total_size)
+            .thenValue([this, &k, data_sgs = std::move(data_sgs), deep_scrub_map](auto&& err) {
+                auto blob = data_sgs.iovs[0].iov_base;
+
+                // free the read buffer on every exit path of this continuation
+                struct buffer_free_guard {
+                    uint8_t* buf;
+                    ~buffer_free_guard() { iomanager.iobuf_free(buf); }
+                } guard{reinterpret_cast< uint8_t* >(blob)};
+
+                if (err) {
+                    LOGERRORMOD(scrubmgr, "Failed to read blob for deep scrub, blob_route={}, error={}", k.key(),
+                                err.message());
+                    deep_scrub_map->add_blob_result(k.key(), ScrubResult::IO_ERROR);
+                    return;
+                }
+
+                const auto& shard_id = k.key().shard;
+                const auto& blob_id = k.key().blob;
+                const auto blob_verify_succeed = m_hs_home_object->verify_blob(blob, shard_id, blob_id, true);
+                if (!blob_verify_succeed) {
+                    LOGERRORMOD(scrubmgr, "Blob verification failed for deep scrub, blob_route={}", k.key());
+                    deep_scrub_map->add_blob_result(k.key(), ScrubResult::MISMATCH);
+                    return;
+                }
+
+                // verified: report the hash stored in the blob header so the leader can compare replicas
+                BlobHashArray blob_hash{};
+                HSHomeObject::BlobHeader const* header = r_cast< HSHomeObject::BlobHeader const* >(blob);
+                std::memcpy(blob_hash.data(), header->hash, blob_hash.size());
+                deep_scrub_map->add_blob_result(k.key(), blob_hash);
+            })
+            // we do deep blob sequentially, so that we can control the io pressure brought by deep scrub.
+            .get();
+    }
+
+    SCRUBLOGD(pg_id, task_id, "req_id={}, deep blob scrub completed, found {} blobs in range [{},{})", req_id,
+              deep_scrub_map->blobs.size(), start, end);
+
+    return deep_scrub_map;
+}
+
+// Scrub the shards of a pg whose (pg-stripped) shard id lies in [req->start, req->end] on this node.
+// Returns a DeepShardScrubMap for deep scrub or a ShallowShardScrubMap otherwise, listing the shards found.
+// Returns nullptr when the pg is not found or the commit lsn does not reach scrub_lsn in time.
+std::shared_ptr< ScrubManager::ShallowShardScrubMap >
+ScrubManager::local_scrub_shard(std::shared_ptr< shard_scrub_req > req) {
+    const auto my_uuid = m_hs_home_object->our_uuid();
+    const auto task_id = req->task_id;
+    const auto req_id = req->req_id;
+    const auto scrub_lsn = req->scrub_lsn;
+    const auto& pg_id = req->pg_id;
+    const auto start = req->start;
+    const auto end = req->end;
+
+    auto hs_pg = m_hs_home_object->get_hs_pg(pg_id);
+    if (!hs_pg) {
+        SCRUBLOGD(pg_id, task_id, "can not find hs_pg, fail to do deep shard scrub!");
+        return nullptr;
+    }
+
+    if (!wait_for_scrub_lsn_commit(hs_pg->repl_dev_, scrub_lsn)) {
+        SCRUBLOGD(pg_id, task_id,
+                  "commit lsn is not advanced to scrub lsn {} after waiting for a while, fail to do local shard scrub ",
+                  scrub_lsn);
+        return nullptr;
+    }
+
+    std::shared_ptr< ScrubManager::ShallowShardScrubMap > shard_srcub_map;
+    const bool is_deep_scrub = req->is_deep_scrub();
+    if (is_deep_scrub) {
+        shard_srcub_map =
+            std::make_shared< ScrubManager::DeepShardScrubMap >(pg_id, task_id, req_id, scrub_lsn, my_uuid);
+    } else {
+        shard_srcub_map =
+            std::make_shared< ScrubManager::ShallowShardScrubMap >(pg_id, task_id, req_id, scrub_lsn, my_uuid);
+    }
+
+    // Iterate through all shards in the PG
+    for (const auto& shard_it : hs_pg->shards_) {
+        auto shard_id = (shard_it->info).id;
+        // remove pg_id, get the pure shard id.
+        auto pure_shard_id = (shard_id << pg_width) >> pg_width;
+        if (pure_shard_id < start || pure_shard_id > end) { continue; }
+
+        // TODO:: filter out those shards whose seal_lsn is after scrub_lsn, as they are not in the candidate shard
+        // list for scrub.
+        shard_srcub_map->add_shard(shard_id);
+
+        // TODO:: optimize the following logic, dynamic cast once.
+        if (is_deep_scrub) {
+            auto deep_shard_scrub_map = std::dynamic_pointer_cast< DeepShardScrubMap >(shard_srcub_map);
+            RELEASE_ASSERT(deep_shard_scrub_map,
+                           "shard_srcub_map should be DeepShardScrubMap when is_deep_scrub is true!");
+
+            // TODO: Read and verify shard metablk
+            // For now, we just mark it as NONE (no error found) since we can not read a specific shard metablk for now.
+            // it needs the support of homestore#metaservice. we should:
+            // 1. Read shard metablk from homestore
+            // 2. Compare with in-memory shard info
+            // 3. If mismatch or not found or error_io, add to problematic_shards with appropriate ScrubResult
+
+            // TODO:: if find any problematic shard meta blk.
+            // deep_shard_scrub_map->add_problematic_shard(shard_id, ScrubResult::NONE);
+        }
+    }
+
+    SCRUBLOGD(pg_id, task_id, "shard scrub completed, checked {} shards in range [{},{})",
+              shard_srcub_map->shards.size(), start, end);
+
+    return shard_srcub_map;
+}
+
+// Submit a scrub task for a pg and return a future for its report.
+// The returned future is already fulfilled with nullptr when the task is rejected: a scrub context already exists
+// for the pg, the scrub superblk or the pg is missing, or (unless `force`) the pg state is not 0.
+// Unless `force` is set, the pg is marked SCRUBBING before the task is queued.
+folly::SemiFuture< std::shared_ptr< ScrubManager::ShallowScrubReport > >
+ScrubManager::submit_scrub_task(const pg_id_t& pg_id, const bool is_deep, const bool force,
+                                SCRUB_TRIGGER_TYPE trigger_type) {
+    LOGINFOMOD(scrubmgr, "submit a scrub task for pg={}, deep_scrub:{}", pg_id, is_deep);
+    auto it = m_pg_scrub_ctx_map.find(pg_id);
+    if (it != m_pg_scrub_ctx_map.end()) {
+        // TODO:: there is case that two thread try to submit scrub task for the same pg at the same time, we can
+        // optimize it by adding a lock for each pg or using atomic operation to make sure only one scrub task can be
+        // submitted for each pg, and other threads can get the existing scrub task if they want to submit another scrub
+        // task for the same pg.
+        LOGWARNMOD(scrubmgr, "a scrub task is already running for pg={}, no need to submit another one!", pg_id);
+        return folly::makeFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr));
+    }
+
+    const auto ps_scrub_super_blk_it = m_pg_scrub_sb_map.find(pg_id);
+    if (ps_scrub_super_blk_it == m_pg_scrub_sb_map.end()) {
+        LOGERRORMOD(scrubmgr, "can not find scrub superblk for pg={}, fail to submit scrub task!", pg_id);
+        return folly::makeFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr));
+    }
+
+    // Get the PG and check its state
+    const auto hs_pg = m_hs_home_object->get_hs_pg(pg_id);
+    if (!hs_pg) {
+        LOGERRORMOD(scrubmgr, "can not find hs_pg for pg={}, fail to submit scrub task!", pg_id);
+        return folly::makeFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr));
+    }
+
+    // Check if pg_state is HEALTHY (state must be 0)
+    if (!force) {
+        const auto current_state = hs_pg->pg_state_.get();
+        if (current_state != 0) {
+            LOGWARNMOD(scrubmgr, "pg={} is not in HEALTHY state (current_state={}), cannot submit scrub task!", pg_id,
+                       current_state);
+            return folly::makeFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr));
+        }
+
+        // Set SCRUBBING state
+        hs_pg->pg_state_.set_state(PGStateMask::SCRUBBING);
+        LOGINFOMOD(scrubmgr, "set SCRUBBING state for pg={}", pg_id);
+    }
+
+    // TODO::check the stopped flag to avoid submit new scrub task when scrub manager is stopped.
+
+    // the last scrub time of the matching kind is handed to the task (presumably its queue priority - confirm in
+    // scrub_task)
+    const auto& pg_scrub_sb = *(ps_scrub_super_blk_it->second);
+    const auto last_scrub_time =
+        is_deep ? pg_scrub_sb->last_deep_scrub_timestamp : pg_scrub_sb->last_shallow_scrub_timestamp;
+
+    auto [promise, future] = folly::makePromiseContract< std::shared_ptr< ShallowScrubReport > >();
+    ScrubManager::scrub_task task(last_scrub_time, pg_id, is_deep, trigger_type, std::move(promise));
+    m_scrub_task_queue.push(std::move(task));
+    return std::move(future);
+}
+
+// Cancel the running scrub task of a pg, if any, by cancelling its scrub context.
+void ScrubManager::cancel_scrub_task(const pg_id_t& pg_id) {
+    auto it = m_pg_scrub_ctx_map.find(pg_id);
+    if (it == m_pg_scrub_ctx_map.end()) {
+        LOGWARNMOD(scrubmgr, "no running scrub task for pg={}, no need to cancel!", pg_id);
+        return;
+    }
+    it->second->cancel();
+    LOGINFOMOD(scrubmgr, "cancel scrub task for pg={}", pg_id);
+}
+
+// Helper function to send scrub requests to all peers and handle retries.
+// Sends req_blob_list to every member except ourselves, waits up to `timeout` for all scrub maps, and re-sends to
+// the still-missing peers at most `max_retries` times.
+// Returns true only when the task was not cancelled and a scrub map from every member has been collected.
+bool ScrubManager::send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id,
+                                           const std::unordered_set< peer_id_t >& all_member_peer_ids,
+                                           const peer_id_t& my_uuid, shared< homestore::ReplDev > pg_repl_dev,
+                                           const sisl::io_blob_list_t& req_blob_list,
+                                           std::shared_ptr< PGScrubContext > scrub_ctx, uint32_t max_retries,
+                                           std::chrono::seconds timeout, const std::string& scrub_type_name) {
+    // Lambda to send requests to a list of peers
+    auto send_requests_to_remote_peers = [&](const auto& peer_list, bool is_retry) {
+        for (const auto& peer_id : peer_list) {
+            if (peer_id == my_uuid) continue;
+            pg_repl_dev->data_request_unidirectional(peer_id, HSHomeObject::PUSH_SCRUB_REQ, req_blob_list)
+                .via(&folly::InlineExecutor::instance())
+                .thenValue([pg_id, peer_id, task_id, scrub_type_name, is_retry](auto&& response) {
+                    if (response.hasError()) {
+                        SCRUBLOGE(pg_id, task_id, "{} to send {} scrub request to peer {}",
+                                  is_retry ? "retry failed" : "failed", scrub_type_name, peer_id);
+                    }
+                });
+        }
+    };
+
+    // Send initial requests to all peers
+    send_requests_to_remote_peers(all_member_peer_ids, false);
+
+    // Wait for all responses and retry if needed
+    if (!scrub_ctx->wait_for_all_req_sms(timeout)) {
+        for (uint32_t retry = 0; retry < max_retries; ++retry) {
+            auto peers_to_retry = scrub_ctx->get_peers_to_retry();
+            if (peers_to_retry.empty()) break;
+
+            SCRUBLOGD(pg_id, task_id, "Retrying {} scrub for {} peers", scrub_type_name, peers_to_retry.size());
+            send_requests_to_remote_peers(peers_to_retry, true);
+
+            if (scrub_ctx->wait_for_all_req_sms(timeout)) break;
+        }
+    }
+
+    // Check if cancelled or incomplete.
+    // Completeness is checked through get_peers_to_retry(), which takes the context lock, instead of reading
+    // peer_sm_map_.size() unlocked while followers may still be adding scrub maps.
+    if (scrub_ctx->is_cancelled() || !scrub_ctx->get_peers_to_retry().empty()) {
+        SCRUBLOGD(pg_id, task_id, "scrub task is cancelled or incomplete when scrubbing {}!", scrub_type_name);
+        return false;
+    }
+    return true;
+}
+
+// Run one scrub task as the coordinator of the pg:
+//   1. (deep scrub only) scrub the pg meta,
+//   2. scrub shards range by range,
+//   3. scrub blobs range by range.
+// Every step scrubs locally and on all other members, then merges the collected scrub maps into the report.
+// The task's promise is always fulfilled with the report built so far, also on early exit. The scrub superblk is
+// only updated when all steps complete.
+void ScrubManager::handle_pg_scrub_task(scrub_task task) {
+    // we handle deep and shallow scrub task in the same function to reduce code duplication.
+    // TODO:: separate them if the logic is very different in the future.
+
+    const auto& pg_id = task.pg_id;
+    const auto& task_id = task.task_id;
+    const auto is_deep_scrub = task.is_deep_scrub;
+    SCRUBLOGD(pg_id, task_id,
+              "Starting handling {} scrub task, last_scrub_time={} =====", is_deep_scrub ? "deep" : "shallow",
+              task.last_scrub_time);
+
+    std::shared_ptr< ShallowScrubReport > pg_scrub_report =
+        is_deep_scrub ? std::make_shared< DeepScrubReport >(pg_id) : std::make_shared< ShallowScrubReport >(pg_id);
+
+    // Cleanup that must run on every exit path of this function.
+    struct scrub_task_guard {
+        HSHomeObject* home_obj;
+        folly::ConcurrentHashMap< pg_id_t, std::shared_ptr< PGScrubContext > >& pg_scrub_ctx_map;
+        scrub_task& task;
+        std::shared_ptr< ShallowScrubReport >& scrub_report;
+        const pg_id_t& pg_id;
+        // set once this task has registered its own context in pg_scrub_ctx_map. Without it, a task rejected
+        // because another task is already running would erase the context of that running task.
+        bool owns_ctx{false};
+
+        ~scrub_task_guard() {
+            if (owns_ctx) { pg_scrub_ctx_map.erase(pg_id); }
+            task.scrub_report_promise->setValue(scrub_report);
+
+            // Clear SCRUBBING state from pg_state
+            auto hs_pg = home_obj->get_hs_pg(pg_id);
+            if (hs_pg) {
+                hs_pg->pg_state_.clear_state(PGStateMask::SCRUBBING);
+                LOGINFOMOD(scrubmgr, "cleared SCRUBBING state for pg={}", pg_id);
+            }
+        }
+    } guard{m_hs_home_object, m_pg_scrub_ctx_map, task, pg_scrub_report, pg_id};
+
+    const auto hs_pg = m_hs_home_object->get_hs_pg(pg_id);
+    if (!hs_pg) {
+        SCRUBLOGE(pg_id, task_id, "can not find hs_pg for this pg, fail this scrub task!");
+        return;
+    }
+
+    const auto& members = (hs_pg->pg_info_).members;
+    std::unordered_set< peer_id_t > all_member_peer_ids;
+    for (const auto& member : members) {
+        all_member_peer_ids.insert(member.id);
+    }
+
+    const auto& my_uuid = m_hs_home_object->our_uuid();
+    // TODO: the node is removed from the raft group? handle this case later
+    RELEASE_ASSERT(all_member_peer_ids.find(my_uuid) != all_member_peer_ids.end(),
+                   "my uuid={} is not in the member list of this pg, something is wrong!", my_uuid);
+
+    auto [ctx_it, happened] =
+        m_pg_scrub_ctx_map.try_emplace(pg_id, std::make_shared< PGScrubContext >(task_id, all_member_peer_ids));
+    if (!happened) {
+        SCRUBLOGE(pg_id, task_id, "a scrub task is already running for this pg, fail this {} scrub task!",
+                  is_deep_scrub ? "deep" : "shallow");
+        return;
+    }
+    guard.owns_ctx = true;
+
+    auto& scrub_ctx = ctx_it->second;
+    const auto& pg_repl_dev = hs_pg->repl_dev_;
+    auto scrub_lsn = pg_repl_dev->get_last_commit_lsn();
+
+    // Scrub timeout configuration (based on HDD random read performance, iops)
+    // Worst case scenario: 2GB data in a chunk, 4K (min blob size) random read, 7200 RPM HDD ~200 IOPS
+    // Total read operations: 2GB / 4KB = 524,288 reads
+    // Estimated time: 524,288 / 200 = 2,621 seconds ≈ 44 minutes
+    // so scrubbing the whole chunk at a time is not acceptable. we need to scrub blobs range by range, every range
+    // should have an acceptable timeout.
+
+    // TODO::make the following parameters configurable and find the optimal value based on real world scrub performance
+    // test. for hdd, iops matters more than throughput in scrubbing case.
+    constexpr uint32_t MAX_RETRIES = 5; // Maximum retry attempts
+    constexpr auto SM_REQUEST_TIMEOUT = std::chrono::seconds(10);
+
+    // Step 1: Scrub PG Meta (only for deep scrub)
+    if (is_deep_scrub) {
+        SCRUBLOGD(pg_id, task_id, "Starting PG meta scrub");
+        auto pg_meta_req = std::make_shared< base_scrub_req >(task_id, scrub_ctx->req_id.fetch_add(1), scrub_lsn,
+                                                              my_uuid, pg_id, true);
+        // TODO:: add a lock here to protect add_scrub_map when changing current_req.
+        scrub_ctx->current_req = pg_meta_req;
+        // Send requests to all peers
+        auto flatbuffer = pg_meta_req->build_flat_buffer();
+        sisl::io_blob_list_t req_blob_list;
+        const auto scrub_type = SCRUB_TYPE::PG_META;
+        req_blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false);
+        req_blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false);
+
+        // Scrub locally async (runs in parallel with remote requests)
+        m_scrub_req_executor->add([this, pg_meta_req, scrub_ctx, pg_id, task_id]() {
+            auto pg_meta_map = scrub_pg_meta(pg_meta_req);
+            if (!scrub_ctx->add_scrub_map(pg_meta_map)) {
+                SCRUBLOGE(pg_id, task_id, "failed to add local PG meta scrub map to context!");
+            } else {
+                SCRUBLOGD(pg_id, task_id, "Local PG meta scrub added");
+            }
+        });
+
+        // Send requests to all peers and wait for responses
+        if (!send_scrub_req_and_wait(pg_id, task_id, all_member_peer_ids, my_uuid, pg_repl_dev, req_blob_list,
+                                     scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT, "PG meta")) {
+            return;
+        }
+
+        // Merge PG meta scrub results
+        pg_scrub_report->merge(scrub_ctx->peer_sm_map_);
+        SCRUBLOGD(pg_id, task_id, "PG meta scrub completed");
+    }
+
+    // Step 2: Scrub Shard Range
+    SCRUBLOGD(pg_id, task_id, "Starting shard range {} scrub", is_deep_scrub ? "deep" : "shallow");
+    {
+        // we can not scrub all shards based on the shard sealed_lsn, especially after we have shard seal_lsn. since
+        // leader might lose some shard, so if we select the shard range seen by leader itself, we might miss those lost
+        // shards which exist on the follower , but not on leader. so for now, we select shard range based on the
+        // current shard_sequence_num_ at leader, which is the last shard_id in of the pg.
+
+        // we assume shard_id will not overflow uint64_t;
+        const auto last_shard_id = hs_pg->shard_sequence_num_;
+
+        // the key point here is that until we commit to scrub_lsn , we should at least see last_shard_id.
+        scrub_lsn = pg_repl_dev->get_last_commit_lsn();
+        SCRUBLOGD(pg_id, task_id, "Shard range: 0 to {}, scrub_lsn={}", last_shard_id, scrub_lsn);
+
+        // TODO:: make it configurable.
+        // a shard_id_t is uint64(8B). if we want the max size of a shard scrub map is 16MB, then the num of
+        // shard_it in a shard scrub map should be 16MB/8B=2M(2097152)
+        const auto shard_scrub_range_size = 2097152;
+
+        // Scrub shard range
+        // the first range is clamped to last_shard_id as well, like every following range, so that no request
+        // covers ids beyond what is guaranteed to be visible at scrub_lsn.
+        uint64_t shard_start = 0;
+        uint64_t shard_end = std::min< uint64_t >(shard_scrub_range_size, last_shard_id);
+        uint64_t shard_range_count = 0;
+        for (; shard_start <= last_shard_id;
+             shard_start = shard_end + 1, shard_end = std::min(shard_end + shard_scrub_range_size, last_shard_id)) {
+            ++shard_range_count;
+            SCRUBLOGD(pg_id, task_id, "Scrubbing shard range {}: [{}, {}]", shard_range_count, shard_start, shard_end);
+
+            auto shard_req = std::make_shared< shard_scrub_req >(task_id, scrub_ctx->req_id.fetch_add(1), scrub_lsn,
+                                                                 my_uuid, pg_id, shard_start, shard_end, is_deep_scrub);
+            scrub_ctx->reset_for_new_req();
+            scrub_ctx->current_req = shard_req;
+
+            // scrub locally async (runs in parallel with remote requests)
+            m_scrub_req_executor->add([this, shard_req, scrub_ctx, pg_id, task_id, is_deep_scrub]() {
+                auto scrub_map = local_scrub_shard(shard_req);
+                if (!scrub_ctx->add_scrub_map(scrub_map)) {
+                    SCRUBLOGE(pg_id, task_id, "failed to add local {} shard scrub map to context!",
+                              is_deep_scrub ? "deep" : "shallow");
+                } else {
+                    SCRUBLOGD(pg_id, task_id, "local {} shard scrub map added!", is_deep_scrub ? "deep" : "shallow");
+                }
+            });
+
+            // request remote peers to scrub this shard range and wait for responses
+            auto flatbuffer = shard_req->build_flat_buffer();
+            sisl::io_blob_list_t req_blob_list;
+            const auto scrub_type = is_deep_scrub ? SCRUB_TYPE::DEEP_SHARD : SCRUB_TYPE::SHALLOW_SHARD;
+            req_blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false);
+            req_blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false);
+
+            if (!send_scrub_req_and_wait(pg_id, task_id, all_member_peer_ids, my_uuid, pg_repl_dev, req_blob_list,
+                                         scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT, "shard")) {
+                SCRUBLOGE(pg_id, task_id, "shard scrub failed or was cancelled");
+                return;
+            }
+
+            SCRUBLOGD(pg_id, task_id, "Merging shard scrub results for range [{}, {}]", shard_start, shard_end);
+            pg_scrub_report->merge(scrub_ctx->peer_sm_map_);
+        }
+        SCRUBLOGD(pg_id, task_id, "shard scrub completed, total ranges scrubbed: {}", shard_range_count);
+    }
+
+    // Step 3: Scrub Blob Range
+    SCRUBLOGD(pg_id, task_id, "Starting blob range {} scrub", is_deep_scrub ? "deep" : "shallow");
+    {
+        // we assume blob_id will not overflow uint64_t;
+        const auto last_blob_id = hs_pg->get_last_blob_id();
+
+        // just like shard, the key point here is that until we commit to scrub_lsn , we can see last_blob_id.
+        scrub_lsn = pg_repl_dev->get_last_commit_lsn();
+        SCRUBLOGD(pg_id, task_id, "Blob range: 0 to {}, scrub_lsn={}", last_blob_id, scrub_lsn);
+
+        // For deep scrub: since we have a SM_REQUEST_TIMEOUT as scrub map request timeout. assuming the iops of a hdd
+        // is 200, and we want at most half of the time to be spent on io, so we have this blob range.
+        // For shallow scrub: we will not schedule io to disk, so we set blob scrub range the same as that of shard.
+        const auto blob_scrub_range_size = is_deep_scrub ? (HDD_IOPS * (SM_REQUEST_TIMEOUT.count() / 2)) : 2097152;
+
+        // Scrub blob range
+        // the first range is clamped to last_blob_id as well, like every following range.
+        uint64_t blob_start = 0;
+        uint64_t blob_end = std::min< uint64_t >(blob_scrub_range_size, last_blob_id);
+        uint64_t blob_range_count = 0;
+        for (; blob_start <= last_blob_id;
+             blob_start = blob_end + 1, blob_end = std::min(blob_end + blob_scrub_range_size, last_blob_id)) {
+            ++blob_range_count;
+            SCRUBLOGD(pg_id, task_id, "Scrubbing blob range {}: [{}, {}]", blob_range_count, blob_start, blob_end);
+
+            auto blob_req = std::make_shared< blob_scrub_req >(task_id, scrub_ctx->req_id.fetch_add(1), scrub_lsn,
+                                                               my_uuid, pg_id, blob_start, blob_end, is_deep_scrub);
+            scrub_ctx->reset_for_new_req();
+            scrub_ctx->current_req = blob_req;
+
+            // locally scrub this blob range async (runs in parallel with remote requests)
+            m_scrub_req_executor->add([this, blob_req, scrub_ctx, pg_id, task_id, is_deep_scrub]() {
+                auto scrub_map = local_scrub_blob(blob_req);
+                if (!scrub_ctx->add_scrub_map(scrub_map)) {
+                    SCRUBLOGE(pg_id, task_id, "failed to add local {} blob scrub map to context!",
+                              is_deep_scrub ? "deep" : "shallow");
+                } else {
+                    SCRUBLOGD(pg_id, task_id, "local {} blob scrub map added!", is_deep_scrub ? "deep" : "shallow");
+                }
+            });
+
+            // request remote peers to scrub this blob range and wait for responses
+            auto flatbuffer = blob_req->build_flat_buffer();
+            sisl::io_blob_list_t req_blob_list;
+            const auto scrub_type = is_deep_scrub ? SCRUB_TYPE::DEEP_BLOB : SCRUB_TYPE::SHALLOW_BLOB;
+            req_blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false);
+            req_blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false);
+
+            if (!send_scrub_req_and_wait(pg_id, task_id, all_member_peer_ids, my_uuid, pg_repl_dev, req_blob_list,
+                                         scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT, "blob")) {
+                SCRUBLOGE(pg_id, task_id, "blob scrub failed or was cancelled");
+                return;
+            }
+
+            SCRUBLOGD(pg_id, task_id, "Merging blob scrub results for range [{}, {}]", blob_start, blob_end);
+            pg_scrub_report->merge(scrub_ctx->peer_sm_map_);
+        }
+        SCRUBLOGD(pg_id, task_id, "blob scrub completed, total ranges scrubbed: {}", blob_range_count);
+    }
+
+    // only if pg is successfully scrubbed, we persist scrub metablk.
+    save_scrub_superblk(pg_id, is_deep_scrub, true);
+    SCRUBLOGD(pg_id, task_id, "successfully complete {} scrub task!", is_deep_scrub ? "deep" : "shallow");
+}
+
+// Register a pg with the scrub manager by making sure it has a scrub superblock. Does nothing when the pg is
+// unknown to the home object.
+void ScrubManager::add_pg(const pg_id_t pg_id) {
+    LOGINFOMOD(scrubmgr, "added new scrub superblock for pg={}", pg_id);
+    if (nullptr == m_hs_home_object->get_hs_pg(pg_id)) {
+        LOGINFOMOD(scrubmgr, "can not find pg={}!", pg_id);
+        return;
+    }
+
+    // to avoid create-pg log replay overriding existing scrub superblock, we only create new superblock when there is
+    // no existing one
+    save_scrub_superblk(pg_id, false, false);
+}
+
+// Forget a pg: cancel its running scrub task (if any), destroy its scrub superblock and drop its scrub context
+// and superblock map entries. Does nothing when the pg has no scrub superblock.
+void ScrubManager::remove_pg(const pg_id_t pg_id) {
+    auto it = m_pg_scrub_sb_map.find(pg_id);
+    if (it == m_pg_scrub_sb_map.end()) {
+        LOGINFOMOD(scrubmgr, "no scrub superblock found for pg={}, no need to remove", pg_id);
+        return;
+    }
+
+    LOGINFOMOD(scrubmgr, "removed pg={} in scrub manager!", pg_id);
+    cancel_scrub_task(pg_id);
+    it->second->destroy();
+    m_pg_scrub_ctx_map.erase(pg_id);
+    m_pg_scrub_sb_map.erase(it);
+}
+
+// this function is called in meta_service thread context and m_pg_scrub_sb_map_mtx
+// NOTE(review): the second half of the sentence above is incomplete in the original; presumably the caller holds
+// m_pg_scrub_sb_map_mtx - confirm.
+// Loads one persisted scrub superblock. If its pg no longer exists the superblock is handed back through
+// stale_pg_scrub_sbs; otherwise it is registered in m_pg_scrub_sb_map.
+void ScrubManager::on_pg_scrub_meta_blk_found(
+    sisl::byte_view const& buf, void* meta_cookie,
+    std::vector< homestore::superblk< pg_scrub_superblk > >& stale_pg_scrub_sbs) {
+    auto sb = std::make_shared< homestore::superblk< pg_scrub_superblk > >();
+    (*sb).load(buf, meta_cookie);
+    const auto pg_id = (*sb)->pg_id;
+
+    auto hs_pg = m_hs_home_object->get_hs_pg(pg_id);
+    if (!hs_pg) {
+        // this is a stale pg scrub superblock, we just log it and collect it in stale_pg_scrub_sbs.
+        LOGINFOMOD(scrubmgr, "can not find pg={}, destroy stale scrub superblock", pg_id);
+        stale_pg_scrub_sbs.emplace_back(std::move(*sb));
+        return;
+    }
+    // read the timestamps before `sb` is moved into the map
+    const auto last_deep_scrub_time = (*sb)->last_deep_scrub_timestamp;
+    const auto last_shallow_scrub_time = (*sb)->last_shallow_scrub_timestamp;
+
+    m_pg_scrub_sb_map.emplace(pg_id, std::move(sb));
+    LOGINFOMOD(scrubmgr, "loaded scrub superblock for pg={}, last_deep_scrub_time={}, last_shallow_scrub_time={}",
+               pg_id, last_deep_scrub_time, last_shallow_scrub_time);
+}
+
+// Create or update the persisted scrub superblock of a pg.
+//  - no superblock yet: create one with both the deep and the shallow scrub timestamp set to now and persist it
+//    (is_deep_scrub and force_update are ignored in this case).
+//  - superblock exists and force_update: set the deep or the shallow timestamp (per is_deep_scrub) to now and
+//    persist it.
+//  - superblock exists and !force_update: leave it untouched.
+void ScrubManager::save_scrub_superblk(const pg_id_t pg_id, const bool is_deep_scrub, bool force_update) {
+    const auto current_time =
+        std::chrono::duration_cast< std::chrono::seconds >(std::chrono::system_clock::now().time_since_epoch()).count();
+
+    auto it = m_pg_scrub_sb_map.find(pg_id);
+    if (it == m_pg_scrub_sb_map.end()) {
+        // Create new superblock for this PG
+        auto sb = std::make_shared< homestore::superblk< pg_scrub_superblk > >(pg_scrub_meta_name);
+        (*sb).create(sizeof(pg_scrub_superblk));
+        (*sb)->pg_id = pg_id;
+        // initialize BOTH timestamps; the shallow one used to be left unset because the deep one was assigned twice.
+        (*sb)->last_deep_scrub_timestamp = current_time;
+        (*sb)->last_shallow_scrub_timestamp = current_time;
+        (*sb).write();
+        m_pg_scrub_sb_map.emplace(pg_id, std::move(sb));
+        return;
+    }
+
+    if (force_update) {
+        // Update existing superblock
+        if (is_deep_scrub) {
+            (*(it->second))->last_deep_scrub_timestamp = current_time;
+        } else {
+            (*(it->second))->last_shallow_scrub_timestamp = current_time;
+        }
+        (*(it->second)).write();
+    } else {
+        LOGINFOMOD(scrubmgr, "skip updating scrub superblock for pg={} since there is no scrub progress update", pg_id);
+    }
+}
+
+// Return a copy of the persisted scrub superblock of a pg, or std::nullopt when the pg has none.
+std::optional< ScrubManager::pg_scrub_superblk > ScrubManager::get_scrub_superblk(const pg_id_t pg_id) const {
+    auto it = m_pg_scrub_sb_map.find(pg_id);
+    if (it == m_pg_scrub_sb_map.end()) {
+        LOGWARNMOD(scrubmgr, "scrub superblk not found for pg {}", pg_id);
+        return std::nullopt;
+    }
+
+    return *(*(it->second));
+}
+
+/* ScrubContext */
+// Record the scrub map of one peer for the request currently in flight and wake up the waiter.
+// Returns false (and ignores the map) when the map is null, comes from a peer that is not a member of the pg,
+// does not match current_req, or is a duplicate for its peer.
+bool ScrubManager::PGScrubContext::add_scrub_map(std::shared_ptr< ScrubManager::BaseScrubMap > bsm) {
+    if (!bsm) {
+        LOGWARNMOD(scrubmgr, "received null scrub map, ignore it!");
+        return false;
+    }
+
+    const auto& peer_id = bsm->peer_id;
+    const auto pg_id = bsm->pg_id;
+    if (member_peer_ids_.find(peer_id) == member_peer_ids_.end()) {
+        SCRUBLOGD(pg_id, task_id, "received scrub map from peer {} which is not in the pg member list, ignore it!",
+                  peer_id);
+        return false;
+    }
+
+    // captured under the lock so the log below does not read peer_sm_map_ concurrently with reset_for_new_req()
+    // or another add_scrub_map().
+    decltype(peer_sm_map_.size()) received_sm_count{0};
+    {
+        std::lock_guard lg(mtx_);
+        if (!bsm->match(current_req)) {
+            SCRUBLOGD(pg_id, task_id, "scrub map does not match up with current req, skip adding");
+            return false;
+        }
+        auto [_, happened] = peer_sm_map_.try_emplace(peer_id, bsm);
+        if (!happened) {
+            SCRUBLOGD(pg_id, task_id, "already received scrub map from peer {}, ignore the duplicated one!", peer_id);
+            return false;
+        }
+        received_sm_count = peer_sm_map_.size();
+        RELEASE_ASSERT(received_sm_count <= member_peer_ids_.size(),
+                       "received scrub map count {} should not exceed member peer count {}, something is wrong!",
+                       received_sm_count, member_peer_ids_.size());
+    }
+
+    // the map was updated under mtx_, so notifying after releasing the lock can not be missed by a waiter that
+    // evaluates its predicate under the same mutex.
+    cv_.notify_all();
+    SCRUBLOGD(pg_id, task_id, "added scrub map from peer {}, current received scrub map count {}/{}", peer_id,
+              received_sm_count, member_peer_ids_.size());
+    return true;
+}
+
+// Return the members of the pg from which no scrub map has been received for the current request.
+std::vector< peer_id_t > ScrubManager::PGScrubContext::get_peers_to_retry() const {
+    std::vector< peer_id_t > peers_to_retry;
+    std::lock_guard lg(mtx_);
+    for (const auto& peer_id : member_peer_ids_) {
+        if (peer_sm_map_.find(peer_id) == peer_sm_map_.end()) { peers_to_retry.push_back(peer_id); }
+    }
+
+    return peers_to_retry;
+}
+
+// wait until sms from all peers are received, or the task is cancelled, or timeout happens. if timeout happens, caller
+// can decide to retry or not.
+bool ScrubManager::PGScrubContext::wait_for_all_req_sms(std::chrono::milliseconds timeout) {
+    // return true means no need to wait and can proceed, false means timeout and need to retry.
+    if (cancelled) {
+        LOGINFOMOD(scrubmgr, "scrub task is cancelled, no need to wait for req sms!");
+        return true;
+    }
+
+    std::unique_lock lock(mtx_);
+    if (peer_sm_map_.size() == member_peer_ids_.size()) return true;
+
+    // receiving sm or task cancellation will notify this condition variable.
+    cv_.wait_for(lock, timeout, [this] { return cancelled || peer_sm_map_.size() == member_peer_ids_.size(); });
+
+    // if task is cancelled or all the req sms are received, we can proceed, otherwise it means timeout happens and we
+    // can retry for pending peers.
+    return cancelled || (peer_sm_map_.size() == member_peer_ids_.size());
+}
+
+// Mark the scrub task as cancelled and wake up anyone blocked in wait_for_all_req_sms().
+void ScrubManager::PGScrubContext::cancel() {
+    {
+        // set the flag under mtx_: otherwise a waiter that has just evaluated its predicate (cancelled == false) but
+        // has not blocked yet would miss the notification below and sleep until its timeout.
+        std::lock_guard lg(mtx_);
+        cancelled.store(true);
+    }
+    cv_.notify_all();
+}
+
+// Drop all collected scrub maps and the current request before a new request is issued.
+void ScrubManager::PGScrubContext::reset_for_new_req() {
+    std::lock_guard lg(mtx_);
+    peer_sm_map_.clear();
+    current_req.reset();
+}
+
+//=========================== Scrub Request Serialization/Deserialization ===========================//
+
+// base_scrub_req implementations
+// Serialize this request as a size-prefixed PgMetaScrubReq flatbuffer (ScrubInfo with the issuer uuid as 16 raw
+// bytes).
+flatbuffers::DetachedBuffer ScrubManager::base_scrub_req::build_flat_buffer() const {
+    flatbuffers::FlatBufferBuilder fb_builder;
+    // Prepare peer_id as UUID bytes
+    std::vector< uint8_t > peer_uuid_bytes(issuer_peer_id.data, issuer_peer_id.data + 16);
+    auto scrub_info_off =
+        CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes));
+    auto pg_meta_req_off = CreatePgMetaScrubReq(fb_builder, scrub_info_off);
+    FinishSizePrefixedPgMetaScrubReqBuffer(fb_builder, pg_meta_req_off);
+    return fb_builder.Release();
+}
+
+// Deserialize a request produced by build_flat_buffer().
+// Returns false - leaving this object untouched - when the buffer is empty, can not be parsed, has no scrub_info,
+// or carries a missing/malformed issuer uuid (the reply could not be routed without it).
+// NOTE(review): the buffer comes from a remote peer and is not run through a flatbuffers::Verifier here; consider
+// verifying it once the exact buf_size contract of the callers is confirmed.
+bool ScrubManager::base_scrub_req::load(uint8_t const* buf_ptr, const uint32_t buf_size) {
+    if (!buf_ptr || buf_size == 0) {
+        LOGERROR("Invalid buffer for base_scrub_req deserialization");
+        return false;
+    }
+
+    auto fb_req = GetSizePrefixedPgMetaScrubReq(buf_ptr);
+    if (!fb_req) {
+        LOGERROR("Failed to parse base_scrub_req from buffer");
+        return false;
+    }
+
+    auto scrub_info = fb_req->scrub_info();
+    if (!scrub_info) {
+        LOGERROR("Missing scrub_info in base_scrub_req");
+        return false;
+    }
+
+    // validate everything before mutating any field, so a failed load never leaves a half-populated request
+    auto issuer_uuid_bytes = scrub_info->issuer_uuid();
+    if (!issuer_uuid_bytes || issuer_uuid_bytes->size() != 16) {
+        LOGERROR("Missing or malformed issuer_uuid in base_scrub_req");
+        return false;
+    }
+
+    pg_id = scrub_info->pg_id();
+    task_id = scrub_info->task_id();
+    req_id = scrub_info->req_id();
+    scrub_lsn = scrub_info->scrub_lsn();
+
+    // Load peer_id from issuer_uuid
+    std::memcpy(issuer_peer_id.data, issuer_uuid_bytes->data(), 16);
+
+    return true;
+}
+
+// blob_scrub_req implementations
+flatbuffers::DetachedBuffer 
ScrubManager::blob_scrub_req::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(issuer_peer_id.data, issuer_peer_id.data + 16); + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + auto blob_req_off = CreateBlobScrubReq(fb_builder, scrub_info_off, start, end, is_deep_scrub_); + FinishSizePrefixedBlobScrubReqBuffer(fb_builder, blob_req_off); + return fb_builder.Release(); +} + +bool ScrubManager::blob_scrub_req::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for blob_scrub_req deserialization"); + return false; + } + + auto fb_req = GetSizePrefixedBlobScrubReq(buf_ptr); + if (!fb_req) { + LOGERROR("Failed to parse blob_scrub_req from buffer"); + return false; + } + + auto scrub_info = fb_req->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in blob_scrub_req"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + std::memcpy(issuer_peer_id.data, issuer_uuid_bytes->data(), 16); + } + + // Load start and end blob_id + start = fb_req->start(); + end = fb_req->end(); + + is_deep_scrub_ = fb_req->isdeepscrub(); + + return true; +} + +// shard_scrub_req implementations +flatbuffers::DetachedBuffer ScrubManager::shard_scrub_req::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(issuer_peer_id.data, issuer_peer_id.data + 16); + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + 
auto shard_req_off = CreateShardScrubReq(fb_builder, scrub_info_off, start, end, is_deep_scrub_); + FinishSizePrefixedShardScrubReqBuffer(fb_builder, shard_req_off); + return fb_builder.Release(); +} + +bool ScrubManager::shard_scrub_req::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for shard_scrub_req deserialization"); + return false; + } + + auto fb_req = GetSizePrefixedShardScrubReq(buf_ptr); + if (!fb_req) { + LOGERROR("Failed to parse shard_scrub_req from buffer"); + return false; + } + + auto scrub_info = fb_req->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in shard_scrub_req"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + std::memcpy(issuer_peer_id.data, issuer_uuid_bytes->data(), 16); + } + + start = fb_req->start(); + end = fb_req->end(); + is_deep_scrub_ = fb_req->isdeepscrub(); + + return true; +} + +//=========================== Scrub Map Serialization/Deserialization ===========================// + +// DeepBlobScrubMap implementations +flatbuffers::DetachedBuffer ScrubManager::DeepBlobScrubMap::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(16); + peer_uuid_bytes.assign(peer_id.begin(), peer_id.end()); + // Create scrub_info + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + // Create deep blob scrub result entries + std::vector< flatbuffers::Offset< DeepBlobScrubResultEntry > > result_entries; + for (const auto& [blob_route, scrub_result_variant] : blobs) { + auto blob_key_off = CreateBlobKey(fb_builder, 
blob_route.shard, blob_route.blob); + + homeobject::ScrubValue scrub_value_type; + flatbuffers::Offset< void > scrub_value_off; + if (std::holds_alternative< ScrubResult >(scrub_result_variant)) { + // It's a ScrubResult + auto result = std::get< ScrubResult >(scrub_result_variant); + scrub_value_type = homeobject::ScrubValue::ScrubResultValue; + scrub_value_off = CreateScrubResultValue(fb_builder, result).Union(); + } else { + // It's a BlobHashArray + const auto& hash_array = std::get< BlobHashArray >(scrub_result_variant); + std::vector< uint8_t > hash_vec(hash_array.begin(), hash_array.end()); + scrub_value_type = homeobject::ScrubValue::HashValue; + scrub_value_off = CreateHashValueDirect(fb_builder, &hash_vec).Union(); + } + + result_entries.push_back( + CreateDeepBlobScrubResultEntry(fb_builder, blob_key_off, scrub_value_type, scrub_value_off)); + } + auto results_vec_off = fb_builder.CreateVector(result_entries); + auto deep_blob_map_off = CreateDeepBlobScrubMap(fb_builder, scrub_info_off, start, end, results_vec_off); + FinishSizePrefixedDeepBlobScrubMapBuffer(fb_builder, deep_blob_map_off); + + return fb_builder.Release(); +} + +bool ScrubManager::DeepBlobScrubMap::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for DeepBlobScrubMap deserialization"); + return false; + } + + auto fb_map = GetSizePrefixedDeepBlobScrubMap(buf_ptr); + if (!fb_map) { + LOGERROR("Failed to parse DeepBlobScrubMap from buffer"); + return false; + } + + // Load scrub_info + auto scrub_info = fb_map->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in DeepBlobScrubMap"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + 
std::memcpy(peer_id.data, issuer_uuid_bytes->data(), 16); + } + + // Load start and end blob_id + start = fb_map->start(); + end = fb_map->end(); + + // Load blob results + blobs.clear(); + auto results = fb_map->deep_blob_scrub_results(); + if (results) { + for (const auto* entry : *results) { + if (!entry || !entry->blob_key()) continue; + + BlobRoute blob_route(entry->blob_key()->shard_id(), entry->blob_key()->blob_id()); + + auto scrub_value_type = entry->scrub_result_type(); + if (scrub_value_type == ScrubValue::ScrubResultValue) { + auto result_value = static_cast< const ScrubResultValue* >(entry->scrub_result()); + blobs[blob_route] = result_value->result(); + } else if (scrub_value_type == ScrubValue::HashValue) { + auto hash_value = static_cast< const HashValue* >(entry->scrub_result()); + BlobHashArray hash_array; + if (hash_value->hash() && hash_value->hash()->size() <= blob_max_hash_len) { + std::memcpy(hash_array.data(), hash_value->hash()->data(), hash_value->hash()->size()); + } + blobs[blob_route] = hash_array; + } + } + } + + return true; +} + +// ShallowBlobScrubMap implementations +flatbuffers::DetachedBuffer ScrubManager::ShallowBlobScrubMap::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(16); + peer_uuid_bytes.assign(peer_id.begin(), peer_id.end()); + // Create scrub_info + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + // Create blob keys vector + std::vector< flatbuffers::Offset< BlobKey > > blob_keys; + for (const auto& blob_route : blobs) { + blob_keys.push_back(CreateBlobKey(fb_builder, blob_route.shard, blob_route.blob)); + } + auto blobs_vec_off = fb_builder.CreateVector(blob_keys); + auto shallow_blob_map_off = CreateShallowBlobScrubMap(fb_builder, scrub_info_off, start, end, blobs_vec_off); + FinishSizePrefixedShallowBlobScrubMapBuffer(fb_builder, 
shallow_blob_map_off); + return fb_builder.Release(); +} + +bool ScrubManager::ShallowBlobScrubMap::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for ShallowBlobScrubMap deserialization"); + return false; + } + + auto fb_map = GetSizePrefixedShallowBlobScrubMap(buf_ptr); + if (!fb_map) { + LOGERROR("Failed to parse ShallowBlobScrubMap from buffer"); + return false; + } + + // Load scrub_info + auto scrub_info = fb_map->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in ShallowBlobScrubMap"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + std::memcpy(peer_id.data, issuer_uuid_bytes->data(), 16); + } + + // Load start and end blob_id + start = fb_map->start(); + end = fb_map->end(); + + // Load blob routes + blobs.clear(); + auto blob_keys = fb_map->blobs(); + if (blob_keys) { + for (const auto* blob_key : *blob_keys) { + if (!blob_key) continue; + blobs.insert(BlobRoute(blob_key->shard_id(), blob_key->blob_id())); + } + } + + return true; +} + +// ShallowShardScrubMap implementations +flatbuffers::DetachedBuffer ScrubManager::ShallowShardScrubMap::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(16); + peer_uuid_bytes.assign(peer_id.begin(), peer_id.end()); + // Create scrub_info + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + // Note: ShallowShardScrubMap doesn't have start/end in flatbuffer schema + // Create shard ids vector + std::vector< shard_id_t > shard_ids_vec; + for (const auto& shard_id : shards) { + 
shard_ids_vec.push_back(shard_id); // Assuming BlobRoute.shard is the shard_id + } + auto shards_vec_off = fb_builder.CreateVector(shard_ids_vec); + auto shallow_shard_map_off = CreateShallowShardScrubMap(fb_builder, scrub_info_off, 0, 0, shards_vec_off); + FinishSizePrefixedShallowShardScrubMapBuffer(fb_builder, shallow_shard_map_off); + return fb_builder.Release(); +} + +bool ScrubManager::ShallowShardScrubMap::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for ShallowShardScrubMap deserialization"); + return false; + } + + auto fb_map = GetSizePrefixedShallowShardScrubMap(buf_ptr); + if (!fb_map) { + LOGERROR("Failed to parse ShallowShardScrubMap from buffer"); + return false; + } + + // Load scrub_info + auto scrub_info = fb_map->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in ShallowShardScrubMap"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + std::memcpy(peer_id.data, issuer_uuid_bytes->data(), 16); + } + + // Load shard ids + shards.clear(); + auto shard_ids = fb_map->shards(); + if (shard_ids) { + for (auto shard_id : *shard_ids) { + shards.insert(shard_id); + } + } + + return true; +} + +// DeepShardScrubMap implementations +flatbuffers::DetachedBuffer ScrubManager::DeepShardScrubMap::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(16); + peer_uuid_bytes.assign(peer_id.begin(), peer_id.end()); + // Create scrub_info + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + // Create shallow shard scrub map (base class data) + 
std::vector< uint64_t > shard_ids_vec; + for (const auto& shard_id : shards) { + shard_ids_vec.push_back(shard_id); + } + auto shards_vec_off = fb_builder.CreateVector(shard_ids_vec); + auto shallow_shard_map_off = CreateShallowShardScrubMap(fb_builder, scrub_info_off, 0, 0, shards_vec_off); + // Create problematic shards entries + std::vector< flatbuffers::Offset< DeepShardScrubResultEntry > > result_entries; + for (const auto& [shard_id, scrub_result] : problematic_shards) { + result_entries.push_back(CreateDeepShardScrubResultEntry(fb_builder, shard_id, scrub_result)); + } + auto results_vec_off = fb_builder.CreateVector(result_entries); + auto deep_shard_map_off = CreateDeepShardScrubMap(fb_builder, shallow_shard_map_off, results_vec_off); + FinishSizePrefixedDeepShardScrubMapBuffer(fb_builder, deep_shard_map_off); + + return fb_builder.Release(); +} + +bool ScrubManager::DeepShardScrubMap::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for DeepShardScrubMap deserialization"); + return false; + } + + auto fb_map = GetSizePrefixedDeepShardScrubMap(buf_ptr); + if (!fb_map) { + LOGERROR("Failed to parse DeepShardScrubMap from buffer"); + return false; + } + + // Load shallow shard scrub map (base class data) + auto shallow_map = fb_map->shallow_map(); + if (!shallow_map) { + LOGERROR("Missing shallow_map in DeepShardScrubMap"); + return false; + } + + // Load scrub_info + auto scrub_info = shallow_map->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in DeepShardScrubMap"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + std::memcpy(peer_id.data, issuer_uuid_bytes->data(), 16); + } + + // Load shard ids from shallow 
map + shards.clear(); + auto shard_ids = shallow_map->shards(); + if (shard_ids) { + for (auto shard_id : *shard_ids) { + shards.insert(shard_id); + } + } + + // Load problematic shards + problematic_shards.clear(); + auto results = fb_map->problematic_shards(); + if (results) { + for (const auto* entry : *results) { + if (!entry) continue; + problematic_shards[entry->shard_id()] = entry->result(); + } + } + + return true; +} + +flatbuffers::DetachedBuffer ScrubManager::PGMetaScrubMap::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder fb_builder; + // Prepare peer_id as UUID bytes + std::vector< uint8_t > peer_uuid_bytes(16); + peer_uuid_bytes.assign(peer_id.begin(), peer_id.end()); + // Create scrub_info + auto scrub_info_off = + CreateScrubInfo(fb_builder, pg_id, task_id, req_id, scrub_lsn, fb_builder.CreateVector(peer_uuid_bytes)); + auto pg_meta_map_off = CreatePGMetaScrubMap(fb_builder, scrub_info_off, pg_meta_scrub_result); + FinishSizePrefixedPGMetaScrubMapBuffer(fb_builder, pg_meta_map_off); + return fb_builder.Release(); +} + +bool ScrubManager::PGMetaScrubMap::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERROR("Invalid buffer for PGMetaScrubMap deserialization"); + return false; + } + + auto fb_map = GetSizePrefixedPGMetaScrubMap(buf_ptr); + if (!fb_map) { + LOGERROR("Failed to parse PGMetaScrubMap from buffer"); + return false; + } + + // Load scrub_info + auto scrub_info = fb_map->scrub_info(); + if (!scrub_info) { + LOGERROR("Missing scrub_info in PGMetaScrubMap"); + return false; + } + + pg_id = scrub_info->pg_id(); + task_id = scrub_info->task_id(); + req_id = scrub_info->req_id(); + scrub_lsn = scrub_info->scrub_lsn(); + + // Load peer_id from issuer_uuid + auto issuer_uuid_bytes = scrub_info->issuer_uuid(); + if (issuer_uuid_bytes && issuer_uuid_bytes->size() == 16) { + std::memcpy(peer_id.data, issuer_uuid_bytes->data(), 16); + } + + // Load PG meta scrub result + pg_meta_scrub_result = 
fb_map->pg_meta_scrub_result(); + + return true; +} + +//=========================== Scrub Report Merge Functions ===========================// + +void ScrubManager::ShallowScrubReport::print() const { + std::stringstream ss; + ss << "ShallowScrubReport for pg=" << pg_id_ << " | "; + + // Report missing shards + ss << "MissingShards={"; + for (const auto& [peer_id, shard_set] : missing_shard_ids) { + ss << "peer=" << peer_id << ":["; + bool first = true; + for (const auto& shard_id : shard_set) { + if (!first) ss << ","; + ss << shard_id; + first = false; + } + ss << "] "; + } + ss << "} | "; + + // Report missing blobs + ss << "MissingBlobs={"; + for (const auto& [peer_id, blob_set] : missing_blobs) { + ss << "peer=" << peer_id << ":["; + bool first = true; + for (const auto& blob_route : blob_set) { + if (!first) ss << ","; + ss << fmt::format("{}", blob_route); + first = false; + } + ss << "] "; + } + ss << "}"; + + LOGINFOMOD(scrubmgr, "{}", ss.str()); +} + +void ScrubManager::ShallowScrubReport::merge( + const std::map< peer_id_t, std::shared_ptr< BaseScrubMap > >& peer_sm_map) { + if (peer_sm_map.empty()) { + LOGWARNMOD(scrubmgr, "[pg={}] No scrub maps to merge", pg_id_); + return; + } + + // Collect all blobs and shards from all peers + std::map< BlobRoute, std::set< peer_id_t > > blob_peers_map; // blob -> set of peers that have it + std::map< shard_id_t, std::set< peer_id_t > > shard_peers_map; // shard -> set of peers that have it + + for (const auto& [peer_id, scrub_map] : peer_sm_map) { + if (!scrub_map) { + LOGWARNMOD(scrubmgr, "[pg={}] Null scrub map from peer {}", pg_id_, peer_id); + continue; + } + + // Handle ShallowBlobScrubMap + auto shallow_blob_map = std::dynamic_pointer_cast< ShallowBlobScrubMap >(scrub_map); + if (shallow_blob_map) { + for (const auto& blob_route : shallow_blob_map->blobs) { + blob_peers_map[blob_route].insert(peer_id); + } + continue; + } + + // Handle DeepBlobScrubMap (also contains blob list) + auto deep_blob_map = 
std::dynamic_pointer_cast< DeepBlobScrubMap >(scrub_map); + if (deep_blob_map) { + for (const auto& [blob_route, _] : deep_blob_map->blobs) { + blob_peers_map[blob_route].insert(peer_id); + } + continue; + } + + // Handle ShallowShardScrubMap + auto shallow_shard_map = std::dynamic_pointer_cast< ShallowShardScrubMap >(scrub_map); + if (shallow_shard_map) { + for (const auto& shard_id : shallow_shard_map->shards) { + shard_peers_map[shard_id].insert(peer_id); + } + continue; + } + + // Handle DeepShardScrubMap (inherits from ShallowShardScrubMap) + auto deep_shard_map = std::dynamic_pointer_cast< DeepShardScrubMap >(scrub_map); + if (deep_shard_map) { + for (const auto& shard_id : deep_shard_map->shards) { + shard_peers_map[shard_id].insert(peer_id); + } + continue; + } + } + + // Determine which blobs are missing on which peers + // A blob is considered missing on a peer if it appears on other peers but not this one + for (const auto& [blob_route, peer_set] : blob_peers_map) { + // If not all peers have this blob, some are missing it + if (peer_set.size() < peer_sm_map.size()) { + for (const auto& [peer_id, _] : peer_sm_map) { + if (peer_set.find(peer_id) == peer_set.end()) { + // This peer is missing the blob + add_missing_blob(blob_route, peer_id); + } + } + } + } + + // Determine which shards are missing on which peers + for (const auto& [shard_id, peer_set] : shard_peers_map) { + if (peer_set.size() < peer_sm_map.size()) { + for (const auto& [peer_id, _] : peer_sm_map) { + if (peer_set.find(peer_id) == peer_set.end()) { + // This peer is missing the shard + add_missing_shard(shard_id, peer_id); + } + } + } + } + + // Count total missing blobs and shards across all peers + size_t total_missing_blobs = 0; + for (const auto& [peer_id, blobs] : missing_blobs) { + total_missing_blobs += blobs.size(); + } + size_t total_missing_shards = 0; + for (const auto& [peer_id, shards] : missing_shard_ids) { + total_missing_shards += shards.size(); + } + LOGINFOMOD(scrubmgr, + 
"[pg={}] Shallow scrub merge completed: {} peers with missing blobs (total {} blobs), {} peers with " + "missing shards (total {} shards)", + pg_id_, missing_blobs.size(), total_missing_blobs, missing_shard_ids.size(), total_missing_shards); +} + +void ScrubManager::DeepScrubReport::print() const { + std::stringstream ss; + ss << "DeepScrubReport for pg=" << pg_id_ << " | "; + + // Report missing shards (from ShallowScrubReport) + ss << "MissingShards={"; + for (const auto& [peer_id, shard_set] : missing_shard_ids) { + ss << "peer=" << peer_id << ":["; + bool first = true; + for (const auto& shard_id : shard_set) { + if (!first) ss << ","; + ss << shard_id; + first = false; + } + ss << "] "; + } + ss << "} | "; + + // Report missing blobs (from ShallowScrubReport) + ss << "MissingBlobs={"; + for (const auto& [peer_id, blob_set] : missing_blobs) { + ss << "peer=" << peer_id << ":["; + bool first = true; + for (const auto& blob_route : blob_set) { + if (!first) ss << ","; + ss << fmt::format("{}", blob_route); + first = false; + } + ss << "] "; + } + ss << "} | "; + + // Report corrupted blobs + ss << "CorruptedBlobs={"; + for (const auto& [peer_id, blob_map] : corrupted_blobs) { + ss << "peer=" << peer_id << ":["; + bool first = true; + for (const auto& [blob_route, scrub_result] : blob_map) { + if (!first) ss << ","; + ss << fmt::format("{}", blob_route) << "(" << SCRUB_RESULT_STRING(scrub_result) << ")"; + first = false; + } + ss << "] "; + } + ss << "} | "; + + // Report corrupted shards + ss << "CorruptedShards={"; + for (const auto& [peer_id, shard_map] : corrupted_shards) { + ss << "peer=" << peer_id << ":["; + bool first = true; + for (const auto& [shard_id, scrub_result] : shard_map) { + if (!first) ss << ","; + ss << shard_id << "(" << SCRUB_RESULT_STRING(scrub_result) << ")"; + first = false; + } + ss << "] "; + } + ss << "} | "; + + // Report inconsistent blobs (different hashes across replicas) + ss << "InconsistentBlobs={"; + for (const auto& 
[blob_route, peer_hash_map] : inconsistent_blobs) { + ss << fmt::format("{}", blob_route); + bool first = true; + for (const auto& [peer_id, hash] : peer_hash_map) { + if (!first) ss << ","; + ss << "peer=" << peer_id << "(hash="; + // Print first 8 bytes of hash for brevity + for (size_t i = 0; i < std::min(size_t(8), hash.size()); ++i) { + ss << fmt::format("{:02x}", hash[i]); + } + ss << ")"; + first = false; + } + ss << "] "; + } + ss << "} | "; + + // Report corrupted PG metadata + ss << "CorruptedPGMeta={"; + bool first = true; + for (const auto& [peer_id, scrub_result] : corrupted_pg_metas) { + if (!first) ss << ","; + ss << "peer=" << peer_id << "(" << SCRUB_RESULT_STRING(scrub_result) << ")"; + first = false; + } + ss << "}"; + + LOGINFOMOD(scrubmgr, "{}", ss.str()); +} + +void ScrubManager::DeepScrubReport::merge(const std::map< peer_id_t, std::shared_ptr< BaseScrubMap > >& peer_sm_map) { + // First do shallow merge to find missing blobs/shards + ShallowScrubReport::merge(peer_sm_map); + + if (peer_sm_map.empty()) { return; } + + // Now do deep scrub specific comparisons + std::map< BlobRoute, std::map< peer_id_t, std::variant< ScrubResult, BlobHashArray > > > blob_results_map; + std::map< shard_id_t, std::map< peer_id_t, ScrubResult > > shard_results_map; + std::map< peer_id_t, ScrubResult > pg_meta_results_map; + + // Collect all deep scrub results + for (const auto& [peer_id, scrub_map] : peer_sm_map) { + if (!scrub_map) continue; + + // Handle DeepBlobScrubMap + auto deep_blob_map = std::dynamic_pointer_cast< DeepBlobScrubMap >(scrub_map); + if (deep_blob_map) { + for (const auto& [blob_route, result_variant] : deep_blob_map->blobs) { + blob_results_map[blob_route][peer_id] = result_variant; + } + continue; + } + + // Handle DeepShardScrubMap + auto deep_shard_map = std::dynamic_pointer_cast< DeepShardScrubMap >(scrub_map); + if (deep_shard_map) { + for (const auto& [shard_id, scrub_result] : deep_shard_map->problematic_shards) { + 
shard_results_map[shard_id][peer_id] = scrub_result; + } + continue; + } + + // Handle PGMetaScrubMap + auto pg_meta_map = std::dynamic_pointer_cast< PGMetaScrubMap >(scrub_map); + if (pg_meta_map) { + if (pg_meta_map->pg_meta_scrub_result != ScrubResult::NONE) { + pg_meta_results_map[peer_id] = pg_meta_map->pg_meta_scrub_result; + } + continue; + } + } + + // Analyze blob results + for (const auto& [blob_route, peer_results] : blob_results_map) { + std::map< peer_id_t, BlobHashArray > hash_map; + bool has_error = false; + + for (const auto& [peer_id, result_variant] : peer_results) { + if (std::holds_alternative< ScrubResult >(result_variant)) { + // This peer has an error (IO_ERROR, MISMATCH, NOT_FOUND) + auto scrub_result = std::get< ScrubResult >(result_variant); + add_corrupted_blob(peer_id, blob_route, scrub_result); + has_error = true; + } else { + // This peer has a valid hash + hash_map[peer_id] = std::get< BlobHashArray >(result_variant); + } + } + + // Check for hash inconsistencies among peers with valid hashes + if (!has_error && hash_map.size() > 1) { + // Compare all hashes + BlobHashArray reference_hash; + peer_id_t reference_peer; + bool first = true; + bool hashes_consistent = true; + + for (const auto& [peer_id, hash] : hash_map) { + if (first) { + reference_hash = hash; + reference_peer = peer_id; + first = false; + } else { + if (std::memcmp(reference_hash.data(), hash.data(), blob_max_hash_len) != 0) { + hashes_consistent = false; + break; + } + } + } + + // If hashes are inconsistent, record all of them + if (!hashes_consistent) { + for (const auto& [peer_id, hash] : hash_map) { + add_inconsistent_blob(blob_route, peer_id, hash); + } + } + } + } + + // Analyze shard results + for (const auto& [shard_id, peer_results] : shard_results_map) { + for (const auto& [peer_id, scrub_result] : peer_results) { + if (scrub_result != ScrubResult::NONE) { add_corrupted_shard(peer_id, shard_id, scrub_result); } + } + } + + // Record PG meta errors + for 
(const auto& [peer_id, scrub_result] : pg_meta_results_map) { + add_corrupted_pg_meta(peer_id, scrub_result); + } + + LOGINFOMOD(scrubmgr, + "[pg={}] Deep scrub merge completed: {} corrupted blobs, {} inconsistent blobs, " + "{} corrupted shards, {} corrupted pg metas", + pg_id_, corrupted_blobs.size(), inconsistent_blobs.size(), corrupted_shards.size(), + corrupted_pg_metas.size()); +} + +} // namespace homeobject \ No newline at end of file diff --git a/src/lib/homestore_backend/scrub_manager.hpp b/src/lib/homestore_backend/scrub_manager.hpp new file mode 100644 index 000000000..771ec5263 --- /dev/null +++ b/src/lib/homestore_backend/scrub_manager.hpp @@ -0,0 +1,471 @@ +#pragma once + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include +#include +#include +#pragma GCC diagnostic pop + +#include +#include "homeobject/common.hpp" +#include +#include +#include "lib/blob_route.hpp" +#include "MPMCPriorityQueue.hpp" +#include "generated/scrub_common_generated.h" + +namespace homeobject { + +class HSHomeObject; + +ENUM(SCRUB_TRIGGER_TYPE, uint8_t, PERIODICALLY = 0, MANUALLY); +ENUM(SCRUB_TYPE, uint8_t, PG_META = 0, DEEP_SHARD, SHALLOW_SHARD, DEEP_BLOB, SHALLOW_BLOB); + +class ScrubManager { +public: + ScrubManager(HSHomeObject* homeobject); + ~ScrubManager(); + + // Disallow copy and move + ScrubManager(const ScrubManager&) = delete; + ScrubManager(ScrubManager&&) = delete; + ScrubManager& operator=(const ScrubManager&) = delete; + ScrubManager& operator=(ScrubManager&&) = delete; + +public: + inline static auto const pg_scrub_meta_name = std::string("PG_SCRUB"); + static constexpr uint64_t blob_max_hash_len = 32; + using BlobHashArray = std::array< uint8_t, blob_max_hash_len >; + using chunk_id_t = homestore::chunk_num_t; + // TODO: persist this into metablk. 
+ inline static atomic_uint64_t scrub_task_id{1}; + + // pg scrub superblk +#pragma pack(1) + struct pg_scrub_superblk { + uint64_t last_deep_scrub_timestamp; + uint64_t last_shallow_scrub_timestamp; + pg_id_t pg_id; + static std::string name() { return pg_scrub_meta_name; } + }; +#pragma pack() + + // scrub req +public: + class base_scrub_req { + public: + base_scrub_req() = default; + base_scrub_req(uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t issuer_peer_id, pg_id_t pg_id, + bool is_deep_scrub) : + task_id(task_id), + req_id(req_id), + scrub_lsn(scrub_lsn), + issuer_peer_id(issuer_peer_id), + pg_id(pg_id), + is_deep_scrub_(is_deep_scrub) {} + + bool is_deep_scrub() const { return is_deep_scrub_; } + + virtual ~base_scrub_req() = default; + virtual SCRUB_TYPE get_scrub_type() const { return SCRUB_TYPE::PG_META; } + virtual flatbuffers::DetachedBuffer build_flat_buffer() const; + virtual bool load(uint8_t const* buf_ptr, const uint32_t buf_size); + + public: + uint64_t task_id; + uint64_t req_id; + int64_t scrub_lsn; + peer_id_t issuer_peer_id; + pg_id_t pg_id; + bool is_deep_scrub_; + }; + + class blob_scrub_req : public base_scrub_req { + public: + blob_scrub_req() = default; + blob_scrub_req(uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t issuer_peer_id, pg_id_t pg_id, + blob_id_t start, blob_id_t end, bool is_deep_scrub) : + base_scrub_req(task_id, req_id, scrub_lsn, issuer_peer_id, pg_id, is_deep_scrub), + start(start), + end(end) {} + ~blob_scrub_req() = default; + + SCRUB_TYPE get_scrub_type() const override { + return is_deep_scrub() ? 
SCRUB_TYPE::DEEP_BLOB : SCRUB_TYPE::SHALLOW_BLOB; + } + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + + public: + blob_id_t start; + blob_id_t end; + }; + + class shard_scrub_req : public base_scrub_req { + public: + shard_scrub_req() = default; + shard_scrub_req(uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t issuer_peer_id, pg_id_t pg_id, + uint64_t start, uint64_t end, bool is_deep_scrub) : + base_scrub_req(task_id, req_id, scrub_lsn, issuer_peer_id, pg_id, is_deep_scrub), + start(start), + end(end) {} + ~shard_scrub_req() = default; + + SCRUB_TYPE get_scrub_type() const override { + return is_deep_scrub() ? SCRUB_TYPE::DEEP_SHARD : SCRUB_TYPE::SHALLOW_SHARD; + } + + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + + public: + uint64_t start; + uint64_t end; + }; + + // scrub map, the scrub result of a specific scrub. +public: + class BaseScrubMap { + public: + BaseScrubMap() = default; + BaseScrubMap(pg_id_t pg_id, uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t peer_id) : + pg_id(pg_id), task_id(task_id), req_id(req_id), scrub_lsn(scrub_lsn), peer_id(peer_id) {} + virtual ~BaseScrubMap() = default; + + public: + // convert the scrub map to io_blob_list for sending through data rpc + virtual flatbuffers::DetachedBuffer build_flat_buffer() const = 0; + virtual bool load(uint8_t const* buf_ptr, const uint32_t buf_size) = 0; + virtual SCRUB_TYPE get_scrub_type() const = 0; + + bool match(std::shared_ptr< base_scrub_req > req) const { + if (!req) return false; + + // TODO:: add more logic to check. for example, adding a random sha256 for each req in a scrub task. 
+ return pg_id == req->pg_id && task_id == req->task_id && req_id == req->req_id && + scrub_lsn == req->scrub_lsn && get_scrub_type() == req->get_scrub_type(); + } + + public: + pg_id_t pg_id; + uint64_t task_id; + uint64_t req_id; + int64_t scrub_lsn; + peer_id_t peer_id; + }; + + class DeepBlobScrubMap : public BaseScrubMap { + public: + DeepBlobScrubMap() = default; + DeepBlobScrubMap(pg_id_t pg_id, uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t peer_id, + blob_id_t start, blob_id_t end) : + BaseScrubMap(pg_id, task_id, req_id, scrub_lsn, peer_id), start(start), end(end) {} + + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + SCRUB_TYPE get_scrub_type() const override { return SCRUB_TYPE::DEEP_BLOB; } + + void add_blob_result(const BlobRoute& blob_route, std::variant< ScrubResult, BlobHashArray > scrub_result) { + blobs[blob_route] = scrub_result; + } + + public: + blob_id_t start; // inclusive + blob_id_t end; // exclusive + std::map< BlobRoute, std::variant< ScrubResult, BlobHashArray > > blobs; + }; + + class ShallowBlobScrubMap : public BaseScrubMap { + public: + ShallowBlobScrubMap() = default; + ShallowBlobScrubMap(pg_id_t pg_id, uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t peer_id, + blob_id_t start, blob_id_t end) : + BaseScrubMap(pg_id, task_id, req_id, scrub_lsn, peer_id), start(start), end(end) {} + + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + SCRUB_TYPE get_scrub_type() const override { return SCRUB_TYPE::SHALLOW_BLOB; } + + void add_blob(const BlobRoute& blob_route) { blobs.insert(blob_route); } + + public: + blob_id_t start; // inclusive + blob_id_t end; // exclusive + std::set< BlobRoute > blobs; + }; + + class ShallowShardScrubMap : public BaseScrubMap { + public: + ShallowShardScrubMap() = default; + ShallowShardScrubMap(pg_id_t 
pg_id, uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t peer_id) : + BaseScrubMap(pg_id, task_id, req_id, scrub_lsn, peer_id) {} + + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + SCRUB_TYPE get_scrub_type() const override { return SCRUB_TYPE::SHALLOW_SHARD; } + + void add_shard(const shard_id_t& shard_id) { shards.insert(shard_id); } + + public: + std::set< shard_id_t > shards; + }; + + class DeepShardScrubMap : public ShallowShardScrubMap { + public: + DeepShardScrubMap() = default; + DeepShardScrubMap(pg_id_t pg_id, uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t peer_id) : + ShallowShardScrubMap(pg_id, task_id, req_id, scrub_lsn, peer_id) {} + + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + SCRUB_TYPE get_scrub_type() const override { return SCRUB_TYPE::DEEP_SHARD; } + + void add_problematic_shard(const shard_id_t& shard_id, ScrubResult scrub_result) { + problematic_shards[shard_id] = scrub_result; + } + + public: + std::map< shard_id_t, ScrubResult > problematic_shards; + }; + + class PGMetaScrubMap : public BaseScrubMap { + public: + PGMetaScrubMap() = default; + PGMetaScrubMap(pg_id_t pg_id, uint64_t task_id, uint64_t req_id, int64_t scrub_lsn, peer_id_t peer_id) : + BaseScrubMap(pg_id, task_id, req_id, scrub_lsn, peer_id) {} + + flatbuffers::DetachedBuffer build_flat_buffer() const override; + bool load(uint8_t const* buf_ptr, const uint32_t buf_size) override; + SCRUB_TYPE get_scrub_type() const override { return SCRUB_TYPE::PG_META; } + + public: + ScrubResult pg_meta_scrub_result{ScrubResult::NONE}; + }; + + // scrub report +public: + // shallow scrub report for a pg + class ShallowScrubReport { + public: + ShallowScrubReport(pg_id_t pg_id) : pg_id_(pg_id) {} + virtual ~ShallowScrubReport() = default; + + public: + pg_id_t get_pg_id() const { 
return pg_id_; } + void add_missing_shard(shard_id_t shard_id, peer_id_t peer_id) { missing_shard_ids[peer_id].insert(shard_id); } + void add_missing_blob(BlobRoute blob_route, peer_id_t peer_id) { missing_blobs[peer_id].insert(blob_route); } + const auto& get_missing_shard_ids() const { return missing_shard_ids; } + const auto& get_missing_blobs() const { return missing_blobs; } + virtual void merge(const std::map< peer_id_t, std::shared_ptr< BaseScrubMap > >& peer_sm_map); + virtual void print() const; + + public: + std::map< peer_id_t, std::set< shard_id_t > > missing_shard_ids; + std::map< peer_id_t, std::set< BlobRoute > > missing_blobs; + pg_id_t pg_id_; + }; + + // deep scrub report for a pg + class DeepScrubReport : public ShallowScrubReport { + public: + DeepScrubReport(pg_id_t pg_id) : ShallowScrubReport(pg_id) {} + ~DeepScrubReport() = default; + void add_corrupted_blob(peer_id_t peer_id, BlobRoute blob_route, ScrubResult scrub_result) { + corrupted_blobs[peer_id][blob_route] = scrub_result; + } + void add_corrupted_shard(peer_id_t peer_id, shard_id_t shard_id, ScrubResult scrub_result) { + corrupted_shards[peer_id][shard_id] = scrub_result; + } + void add_inconsistent_blob(BlobRoute blob_route, peer_id_t peer_id, BlobHashArray hash) { + inconsistent_blobs[blob_route][peer_id] = hash; + } + void add_corrupted_pg_meta(peer_id_t peer_id, ScrubResult scrub_result) { + corrupted_pg_metas[peer_id] = scrub_result; + } + + const auto& get_corrupted_blobs() const { return corrupted_blobs; } + const auto& get_corrupted_shards() const { return corrupted_shards; } + const auto& get_inconsistent_blobs() const { return inconsistent_blobs; } + const auto& get_corrupted_pg_metas() const { return corrupted_pg_metas; } + void merge(const std::map< peer_id_t, std::shared_ptr< BaseScrubMap > >& peer_sm_map) override; + void print() const override; + + private: + std::map< peer_id_t, std::map< BlobRoute, ScrubResult > > corrupted_blobs; + std::map< peer_id_t, std::map< 
shard_id_t, ScrubResult > > corrupted_shards; + std::map< BlobRoute, std::map< peer_id_t, BlobHashArray > > inconsistent_blobs; + std::map< peer_id_t, ScrubResult > corrupted_pg_metas; + }; + + // scrub task that will be put into scrub task queue, and executed by scrub worker +public: + struct scrub_task { + // Default constructor (required for std::regular) + scrub_task() : + task_id{0}, + last_scrub_time{0}, + pg_id{0}, + is_deep_scrub{false}, + triggered{SCRUB_TRIGGER_TYPE::PERIODICALLY} {} + + // Main constructor + scrub_task(uint64_t last_scrub_time, pg_id_t pg_id, bool is_deep_scrub, SCRUB_TRIGGER_TYPE trigger_type, + folly::Promise< std::shared_ptr< ShallowScrubReport > > promise) : + task_id{scrub_task_id.fetch_add(1)}, + last_scrub_time{last_scrub_time}, + pg_id{pg_id}, + is_deep_scrub{is_deep_scrub}, + triggered{trigger_type}, + scrub_report_promise{ + std::make_shared< folly::Promise< std::shared_ptr< ShallowScrubReport > > >(std::move(promise))} {} + + scrub_task(const scrub_task& other) = default; + scrub_task& operator=(const scrub_task& other) = default; + scrub_task(scrub_task&& other) noexcept = default; + scrub_task& operator=(scrub_task&& other) noexcept = default; + + ~scrub_task() { + // make sure there is not any unfulfilled promise + if (scrub_report_promise && scrub_report_promise->isFulfilled() == false) { + scrub_report_promise->setValue(nullptr); + } + } + + uint64_t task_id; + uint64_t last_scrub_time; + pg_id_t pg_id; + bool is_deep_scrub; + SCRUB_TRIGGER_TYPE triggered; + std::shared_ptr< folly::Promise< std::shared_ptr< ShallowScrubReport > > > scrub_report_promise; + + // Equality operator (required for std::regular) + bool operator==(const scrub_task& other) const noexcept { return task_id == other.task_id; } + + // the priority of `manually` is higher than `periodically` + bool operator<(const scrub_task& other) const noexcept { + using U = std::underlying_type_t< SCRUB_TRIGGER_TYPE >; + // First compare by trigger type (manually > 
periodically) + if (static_cast< U >(triggered) != static_cast< U >(other.triggered)) { + return static_cast< U >(triggered) < static_cast< U >(other.triggered); + } + // If same trigger type, compare by task_id (earlier tasks have higher priority) + return task_id > other.task_id; + // TODO:: add more logic to decide the priority between two tasks after we introduce more logic for + // automatic schedule, the following are some criteria we can consider: + /* + 2. Time Since Last Scrub + - PGs that haven't been scrubbed in the longest time get higher priority + - Uses last_scrub_stamp timestamp to track + - Prevents starvation of individual PGs + 3. Deep vs Shallow Scrub Deadline + - Deep scrub deadline (deep_scrub_interval, default 7 days) + - Shallow scrub deadline (scrub_interval_randomize_ratio, default 24 hours) + - PGs approaching their deadline get boosted priority + 4. Load Balancing + - scrub_load_threshold prevents scrubbing during high I/O load + - scrub_min_interval and scrub_max_interval control frequency + - Time window restrictions (scrub_begin_hour, scrub_end_hour) + 5. 
Concurrency Limits + - max_scrubs (default 1) limits concurrent scrubs per sm + - Prevents multiple PGs from overwhelming single sm + */ + } + }; + + // PG Scrub Context, every pg being scrubbed has a scrub context to track its progress +private: + class PGScrubContext { + public: + PGScrubContext(uint64_t task_id, std::unordered_set< peer_id_t > member_peer_ids) : + task_id(task_id), member_peer_ids_(member_peer_ids) {} + ~PGScrubContext() = default; + + public: + bool add_scrub_map(std::shared_ptr< BaseScrubMap > bsm); + void reset_for_new_req(); + bool wait_for_all_req_sms(std::chrono::milliseconds timeout); + std::vector< peer_id_t > get_peers_to_retry() const; + void cancel(); + bool is_cancelled() const { return cancelled.load(); } + + public: + uint64_t task_id{0}; + std::unordered_set< peer_id_t > member_peer_ids_; + std::shared_ptr< base_scrub_req > current_req{nullptr}; + atomic_uint64_t req_id{0}; + mutable std::mutex mtx_; + std::map< peer_id_t, std::shared_ptr< BaseScrubMap > > peer_sm_map_; + + private: + std::atomic_bool cancelled{false}; + std::condition_variable cv_; + }; + + /*scrub scheduler*/ +public: + void start(); + void stop(); + + folly::SemiFuture< std::shared_ptr< ShallowScrubReport > > + submit_scrub_task(const pg_id_t& pg_id, const bool is_deep, const bool force = false, + SCRUB_TRIGGER_TYPE trigger_type = SCRUB_TRIGGER_TYPE::PERIODICALLY); + + // cancel will only cancel a running scrub task. for those submitted but not running tasks in the queue, cancel will + // not remove them from the queue. 
+ void cancel_scrub_task(const pg_id_t& pg_id); + + bool add_scrub_map(const pg_id_t pg_id, std::shared_ptr< BaseScrubMap > bsm); + // new pg is created + void add_pg(const pg_id_t pg_id); + // new pg permanently removed + void remove_pg(const pg_id_t pg_id); + std::optional< pg_scrub_superblk > get_scrub_superblk(const pg_id_t pg_id) const; + void save_scrub_superblk(const pg_id_t pg_id, const bool is_deep_scrub, bool force_update = true); + void add_scrub_req(std::shared_ptr< base_scrub_req > req); + + /*local scrub*/ +public: + std::shared_ptr< BaseScrubMap > local_scrub_blob(std::shared_ptr< blob_scrub_req > req); + std::shared_ptr< ShallowShardScrubMap > local_scrub_shard(std::shared_ptr< shard_scrub_req > req); + std::shared_ptr< PGMetaScrubMap > scrub_pg_meta(std::shared_ptr< base_scrub_req > req); + + // handlers +private: + void scan_pg_for_scrub(); + void handle_pg_scrub_task(scrub_task task); + + bool send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, + const std::unordered_set< peer_id_t >& all_member_peer_ids, const peer_id_t& my_uuid, + shared< homestore::ReplDev > pg_repl_dev, const sisl::io_blob_list_t& req_blob_list, + std::shared_ptr< PGScrubContext > scrub_ctx, uint32_t max_retries, + std::chrono::seconds timeout, const std::string& scrub_type_name); + + bool is_eligible_for_deep_scrub(const pg_id_t& pg_id); + bool is_eligible_for_shallow_scrub(const pg_id_t& pg_id); + void on_pg_scrub_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie, + std::vector< homestore::superblk< pg_scrub_superblk > >& stale_pg_scrub_sbs); + void handle_deep_pg_scrub_report(std::shared_ptr< DeepScrubReport > report); + void handle_shallow_pg_scrub_report(std::shared_ptr< ShallowScrubReport > report); + void handle_scrub_req(std::shared_ptr< base_scrub_req > req); + bool wait_for_scrub_lsn_commit(shared< homestore::ReplDev > repl_dev, int64_t scrub_lsn); + +private: + iomgr::timer_handle_t m_scrub_timer_hdl{iomgr::null_timer_handle}; + iomgr::io_fiber_t 
m_scrub_timer_fiber{nullptr}; + HSHomeObject* m_hs_home_object{nullptr}; + MPMCPriorityQueue< scrub_task > m_scrub_task_queue; + std::shared_ptr< folly::IOThreadPoolExecutor > m_scrub_executor; + folly::ConcurrentHashMap< pg_id_t, std::shared_ptr< PGScrubContext > > m_pg_scrub_ctx_map; + folly::ConcurrentHashMap< pg_id_t, std::shared_ptr< homestore::superblk< pg_scrub_superblk > > > m_pg_scrub_sb_map; + + std::shared_ptr< folly::IOThreadPoolExecutor > m_scrub_req_executor; +}; +} // namespace homeobject + +// TODO:: consider the following scenarios and decide how we want to handle them in scrub manager +// 1 baseline resync +// 2 replace memeber +// 3 permeantly destroy pg +// 4 GC \ No newline at end of file diff --git a/src/lib/homestore_backend/tests/CMakeLists.txt b/src/lib/homestore_backend/tests/CMakeLists.txt index a40812ab3..8eceb3d1f 100644 --- a/src/lib/homestore_backend/tests/CMakeLists.txt +++ b/src/lib/homestore_backend/tests/CMakeLists.txt @@ -30,3 +30,12 @@ add_test(NAME HeapChunkSelectorTest COMMAND test_heap_chunk_selector) add_library(homestore_tests_gc OBJECT) target_sources(homestore_tests_gc PRIVATE test_homestore_backend.cpp hs_gc_tests.cpp) target_link_libraries(homestore_tests_gc homeobject_homestore ${COMMON_TEST_DEPS}) + +add_library(homestore_tests_scrubber OBJECT) +target_sources(homestore_tests_scrubber PRIVATE test_homestore_backend.cpp hs_scrubber_tests.cpp) +target_link_libraries(homestore_tests_scrubber homeobject_homestore ${COMMON_TEST_DEPS}) + +add_executable(test_mpmc_priority_queue) +target_sources(test_mpmc_priority_queue PRIVATE test_mpmc_priority_queue.cpp) +target_link_libraries(test_mpmc_priority_queue homeobject_homestore ${COMMON_TEST_DEPS}) +add_test(NAME MPMCPriorityQueueTest COMMAND test_mpmc_priority_queue) diff --git a/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp b/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp new file mode 100644 index 000000000..ab7636180 --- /dev/null +++ 
b/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp @@ -0,0 +1,569 @@ +#include "homeobj_fixture.hpp" +#include +#include +#include +#include +#include "lib/homestore_backend/hs_homeobject.hpp" + +using namespace homeobject; +using BlobHeader = HSHomeObject::BlobHeader; + +// Helper function to delete a blob from index table +static void delete_blob_from_index(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t shard_id, blob_id_t blob_id) { + BlobRouteKey blob_key{BlobRoute{shard_id, blob_id}}; + BlobRouteValue out_value; + homestore::BtreeSingleRemoveRequest remove_req{&blob_key, &out_value}; + auto status = pg_index_table->remove(remove_req); + ASSERT_TRUE(status == homestore::btree_status_t::success) << "Failed to remove blob key from index table"; +} + +// Helper function to corrupt a blob's data +static void corrupt_blob_data(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t shard_id, blob_id_t blob_id) { + auto& data_service = homestore::data_service(); + const auto blk_size = data_service.get_blk_size(); + + BlobRouteKey blob_key{BlobRoute{shard_id, blob_id}}; + BlobRouteValue out_value; + homestore::BtreeSingleGetRequest blob_get_req{&blob_key, &out_value}; + + auto status = pg_index_table->get(blob_get_req); + ASSERT_TRUE(status == homestore::btree_status_t::success) << "Failed to get blob key from index table"; + + auto pbas = out_value.pbas(); + auto total_size = pbas.blk_count() * blk_size; + sisl::sg_list data_sgs; + data_sgs.size = total_size; + data_sgs.iovs.emplace_back(iovec{.iov_base = iomanager.iobuf_alloc(blk_size, total_size), .iov_len = total_size}); + + data_service.async_read(pbas, data_sgs, total_size) + .thenValue([&](auto&& err) { + if (err) { + LOGE("Failed to read blob data, blob_id={}, err={}", blob_id, err.message()); + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + throw std::runtime_error(fmt::format("Failed to 
read blob data: {}", err.message())); + } + + auto* data_ptr = reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base); + for (size_t i = 0; i <= data_sgs.iovs[0].iov_len / 2; i++) { + data_ptr[i] ^= 0xFF; // Flip first half of data + } + + return data_service.async_write(data_sgs, pbas).thenValue([data_sgs = std::move(data_sgs)](auto&& err) { + ASSERT_FALSE(err) << "Failed to write corrupted blob data"; + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + }); + }) + .get(); +} + +// Helper function to make a blob inconsistent (valid but different hash) +static void make_blob_inconsistent(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t shard_id, blob_id_t blob_id, HSHomeObject* obj_inst) { + auto& data_service = homestore::data_service(); + const auto blk_size = data_service.get_blk_size(); + + BlobRouteKey blob_key{BlobRoute{shard_id, blob_id}}; + BlobRouteValue out_value; + homestore::BtreeSingleGetRequest blob_get_req{&blob_key, &out_value}; + + auto status = pg_index_table->get(blob_get_req); + ASSERT_TRUE(status == homestore::btree_status_t::success) << "Failed to get blob key from index table"; + + auto pbas = out_value.pbas(); + auto total_size = pbas.blk_count() * blk_size; + sisl::sg_list data_sgs; + data_sgs.size = total_size; + data_sgs.iovs.emplace_back(iovec{.iov_base = iomanager.iobuf_alloc(blk_size, total_size), .iov_len = total_size}); + + data_service.async_read(pbas, data_sgs, total_size) + .thenValue([&](auto&& err) { + if (err) { + LOGE("Failed to read blob data, blob_id={}, err={}", blob_id, err.message()); + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + throw std::runtime_error(fmt::format("Failed to read blob data: {}", err.message())); + } + + // Modify blob data and recompute valid hash + uint8_t* read_buf = r_cast< uint8_t* >(data_sgs.iovs[0].iov_base); + auto header = r_cast< BlobHeader* >(read_buf); + uint8_t* blob_bytes = 
read_buf + header->data_offset; + + std::mt19937 rng{std::random_device{}()}; + std::uniform_int_distribution< int > dist(0, 255); + + for (size_t i = 0; i <= header->blob_size / 2; i++) { + blob_bytes[i] ^= static_cast< uint8_t >(dist(rng)); + } + + std::string user_key = header->user_key_size + ? std::string((const char*)(read_buf + sizeof(BlobHeader)), (size_t)header->user_key_size) + : std::string{}; + + uint8_t computed_hash[BlobHeader::blob_max_hash_len]{}; + obj_inst->compute_blob_payload_hash(header->hash_algorithm, blob_bytes, header->blob_size, computed_hash, + BlobHeader::blob_max_hash_len); + + std::memcpy(header->hash, computed_hash, BlobHeader::blob_max_hash_len); + std::memset(header->header_hash, 0, BlobHeader::blob_max_hash_len); + uint32_t computed_header_hash = crc32_ieee(0, (uint8_t*)header, sizeof(BlobHeader)); + std::memcpy(header->header_hash, &computed_header_hash, sizeof(uint32_t)); + + if (!obj_inst->verify_blob(data_sgs.iovs[0].iov_base, header->shard_id, header->blob_id)) { + LOGE("Blob verification failed after modification, blob_id={}", blob_id); + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + throw std::runtime_error(fmt::format("Blob verification failed for blob_id={}", blob_id)); + } + + return data_service.async_write(data_sgs, pbas).thenValue([data_sgs = std::move(data_sgs)](auto&& err) { + ASSERT_FALSE(err) << "Failed to write inconsistent blob data"; + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + }); + }) + .get(); +} + +// Helper function to verify missing blobs in scrub report +static void verify_missing_blobs(const ScrubManager::DeepScrubReport* report, const peer_id_t& peer_id, + const BlobRoute& expected_blob) { + const auto& missing_blobs = report->get_missing_blobs(); + auto it = missing_blobs.find(peer_id); + EXPECT_TRUE(it != missing_blobs.end()) << "Missing blob should be reported for peer_id=" << peer_id; + if (it != missing_blobs.end()) { + 
EXPECT_TRUE(it->second.count(expected_blob) == 1) << "Expected missing blob should be in the report"; + } +} + +// Helper function to verify corrupted blobs in scrub report +static void verify_corrupted_blobs(const ScrubManager::DeepScrubReport* report, const peer_id_t& peer_id, + const BlobRoute& expected_blob) { + const auto& corrupted_blobs = report->get_corrupted_blobs(); + auto it = corrupted_blobs.find(peer_id); + EXPECT_TRUE(it != corrupted_blobs.end()) << "Corrupted blob should be reported for peer_id=" << peer_id; + if (it != corrupted_blobs.end()) { + EXPECT_TRUE(it->second.count(expected_blob) == 1) << "Expected corrupted blob should be in the report"; + } +} + +// Helper function to verify missing shards in scrub report +static void verify_missing_shards(const ScrubManager::DeepScrubReport* report, const peer_id_t& peer_id, + shard_id_t expected_shard) { + const auto& missing_shards = report->get_missing_shard_ids(); + auto it = missing_shards.find(peer_id); + EXPECT_TRUE(it != missing_shards.end()) << "Missing shard should be reported for peer_id=" << peer_id; + if (it != missing_shards.end()) { + EXPECT_TRUE(it->second.count(expected_shard) == 1) << "Expected missing shard should be in the report"; + } +} + +TEST_F(HomeObjectFixture, BasicScrubTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + // empty pg scrub should report no issues + run_on_pg_leader(pg_id, [&]() { + // Deep scrub on empty PG should complete without errors + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null for empty PG"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + // Empty PG should have no issues + 
EXPECT_TRUE(deep_scrub_report->get_missing_blobs().empty()) << "Empty PG should have no missing blobs"; + EXPECT_TRUE(deep_scrub_report->get_missing_shard_ids().empty()) << "Empty PG should have no missing shards"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_blobs().empty()) << "Empty PG should have no corrupted blobs"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_shards().empty()) << "Empty PG should have no corrupted shards"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_blobs().empty()) + << "Empty PG should have no inconsistent blobs"; + + // Shallow scrub on empty PG + scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Shallow scrub report should not be null for empty PG"; + auto shallow_scrub_report = std::dynamic_pointer_cast< ScrubManager::ShallowScrubReport >(scrub_report); + ASSERT_NE(shallow_scrub_report, nullptr) << "Should be ShallowScrubReport"; + + EXPECT_TRUE(shallow_scrub_report->get_missing_blobs().empty()) + << "Empty PG should have no missing blobs in shallow scrub"; + EXPECT_TRUE(shallow_scrub_report->get_missing_shard_ids().empty()) + << "Empty PG should have no missing shards in shallow scrub"; + }); + + const uint64_t num_shards = SISL_OPTIONS["num_shards"].as< uint64_t >(); + const uint64_t num_blobs_per_shard = SISL_OPTIONS["num_blobs"].as< uint64_t >(); + const uint64_t shard_size = 64 * Mi; + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_blob_id[pg_id] = 0; + + std::map< shard_id_t, std::map< blob_id_t, uint64_t > > shard_blob_ids_map; + + // Create multiple shards + for (uint64_t i = 0; i < num_shards; i++) { + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + pg_shard_id_vec[pg_id].push_back(shard_info.id); + LOGINFO("Created pg={} shard={} (shard {}/{})", pg_id, shard_info.id, i + 1, num_shards); + } + + // pg with 
empty shard scrub should report no issues + run_on_pg_leader(pg_id, [&]() { + // Deep scrub on empty PG should complete without errors + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null for empty PG"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + // Empty PG should have no issues + EXPECT_TRUE(deep_scrub_report->get_missing_blobs().empty()) << "Empty PG should have no missing blobs"; + EXPECT_TRUE(deep_scrub_report->get_missing_shard_ids().empty()) << "Empty PG should have no missing shards"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_blobs().empty()) << "Empty PG should have no corrupted blobs"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_shards().empty()) << "Empty PG should have no corrupted shards"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_blobs().empty()) + << "Empty PG should have no inconsistent blobs"; + + // Shallow scrub on empty PG + scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Shallow scrub report should not be null for empty PG"; + auto shallow_scrub_report = std::dynamic_pointer_cast< ScrubManager::ShallowScrubReport >(scrub_report); + ASSERT_NE(shallow_scrub_report, nullptr) << "Should be ShallowScrubReport"; + + EXPECT_TRUE(shallow_scrub_report->get_missing_blobs().empty()) + << "Empty PG should have no missing blobs in shallow scrub"; + EXPECT_TRUE(shallow_scrub_report->get_missing_shard_ids().empty()) + << "Empty PG should have no missing shards in shallow scrub"; + }); + + g_helper->sync(); + + // Create blobs in all shards + shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs_per_shard, pg_blob_id); + 
LOGINFO("Created {} blobs per shard, total {} blobs", num_blobs_per_shard, num_shards * num_blobs_per_shard); + + // Verify blobs were created + verify_get_blob(pg_shard_id_vec, num_blobs_per_shard); + + // everything is healthy, deep scrub should report no issues. + run_on_pg_leader(pg_id, [&]() { + // do deep scrub + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + EXPECT_TRUE(deep_scrub_report->get_missing_blobs().empty()) << "No blobs should be missing in normal case"; + EXPECT_TRUE(deep_scrub_report->get_missing_shard_ids().empty()) << "No shards should be missing in normal case"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_blobs().empty()) << "No blobs should be corrupted in normal case"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_shards().empty()) + << "No shards should be corrupted in normal case"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_blobs().empty()) + << "No blobs should be inconsistent in normal case"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_pg_metas().empty()) + << "No PG metas should be corrupted in normal case"; + + // do shallow scrub + scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(scrub_report, nullptr) << "Shallow scrub report should not be null"; + auto shallow_scrub_report = std::dynamic_pointer_cast< ScrubManager::ShallowScrubReport >(scrub_report); + ASSERT_NE(shallow_scrub_report, nullptr) << "Should be ShallowScrubReport"; + EXPECT_TRUE(shallow_scrub_report->get_missing_blobs().empty()) << "No blobs should be missing in normal case"; + 
EXPECT_TRUE(shallow_scrub_report->get_missing_shard_ids().empty()) + << "No shards should be missing in normal case"; + }); + + g_helper->sync(); + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + const auto missing_shard_id = shard_blob_ids_map.begin()->first; + auto it = shard_blob_ids_map[missing_shard_id].begin(); + const auto missing_blob_id = it->first; + const auto corrupted_blob_id = (++it)->first; + const auto inconsistent_blob_id = (++it)->first; + + // TODO:: add corruptted shard and corrupted pg meta after we have the implementation for corrupting them. + + // Corrupt data on followers + run_on_pg_follower(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + + // 1. Remove missing_shard_id to simulate missing shard + _obj_inst->delete_shard_from_map(missing_shard_id); + + // 2. Delete missing_blob_id from pg_index table + delete_blob_from_index(pg_index_table, missing_shard_id, missing_blob_id); + + // 3. Make corrupted_blob_id corrupted + corrupt_blob_data(pg_index_table, missing_shard_id, corrupted_blob_id); + + // 4. 
Make inconsistent_blob_id inconsistent (valid but different hash) + make_blob_inconsistent(pg_index_table, missing_shard_id, inconsistent_blob_id, _obj_inst.get()); + }); + + g_helper->sync(); + + run_on_pg_leader(pg_id, [&]() { + // do deep scrub and check the scrub report + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + deep_scrub_report->print(); + + const auto& members = (hs_pg->pg_info_).members; + std::set< peer_id_t > follower_peer_ids; + const auto& leader_uuid = _obj_inst->our_uuid(); + for (const auto& member : members) { + if (member.id == leader_uuid) { continue; } + follower_peer_ids.insert(member.id); + } + + // Verify missing blobs, missing shards, and corrupted blobs for all followers + for (const auto& peer_id : follower_peer_ids) { + verify_missing_blobs(deep_scrub_report.get(), peer_id, BlobRoute{missing_shard_id, missing_blob_id}); + verify_missing_shards(deep_scrub_report.get(), peer_id, missing_shard_id); + verify_corrupted_blobs(deep_scrub_report.get(), peer_id, BlobRoute{missing_shard_id, corrupted_blob_id}); + } + + const auto inconsistent_blobs = deep_scrub_report->get_inconsistent_blobs(); + EXPECT_TRUE(inconsistent_blobs.size() == 1) + << "Inconsistent blob should be reported in deep scrub report for one of the followers"; + const auto it = inconsistent_blobs.find(BlobRoute{missing_shard_id, inconsistent_blob_id}); + EXPECT_TRUE(it != inconsistent_blobs.end()) + << "The inconsistent blob should be reported in deep scrub report for blob_id=" << inconsistent_blob_id; + auto& inconsistent_blob_peers = it->second; + + // inconsistent_blob_peers should contains all the peers. 
+ EXPECT_TRUE(inconsistent_blob_peers.size() == follower_peer_ids.size() + 1) + << "Inconsistent blob should be reported in deep scrub report for all followers"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(inconsistent_blob_peers.count(peer_id) == 1) + << "The inconsistent blob should be reported in deep scrub report for peer_id=" << peer_id; + } + EXPECT_TRUE(inconsistent_blob_peers.count(leader_uuid) == 1) + << "The inconsistent blob should be reported in deep scrub report for leader peer_id=" << leader_uuid; + + // do shallow scrub, shallow scrub can only find missing blob/shard + scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(scrub_report, nullptr) << "Shallow scrub report should not be null"; + auto shallow_scrub_report = std::dynamic_pointer_cast< ScrubManager::ShallowScrubReport >(scrub_report); + ASSERT_NE(shallow_scrub_report, nullptr) << "Should be ShallowScrubReport"; + shallow_scrub_report->print(); + + auto miss_blob_in_shallow_report = shallow_scrub_report->get_missing_blobs(); + EXPECT_TRUE(miss_blob_in_shallow_report.size() == follower_peer_ids.size()) + << "Missing blob should be reported in shallow scrub report for all followers"; + for (const auto& peer_id : follower_peer_ids) { + auto it = miss_blob_in_shallow_report.find(peer_id); + EXPECT_TRUE(it != miss_blob_in_shallow_report.end()) + << "Missing blob should be reported in shallow scrub report for peer_id=" << peer_id; + EXPECT_TRUE(it->second.size() == 1) + << "There should be one missing blob for each peer in shallow scrub report"; + EXPECT_TRUE(it->second.count(BlobRoute{missing_shard_id, missing_blob_id}) == 1) + << "The missing blob should be reported in shallow scrub report for peer_id=" << peer_id; + } + + // peers that have the missing shard should be reported in the shallow scrub report. 
+ const auto missing_shards_in_shallow_report = shallow_scrub_report->get_missing_shard_ids(); + EXPECT_TRUE(missing_shards_in_shallow_report.size() == follower_peer_ids.size()) + << "Missing shard should be reported in shallow scrub report for all followers"; + for (const auto& peer_id : follower_peer_ids) { + auto it = missing_shards_in_shallow_report.find(peer_id); + EXPECT_TRUE(it != missing_shards_in_shallow_report.end()) + << "Missing shard should be reported in shallow scrub report for peer_id=" << peer_id; + EXPECT_TRUE(it->second.size() == 1) + << "There should be one missing shard for each peer in shallow scrub report"; + EXPECT_TRUE(it->second.count(missing_shard_id) == 1) + << "The missing shard should be reported in shallow scrub report for peer_id=" << peer_id; + } + }); + + g_helper->sync(); + + // Test case for leader missing/corrupted + LOGINFO("Starting leader missing/corrupted test case"); + + // Get new blob ids for leader corruption test + auto& leader_shard_blobs = shard_blob_ids_map[missing_shard_id]; + auto leader_it = leader_shard_blobs.begin(); + std::advance(leader_it, 3); // Skip the first 3 blobs already used + const auto leader_missing_blob_id = leader_it->first; + const auto leader_corrupted_blob_id = (++leader_it)->first; + const auto leader_inconsistent_blob_id = (++leader_it)->first; + + // Corrupt data on leader + run_on_pg_leader(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + + // 1. Delete leader_missing_blob_id from pg_index table on leader + delete_blob_from_index(pg_index_table, missing_shard_id, leader_missing_blob_id); + LOGINFO("Deleted blob {} from leader index table", leader_missing_blob_id); + + // 2. Make leader_corrupted_blob_id corrupted on leader + corrupt_blob_data(pg_index_table, missing_shard_id, leader_corrupted_blob_id); + LOGINFO("Corrupted blob {} on leader", leader_corrupted_blob_id); + + // 3. 
Make leader_inconsistent_blob_id inconsistent on leader + make_blob_inconsistent(pg_index_table, missing_shard_id, leader_inconsistent_blob_id, _obj_inst.get()); + LOGINFO("Made blob {} inconsistent on leader", leader_inconsistent_blob_id); + }); + + g_helper->sync(); + + // Run scrub and verify both leader and follower corruptions are detected + run_on_pg_leader(pg_id, [&]() { + LOGINFO("Running deep scrub to detect both leader and follower corruptions"); + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + deep_scrub_report->print(); + + const auto& leader_uuid = _obj_inst->our_uuid(); + const auto& members = (hs_pg->pg_info_).members; + std::set< peer_id_t > follower_peer_ids; + for (const auto& member : members) { + if (member.id != leader_uuid) { follower_peer_ids.insert(member.id); } + } + + // ========== Verify Missing Blobs ========== + LOGINFO("Verifying missing blobs detection"); + verify_missing_blobs(deep_scrub_report.get(), leader_uuid, BlobRoute{missing_shard_id, leader_missing_blob_id}); + for (const auto& peer_id : follower_peer_ids) { + verify_missing_blobs(deep_scrub_report.get(), peer_id, BlobRoute{missing_shard_id, missing_blob_id}); + } + + // ========== Verify Missing Shards ========== + LOGINFO("Verifying missing shards detection"); + for (const auto& peer_id : follower_peer_ids) { + verify_missing_shards(deep_scrub_report.get(), peer_id, missing_shard_id); + } + + // ========== Verify Corrupted Blobs ========== + LOGINFO("Verifying corrupted blobs detection"); + verify_corrupted_blobs(deep_scrub_report.get(), leader_uuid, + BlobRoute{missing_shard_id, leader_corrupted_blob_id}); + for (const auto& peer_id : 
follower_peer_ids) { + verify_corrupted_blobs(deep_scrub_report.get(), peer_id, BlobRoute{missing_shard_id, corrupted_blob_id}); + } + + // ========== Verify Inconsistent Blobs ========== + const auto inconsistent_blobs = deep_scrub_report->get_inconsistent_blobs(); + LOGINFO("Verifying inconsistent blobs detection, inconsistent_blobs.size()={}", inconsistent_blobs.size()); + + // Should have 2 inconsistent blobs: one from follower test, one from leader test + EXPECT_TRUE(inconsistent_blobs.size() == 2) + << "Should have 2 inconsistent blobs (1 from follower, 1 from leader)"; + + // Verify leader's inconsistent blob + auto leader_inconsistent_it = inconsistent_blobs.find(BlobRoute{missing_shard_id, leader_inconsistent_blob_id}); + EXPECT_TRUE(leader_inconsistent_it != inconsistent_blobs.end()) + << "The leader's inconsistent blob should be reported in deep scrub report"; + if (leader_inconsistent_it != inconsistent_blobs.end()) { + auto& inconsistent_blob_peers = leader_inconsistent_it->second; + // All peers including leader should be in the inconsistent blob report + EXPECT_TRUE(inconsistent_blob_peers.size() == follower_peer_ids.size() + 1) + << "Leader's inconsistent blob should be reported for all peers including leader"; + EXPECT_TRUE(inconsistent_blob_peers.count(leader_uuid) == 1) + << "Leader should be in the inconsistent blob peers"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(inconsistent_blob_peers.count(peer_id) == 1) + << "Follower peer_id=" << peer_id << " should be in leader's inconsistent blob peers"; + } + } + + // Verify follower's inconsistent blob (from earlier test) + auto follower_inconsistent_it = inconsistent_blobs.find(BlobRoute{missing_shard_id, inconsistent_blob_id}); + EXPECT_TRUE(follower_inconsistent_it != inconsistent_blobs.end()) + << "The follower's inconsistent blob should be reported in deep scrub report"; + if (follower_inconsistent_it != inconsistent_blobs.end()) { + auto& inconsistent_blob_peers = 
follower_inconsistent_it->second;
+            // All peers should be in the inconsistent blob report
+            EXPECT_TRUE(inconsistent_blob_peers.size() == follower_peer_ids.size() + 1)
+                << "Follower's inconsistent blob should be reported for all peers";
+            EXPECT_TRUE(inconsistent_blob_peers.count(leader_uuid) == 1)
+                << "Leader should be in follower's inconsistent blob peers";
+            for (const auto& peer_id : follower_peer_ids) {
+                EXPECT_TRUE(inconsistent_blob_peers.count(peer_id) == 1)
+                    << "Follower peer_id=" << peer_id << " should be in follower's inconsistent blob peers";
+            }
+        }
+
+        LOGINFO("Leader and follower corruption test completed successfully");
+    });
+
+    g_helper->sync();
+}
+
+// Verifies that a deep scrub and a shallow scrub each update only their own timestamp in the PG's scrub superblock
+TEST_F(HomeObjectFixture, ScrubSuperblockPersistenceTest) {
+    const pg_id_t pg_id = 1;
+    create_pg(pg_id);
+
+    const uint64_t shard_size = 64 * Mi;
+    create_shard(pg_id, shard_size, "shard_meta");
+
+    auto scrub_mgr = _obj_inst->scrub_manager();
+
+    run_on_pg_leader(pg_id, [&]() {
+        // Get initial scrub superblock (should be newly created)
+        auto initial_sb = scrub_mgr->get_scrub_superblk(pg_id);
+        ASSERT_TRUE(initial_sb.has_value()) << "Should have scrub superblock";
+
+        auto initial_deep_scrub_time = initial_sb->last_deep_scrub_timestamp;
+        auto initial_shallow_scrub_time = initial_sb->last_shallow_scrub_timestamp;
+
+        // Give some time to ensure timestamps will be different
+        std::this_thread::sleep_for(std::chrono::seconds(2));
+
+        // Run a deep scrub (is_deep=true, force=false); without a report the timestamp checks below are meaningless
+        ASSERT_NE(scrub_mgr->submit_scrub_task(pg_id, true, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(), nullptr);
+
+        // Check that deep scrub timestamp updated
+        auto after_deep_sb = scrub_mgr->get_scrub_superblk(pg_id);
+        ASSERT_TRUE(after_deep_sb.has_value());
+        EXPECT_GT(after_deep_sb->last_deep_scrub_timestamp, initial_deep_scrub_time)
+            << "Deep scrub timestamp should be updated";
+        EXPECT_EQ(after_deep_sb->last_shallow_scrub_timestamp, initial_shallow_scrub_time)
+            << "Shallow scrub timestamp should not change after deep scrub";
+
+        std::this_thread::sleep_for(std::chrono::seconds(2));
+
+        // Run a shallow scrub (is_deep=false, force=false) and require that it produced a report as well
+        ASSERT_NE(scrub_mgr->submit_scrub_task(pg_id, false, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(), nullptr);
+
+        // Check that shallow scrub timestamp updated
+        auto after_shallow_sb = scrub_mgr->get_scrub_superblk(pg_id);
+        ASSERT_TRUE(after_shallow_sb.has_value());
+        EXPECT_EQ(after_shallow_sb->last_deep_scrub_timestamp, after_deep_sb->last_deep_scrub_timestamp)
+            << "Deep scrub timestamp should not change after shallow scrub";
+        EXPECT_GT(after_shallow_sb->last_shallow_scrub_timestamp, after_deep_sb->last_shallow_scrub_timestamp)
+            << "Shallow scrub timestamp should be updated";
+    });
+
+    g_helper->sync();
+}
\ No newline at end of file
diff --git a/src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp b/src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp
new file mode 100644
index 000000000..8e8b698f3
--- /dev/null
+++ b/src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp
@@ -0,0 +1,417 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "../MPMCPriorityQueue.hpp"
+
+using namespace homeobject;
+using namespace std::chrono_literals;
+
+// ============================================================================
+// Basic Functionality Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, BasicPushPop) {
+    MPMCPriorityQueue< int > queue;
+
+    // Push elements
+    queue.push(5);
+    queue.push(2);
+    queue.push(8);
+    queue.push(1);
+
+    EXPECT_EQ(queue.size(), 4);
+    EXPECT_FALSE(queue.empty());
+
+    // Pop in priority order (max heap by default)
+    auto r1 = queue.pop();
+    EXPECT_TRUE(r1.is_ok());
+    EXPECT_EQ(r1.value.value(), 8);
+
+    auto r2 = queue.pop();
+    EXPECT_TRUE(r2.is_ok());
+    EXPECT_EQ(r2.value.value(), 5);
+
+    auto r3 = queue.pop();
+    EXPECT_TRUE(r3.is_ok());
+    EXPECT_EQ(r3.value.value(), 2);
+
+
auto r4 = queue.pop();
+    EXPECT_TRUE(r4.is_ok());
+    EXPECT_EQ(r4.value.value(), 1);
+
+    EXPECT_EQ(queue.size(), 0);
+    EXPECT_TRUE(queue.empty());
+}
+
+TEST(MPMCPriorityQueueTest, CustomComparator) {
+    // Min-heap using std::greater
+    MPMCPriorityQueue< int, std::greater< int > > queue;
+
+    queue.push(5);
+    queue.push(2);
+    queue.push(8);
+    queue.push(1);
+
+    // With std::greater as the comparator the smallest element is popped first (ascending order)
+    EXPECT_EQ(queue.pop().value.value(), 1);
+    EXPECT_EQ(queue.pop().value.value(), 2);
+    EXPECT_EQ(queue.pop().value.value(), 5);
+    EXPECT_EQ(queue.pop().value.value(), 8);
+}
+
+// Note: MPMCPriorityQueue requires std::regular, which includes copy constructibility.
+// Move-only types are not supported due to the std::regular constraint.
+// This test is commented out as it violates the template requirements.
+//
+// TEST(MPMCPriorityQueueTest, MoveSemantics) {
+//     struct MoveOnly {
+//         int value;
+//
+//         explicit MoveOnly(int v) : value(v) {}
+//         MoveOnly(const MoveOnly&) = delete;
+//         MoveOnly& operator=(const MoveOnly&) = delete;
+//         MoveOnly(MoveOnly&&) = default;
+//         MoveOnly& operator=(MoveOnly&&) = default;
+//
+//         bool operator<(const MoveOnly& other) const { return value < other.value; }
+//     };
+//
+//     MPMCPriorityQueue< MoveOnly > queue;
+//
+//     queue.push(MoveOnly(5));
+//     queue.push(MoveOnly(2));
+//     queue.push(MoveOnly(8));
+//
+//     EXPECT_EQ(queue.pop().value.value().value, 8);
+//     EXPECT_EQ(queue.pop().value.value().value, 5);
+//     EXPECT_EQ(queue.pop().value.value().value, 2);
+// }
+
+// ============================================================================
+// Close Operation Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, Close) {
+    MPMCPriorityQueue< int > queue;
+
+    queue.push(1);
+    queue.push(2);
+    queue.push(3);
+
+    EXPECT_FALSE(queue.is_closed());
+    queue.close();
+    EXPECT_TRUE(queue.is_closed());
+
+    // close() must not discard queued elements; they are still expected to drain in priority order
+    EXPECT_EQ(queue.pop().value.value(), 3);
+
EXPECT_EQ(queue.pop().value.value(), 2);
+    EXPECT_EQ(queue.pop().value.value(), 1);
+
+    // Queue is closed and drained now: pop() is expected to report Closed and carry no value
+    auto result = queue.pop();
+    EXPECT_TRUE(result.is_closed());
+    EXPECT_FALSE(result.value.has_value());
+}
+
+TEST(MPMCPriorityQueueTest, PushAfterClose) {
+    MPMCPriorityQueue< int > queue;
+
+    queue.push(1);
+    queue.close();
+
+    // Pushes after close must not enqueue anything (size is expected to stay 1 below)
+    queue.push(2);
+    queue.push(3);
+
+    EXPECT_EQ(queue.size(), 1);
+
+    auto r1 = queue.pop();
+    EXPECT_TRUE(r1.is_ok());
+    EXPECT_EQ(r1.value.value(), 1);
+
+    auto r2 = queue.pop();
+    EXPECT_TRUE(r2.is_closed());
+}
+
+TEST(MPMCPriorityQueueTest, CloseIdempotent) {
+    MPMCPriorityQueue< int > queue;
+
+    queue.push(1);
+    queue.close();
+    queue.close(); // Repeated close() calls should be safe
+    queue.close();
+
+    EXPECT_TRUE(queue.is_closed());
+    EXPECT_EQ(queue.size(), 1);
+}
+
+// ============================================================================
+// Blocking Behavior Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, BlockingPop) {
+    MPMCPriorityQueue< int > queue;
+    std::atomic< bool > pop_started{false};
+    std::atomic< bool > pop_completed{false};
+
+    // Consumer thread: pop() on the empty queue is expected to block until the push below
+    std::thread consumer([&]() {
+        pop_started = true;
+        auto result = queue.pop();
+        pop_completed = true;
+
+        EXPECT_TRUE(result.is_ok());
+        EXPECT_EQ(result.value.value(), 42);
+    });
+
+    // Wait for consumer to start
+    while (!pop_started) {
+        std::this_thread::yield();
+    }
+
+    std::this_thread::sleep_for(50ms); // Give the consumer time to enter pop()
+    EXPECT_FALSE(pop_completed);       // Nothing was pushed yet, so pop() must not have returned
+
+    // Unblock consumer by pushing
+    queue.push(42);
+
+    consumer.join();
+    EXPECT_TRUE(pop_completed);
+}
+
+TEST(MPMCPriorityQueueTest, CloseUnblocksWaiters) {
+    MPMCPriorityQueue< int > queue;
+    std::atomic< int > closed_count{0};
+
+    // Start multiple waiting consumers
+    std::vector< std::thread > consumers;
+    for (int i = 0; i < 5; ++i) {
+        consumers.emplace_back([&]() {
+            auto result = queue.pop();
+
if (result.is_closed()) { closed_count.fetch_add(1, std::memory_order_relaxed); }
+        });
+    }
+
+    std::this_thread::sleep_for(100ms); // Let the consumers reach pop() before closing
+
+    // Close should wake all waiters; nothing was pushed, so each of them must observe Closed
+    queue.close();
+
+    for (auto& t : consumers) {
+        t.join();
+    }
+
+    EXPECT_EQ(closed_count.load(), 5);
+}
+
+// ============================================================================
+// Multi-threaded Producer Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, MultipleProducers) {
+    MPMCPriorityQueue< int > queue;
+    constexpr int num_producers = 4;
+    constexpr int items_per_producer = 250;
+
+    std::vector< std::thread > producers;
+    for (int i = 0; i < num_producers; ++i) {
+        producers.emplace_back([&, i]() {
+            for (int j = 0; j < items_per_producer; ++j) {
+                queue.push(i * items_per_producer + j); // Distinct value range per producer
+            }
+        });
+    }
+
+    for (auto& t : producers) {
+        t.join();
+    }
+
+    EXPECT_EQ(queue.size(), num_producers * items_per_producer);
+
+    // Verify all elements come out in descending order
+    std::vector< int > popped;
+    for (int i = 0; i < num_producers * items_per_producer; ++i) {
+        auto result = queue.pop();
+        ASSERT_TRUE(result.is_ok());
+        popped.push_back(result.value.value());
+    }
+
+    EXPECT_TRUE(std::is_sorted(popped.rbegin(), popped.rend())); // Sorted when read backwards == descending
+}
+
+// ============================================================================
+// Multi-threaded Consumer Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, MultipleConsumers) {
+    MPMCPriorityQueue< int > queue;
+    constexpr int num_items = 1000;
+
+    // Fill queue before any consumer is started
+    for (int i = 0; i < num_items; ++i) {
+        queue.push(i);
+    }
+
+    constexpr int num_consumers = 4;
+    std::vector< std::thread > consumers;
+    std::atomic< int > total_consumed{0};
+
+    for (int i = 0; i < num_consumers; ++i) {
+        consumers.emplace_back([&]() {
+            int count = 0;
+            while (true) {
+                auto result = queue.pop();
+                if (result.is_closed()) { break; }
+
++count;
+            }
+            total_consumed.fetch_add(count, std::memory_order_relaxed);
+        });
+    }
+
+    // Give consumers time to start
+    std::this_thread::sleep_for(50ms);
+
+    // Close to signal completion; the check below expects every queued item to be consumed before Closed is seen
+    queue.close();
+
+    for (auto& t : consumers) {
+        t.join();
+    }
+
+    EXPECT_EQ(total_consumed.load(), num_items);
+}
+
+// ============================================================================
+// Concurrent Producers and Consumers
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, ConcurrentProducersConsumers) {
+    MPMCPriorityQueue< int > queue;
+    constexpr int num_producers = 3;
+    constexpr int num_consumers = 3;
+    constexpr int items_per_producer = 200;
+
+    std::atomic< int > total_consumed{0};
+    std::vector< std::thread > threads;
+
+    // Start consumers (they occupy indices [0, num_consumers) of threads)
+    for (int i = 0; i < num_consumers; ++i) {
+        threads.emplace_back([&]() {
+            int count = 0;
+            while (true) {
+                auto result = queue.pop();
+                if (result.is_closed()) { break; }
+                ++count;
+            }
+            total_consumed.fetch_add(count, std::memory_order_relaxed);
+        });
+    }
+
+    // Start producers (they occupy indices [num_consumers, num_consumers + num_producers) of threads)
+    for (int i = 0; i < num_producers; ++i) {
+        threads.emplace_back([&, i]() {
+            for (int j = 0; j < items_per_producer; ++j) {
+                queue.push(i * items_per_producer + j);
+                std::this_thread::sleep_for(10us); // Simulate work
+            }
+        });
+    }
+
+    // Wait for producers first, so that every item has been pushed before the queue is closed
+    for (int i = num_consumers; i < num_consumers + num_producers; ++i) {
+        threads[i].join();
+    }
+
+    // Close and wait for consumers
+    queue.close();
+    for (int i = 0; i < num_consumers; ++i) {
+        threads[i].join();
+    }
+
+    EXPECT_EQ(total_consumed.load(), num_producers * items_per_producer);
+}
+
+// ============================================================================
+// Stress Test
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, StressTest) {
+    MPMCPriorityQueue< int > queue;
+    constexpr int num_threads = 8;
+    constexpr int operations_per_thread =
1000;
+
+    std::atomic< int > push_count{0};
+    std::atomic< int > pop_count{0};
+    std::vector< std::thread > threads;
+
+    // Half producers, half consumers; total pushes equal total pops, so no pop() is left waiting for an item
+    for (int i = 0; i < num_threads / 2; ++i) {
+        threads.emplace_back([&]() {
+            for (int j = 0; j < operations_per_thread; ++j) {
+                queue.push(j);
+                push_count.fetch_add(1, std::memory_order_relaxed);
+            }
+        });
+    }
+
+    for (int i = 0; i < num_threads / 2; ++i) {
+        threads.emplace_back([&]() {
+            for (int j = 0; j < operations_per_thread; ++j) {
+                auto result = queue.pop();
+                if (result.is_ok()) { pop_count.fetch_add(1, std::memory_order_relaxed); }
+            }
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+
+    EXPECT_EQ(push_count.load(), (num_threads / 2) * operations_per_thread);
+
+    // Pop remaining elements (none are expected here, since pushes and pops above are balanced)
+    while (!queue.empty()) {
+        auto result = queue.pop();
+        if (result.is_ok()) { pop_count.fetch_add(1, std::memory_order_relaxed); }
+    }
+
+    EXPECT_EQ(pop_count.load(), push_count.load());
+}
+
+// ============================================================================
+// Destructor Test
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, DestructorClosesQueue) {
+    std::atomic< bool > consumer_unblocked{false};
+
+    std::thread consumer([&]() {
+        auto queue = std::make_unique< MPMCPriorityQueue< int > >();
+        queue->push(1);
+
+        std::thread waiter([&, q = queue.get()]() {
+            auto first_result = q->pop(); // Pop the 1
+            (void)first_result;           // Explicitly ignore the result
+            auto result = q->pop();       // This will block until destructor closes queue
+            if (result.is_closed()) { consumer_unblocked = true; }
+        });
+
+        std::this_thread::sleep_for(100ms);
+        // Destructor runs here. NOTE(review): waiter is still inside q->pop() - confirm the dtor waits for poppers
+        queue.reset();
+
+        waiter.join();
+    });
+
+    consumer.join();
+    EXPECT_TRUE(consumer_unblocked);
+}
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
From 27b08aa21b11a17d385ec4eebf804074c31d37ff Mon Sep 17
00:00:00 2001 From: Jie Yao Date: Fri, 20 Mar 2026 16:36:13 +0800 Subject: [PATCH 2/4] add http interface for scrub --- .../homestore_backend/MPMCPriorityQueue.hpp | 14 +- src/lib/homestore_backend/hs_http_manager.cpp | 375 +++++++++++++++++- src/lib/homestore_backend/hs_http_manager.hpp | 54 +++ src/lib/homestore_backend/scrub_manager.cpp | 44 +- 4 files changed, 473 insertions(+), 14 deletions(-) diff --git a/src/lib/homestore_backend/MPMCPriorityQueue.hpp b/src/lib/homestore_backend/MPMCPriorityQueue.hpp index 585e2925e..cf5fea548 100644 --- a/src/lib/homestore_backend/MPMCPriorityQueue.hpp +++ b/src/lib/homestore_backend/MPMCPriorityQueue.hpp @@ -72,32 +72,34 @@ class MPMCPriorityQueue { * @brief Thread-safe push operation (copy) * * @param value Element to insert - * @note No-op if queue is closed + * @return true if pushed successfully, false if queue is closed */ - void push(const T& value) { + bool push(const T& value) { { std::scoped_lock lock(mutex_); if (closed_) [[unlikely]] { - return; // Silently ignore pushes to closed queue + return false; // Queue is closed, cannot push } pq_.push(value); } cv_.notify_one(); // Wake one waiting consumer + return true; } /** * @brief Thread-safe push operation (move) * * @param value Element to insert (will be moved) - * @note No-op if queue is closed + * @return true if pushed successfully, false if queue is closed */ - void push(T&& value) { + bool push(T&& value) { { std::scoped_lock lock(mutex_); - if (closed_) [[unlikely]] { return; } + if (closed_) [[unlikely]] { return false; } pq_.push(std::move(value)); } cv_.notify_one(); + return true; } /** diff --git a/src/lib/homestore_backend/hs_http_manager.cpp b/src/lib/homestore_backend/hs_http_manager.cpp index 5dd9a865a..6b8e71c9a 100644 --- a/src/lib/homestore_backend/hs_http_manager.cpp +++ b/src/lib/homestore_backend/hs_http_manager.cpp @@ -17,12 +17,38 @@ #include #include #include +#include +#include +#include +#include #include "hs_http_manager.hpp" 
#include "hs_homeobject.hpp"
 
 namespace homeobject {
 
+namespace {
+// Formats a time point as an ISO 8601 UTC timestamp (YYYY-MM-DDTHH:MM:SSZ)
+std::string format_iso8601_time(const std::chrono::system_clock::time_point& tp) {
+    auto secs = std::chrono::system_clock::to_time_t(tp);
+    std::tm tm{};
+    gmtime_r(&secs, &tm); // Thread-safe variant of gmtime(); tm stays zero-initialised if the conversion fails
+    char buf[32];
+    std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &tm);
+    return std::string(buf);
+}
+
+// Sums items.size() over every (peer_id, items) entry of a per-peer map
+template < typename PeerMap >
+size_t count_peer_map_items(const PeerMap& peer_map) {
+    size_t count = 0;
+    for (const auto& [peer_id, items] : peer_map) {
+        count += items.size();
+    }
+    return count;
+}
+} // anonymous namespace
+
 HttpManager::HttpManager(HSHomeObject& ho) : ho_(ho) {
     using namespace Pistache;
     using namespace Pistache::Rest;
@@ -74,7 +100,13 @@ HttpManager::HttpManager(HSHomeObject& ho) : ho_(ho) {
         {Pistache::Http::Method::Post, "/api/v1/trigger_gc",
          Pistache::Rest::Routes::bind(&HttpManager::trigger_gc, this)},
         {Pistache::Http::Method::Get, "/api/v1/gc_job_status",
-         Pistache::Rest::Routes::bind(&HttpManager::get_gc_job_status, this)}};
+         Pistache::Rest::Routes::bind(&HttpManager::get_gc_job_status, this)},
+        {Pistache::Http::Method::Post, "/api/v1/trigger_pg_scrub",
+         Pistache::Rest::Routes::bind(&HttpManager::trigger_pg_scrub, this)},
+        {Pistache::Http::Method::Get, "/api/v1/scrub_job_status",
+         Pistache::Rest::Routes::bind(&HttpManager::get_scrub_job_status, this)},
+        {Pistache::Http::Method::Post, "/api/v1/cancel_scrub_job",
+         Pistache::Rest::Routes::bind(&HttpManager::cancel_scrub_job, this)}};
 
     auto http_server = ioenvironment.get_http_server();
     if (!http_server) {
@@ -486,6 +518,168 @@ void HttpManager::exit_pg(const Pistache::Rest::Request& request, Pistache::Http
     response.send(Pistache::Http::Code::Ok, "Exit pg request submitted");
 }
 
+// Handler for POST /api/v1/trigger_pg_scrub. Query parameters: pg_id (required, 0..65535), deep and force (optional,
+// "true"/"1"/"yes" enable them). Replies 400/404 on bad input; otherwise registers a job, replies 202 with its
+// job_id and submits the scrub. The job is marked COMPLETED or FAILED when the scrub future resolves.
+void HttpManager::trigger_pg_scrub(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) {
+    auto scrub_mgr = ho_.scrub_manager();
+    if (!scrub_mgr) {
+        response.send(Pistache::Http::Code::Internal_Server_Error, "Scrub manager not available");
+        return;
+    }
+
+    // Get query parameters
+    const auto pg_id_param = request.query().get("pg_id");
+    const auto is_deep_param = request.query().get("deep");
+    const auto force_param = request.query().get("force");
+
+    // Validate pg_id parameter (required)
+    if (!pg_id_param || pg_id_param.value().empty()) {
+        nlohmann::json error;
+        error["error"] = "Missing required parameter: pg_id";
+        error["usage"] = "POST /api/v1/trigger_pg_scrub?pg_id=&deep=&force=";
+        response.send(Pistache::Http::Code::Bad_Request, error.dump());
+        return;
+    }
+
+    uint16_t pg_id;
+    try {
+        // std::stoul stops at the first non-digit; reject values like "12abc" that are not consumed entirely
+        size_t parsed_len = 0;
+        auto val = std::stoul(pg_id_param.value(), &parsed_len);
+        if (parsed_len != pg_id_param.value().size()) { throw std::invalid_argument("trailing characters in pg_id"); }
+        if (val > std::numeric_limits< uint16_t >::max()) {
+            nlohmann::json error;
+            error["error"] = "pg_id out of range";
+            error["pg_id"] = pg_id_param.value();
+            response.send(Pistache::Http::Code::Bad_Request, error.dump());
+            return;
+        }
+        pg_id = static_cast< uint16_t >(val);
+    } catch (const std::invalid_argument& e) {
+        nlohmann::json error;
+        error["error"] = "Invalid pg_id format: not a number";
+        error["pg_id"] = pg_id_param.value();
+        response.send(Pistache::Http::Code::Bad_Request, error.dump());
+        return;
+    } catch (const std::out_of_range& e) {
+        nlohmann::json error;
+        error["error"] = "pg_id out of range";
+        error["pg_id"] = pg_id_param.value();
+        response.send(Pistache::Http::Code::Bad_Request, error.dump());
+        return;
+    }
+
+    // Parse optional parameters
+    bool is_deep = false;
+    if (is_deep_param && !is_deep_param.value().empty()) {
+        const auto& value = is_deep_param.value();
+        is_deep = (value == "true" || value == "1" || value == "yes");
+    }
+
+    bool force = false;
+    if (force_param && !force_param.value().empty()) {
+        const auto& value = force_param.value();
+        force = (value == "true" || value == "1" || value == "yes");
+    }
+
+    LOGINFO("Received trigger_pg_scrub request for pg_id={}, deep={}, force={}", pg_id, is_deep, force);
+
+    // Verify PG exists
+    auto hs_pg = ho_.get_hs_pg(pg_id);
+    if (!hs_pg) {
+        nlohmann::json error;
+        error["error"] = "PG not found";
+        error["pg_id"] = pg_id;
+        response.send(Pistache::Http::Code::Not_Found, error.dump());
+        return;
+    }
+
+    // Generate job ID and create job info
+    const auto job_id = generate_job_id();
+    auto job_info = std::make_shared< ScrubJobInfo >(job_id, pg_id, is_deep);
+
+    {
+        std::lock_guard< std::shared_mutex > lock(scrub_job_mutex_);
+        scrub_jobs_map_.set(job_id, job_info);
+    }
+
+    // Prepare immediate response
+    nlohmann::json result;
+    result["job_id"] = job_id;
+    result["pg_id"] = pg_id;
+    result["scrub_type"] = is_deep ? "deep" : "shallow";
+    result["force"] = force;
+    result["message"] = "Scrub task submitted, query status using /api/v1/scrub_job_status?job_id=" + job_id;
+
+    // Return immediately with HTTP 202 Accepted
+    response.send(Pistache::Http::Code::Accepted, result.dump());
+
+    // Submit scrub task (MANUALLY trigger type) - runs asynchronously
+    scrub_mgr->submit_scrub_task(pg_id, is_deep, force, SCRUB_TRIGGER_TYPE::MANUALLY)
+        .via(&folly::InlineExecutor::instance())
+        .thenValue([job_info, is_deep](std::shared_ptr< ScrubManager::ShallowScrubReport > report) {
+            if (!report) {
+                job_info->try_complete(ScrubJobStatus::FAILED, "Scrub task failed or was cancelled");
+                return;
+            }
+
+            // Build report summary
+            nlohmann::json report_summary;
+            report_summary["pg_id"] = report->get_pg_id();
+
+            // Add missing shards info
+            const auto& missing_shards = report->get_missing_shard_ids();
+            if (!missing_shards.empty()) {
+                nlohmann::json missing_shards_json;
+                for (const auto& [peer_id, shard_ids] : missing_shards) {
+                    missing_shards_json[boost::uuids::to_string(peer_id)] = shard_ids;
+                }
+                report_summary["missing_shards"] = missing_shards_json;
+            }
+
+            // Add missing blobs info
+            const auto& missing_blobs = report->get_missing_blobs();
+            if (!missing_blobs.empty()) { report_summary["missing_blobs_count"] = count_peer_map_items(missing_blobs); }
+
+            // If it's a deep scrub report, add additional info
+            if (is_deep) {
+                auto deep_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(report);
+                if (deep_report) {
+                    // Add corrupted blobs count
+                    const auto& corrupted_blobs = deep_report->get_corrupted_blobs();
+                    if (!corrupted_blobs.empty()) {
+                        report_summary["corrupted_blobs_count"] = count_peer_map_items(corrupted_blobs);
+                    }
+
+                    // Add inconsistent blobs count
+                    const auto& inconsistent_blobs = deep_report->get_inconsistent_blobs();
+                    if (!inconsistent_blobs.empty()) {
+                        report_summary["inconsistent_blobs_count"] = inconsistent_blobs.size();
+                    }
+
+                    // Add corrupted shards count
+                    const auto& corrupted_shards = deep_report->get_corrupted_shards();
+                    if (!corrupted_shards.empty()) {
+                        report_summary["corrupted_shards_count"] = count_peer_map_items(corrupted_shards);
+                    }
+
+                    // Add corrupted PG meta info
+                    const auto& corrupted_pg_metas = deep_report->get_corrupted_pg_metas();
+                    if (!corrupted_pg_metas.empty()) {
+                        report_summary["corrupted_pg_metas_count"] = corrupted_pg_metas.size();
+                    }
+                }
+            }
+
+            // Complete the job with success status and report
+            job_info->try_complete(ScrubJobStatus::COMPLETED, "", report_summary);
+        })
+        .thenError([job_info](const folly::exception_wrapper& ew) {
+            job_info->try_complete(ScrubJobStatus::FAILED, ew.what().c_str());
+        });
+}
+
 void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) {
     auto gc_mgr = ho_.gc_manager();
     if (!gc_mgr) {
@@ -633,7 +827,7 @@ void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::H
 
 std::string HttpManager::generate_job_id() {
     auto counter = job_counter_.fetch_add(1, std::memory_order_relaxed);
-    return fmt::format("trigger-gc-task-{}", counter);
+    return fmt::format("job-{}", counter);
 }
 
 void HttpManager::get_job_status(const std::string& job_id, nlohmann::json& result) {
@@ -770,6 +964,189 @@ void
HttpManager::trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id) {
         .get();
 }
 
+// HTTP handler for scrub job status. With a non-empty `job_id` query parameter it replies with that job's JSON
+// (404 when the id is not in the cache); without one it replies with every job currently held in the cache.
+void HttpManager::get_scrub_job_status(const Pistache::Rest::Request& request,
+                                       Pistache::Http::ResponseWriter response) {
+    auto job_id_param = request.query().get("job_id");
+
+    if (job_id_param && !job_id_param.value().empty()) {
+        // Query specific job
+        const auto job_id = job_id_param.value();
+        LOGINFO("Query scrub job {} status", job_id);
+
+        std::shared_ptr< ScrubJobInfo > job_info;
+        {
+            // EvictingCacheMap::get() throws std::out_of_range for an unknown key and promotes the entry in the
+            // LRU list (a write), so it can neither yield "not found" nor run under a shared lock. Use the
+            // non-promoting lookup instead.
+            std::shared_lock lock(scrub_job_mutex_);
+            const auto job_it = scrub_jobs_map_.findWithoutPromotion(job_id);
+            if (job_it != scrub_jobs_map_.end()) { job_info = job_it->second; }
+        }
+
+        if (!job_info) {
+            nlohmann::json error;
+            error["error"] = "Job not found";
+            error["job_id"] = job_id;
+            response.send(Pistache::Http::Code::Not_Found, error.dump());
+            return;
+        }
+
+        nlohmann::json result = build_scrub_job_json(job_info);
+        response.send(Pistache::Http::Code::Ok, result.dump());
+        return;
+    }
+
+    // Query all jobs
+    LOGINFO("Query all scrub job status");
+    nlohmann::json result;
+    // Start "jobs" as an empty array so the reply is still an object when no job is cached; a
+    // default-constructed nlohmann::json would be dumped as `null`.
+    result["jobs"] = nlohmann::json::array();
+    std::vector< std::shared_ptr< ScrubJobInfo > > all_jobs;
+
+    {
+        std::shared_lock lock(scrub_job_mutex_);
+        for (const auto& [k, v] : scrub_jobs_map_) {
+            all_jobs.push_back(v);
+        }
+    }
+
+    for (const auto& job_info : all_jobs) {
+        result["jobs"].push_back(build_scrub_job_json(job_info));
+    }
+
+    response.send(Pistache::Http::Code::Ok, result.dump());
+}
+
+// Renders one scrub job as JSON. job_id, pg_id and is_deep are read without the lock; status, timestamps,
+// error message and report summary are read while holding the job's mutex.
+nlohmann::json HttpManager::build_scrub_job_json(const std::shared_ptr< ScrubJobInfo >& job_info) {
+    nlohmann::json result;
+
+    // Helper to convert status enum to string
+    auto status_to_string = [](ScrubJobStatus status) -> std::string {
+        switch (status) {
+        case ScrubJobStatus::RUNNING:
+            return "running";
+        case ScrubJobStatus::COMPLETED:
+            return "completed";
+        case ScrubJobStatus::FAILED:
+            return "failed";
+        case ScrubJobStatus::CANCELLED:
+            return "cancelled";
+        default:
+            return "unknown";
+        }
+    };
+
+    // Thread-unsafe fields (read-only after construction)
+    result["job_id"] = job_info->job_id;
+    result["pg_id"] = job_info->pg_id;
+    result["scrub_type"] = job_info->is_deep ? "deep" : "shallow";
+
+    // Thread-safe fields (protected by mutex)
+    {
+        std::lock_guard< std::mutex > lock(job_info->mtx_);
+
+        // Status
+        result["status"] = status_to_string(job_info->status);
+
+        // Timestamps - convert to ISO 8601 format (no newline)
+        result["start_time"] = format_iso8601_time(job_info->start_time);
+
+        if (job_info->status != ScrubJobStatus::RUNNING) {
+            result["end_time"] = format_iso8601_time(job_info->end_time);
+
+            auto duration =
+                std::chrono::duration_cast< std::chrono::seconds >(job_info->end_time - job_info->start_time);
+            result["duration_seconds"] = duration.count();
+        }
+
+        // Error message (if any)
+        if (!job_info->error_message.empty()) { result["error_message"] = job_info->error_message; }
+
+        // Report summary (if completed)
+        if (job_info->status == ScrubJobStatus::COMPLETED && !job_info->report_summary.empty()) {
+            result["report"] = job_info->report_summary;
+        }
+    }
+
+    return result;
+}
+
+// HTTP handler that cancels the scrub job named by the `job_id` query parameter. Replies 400 when the
+// parameter is missing or the job is no longer running, 404 when the job is unknown, and 500 when no scrub
+// manager is available.
+void HttpManager::cancel_scrub_job(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) {
+    auto job_id_param = request.query().get("job_id");
+
+    if (!job_id_param || job_id_param.value().empty()) {
+        nlohmann::json error;
+        error["error"] = "Missing required parameter: job_id";
+        error["usage"] = "POST /api/v1/cancel_scrub_job?job_id=";
+        response.send(Pistache::Http::Code::Bad_Request, error.dump());
+        return;
+    }
+
+    const auto job_id = job_id_param.value();
+    LOGINFO("Cancel scrub job {}", job_id);
+
+    std::shared_ptr< ScrubJobInfo > job_info;
+    {
+        // Non-promoting lookup: EvictingCacheMap::get() would throw std::out_of_range for an unknown key and
+        // would reorder the LRU list while only a shared lock is held.
+        std::shared_lock lock(scrub_job_mutex_);
+        const auto job_it = scrub_jobs_map_.findWithoutPromotion(job_id);
+        if (job_it != scrub_jobs_map_.end()) { job_info = job_it->second; }
+    }
+
+    if (!job_info) {
+        nlohmann::json error;
+        error["error"] = "Job not found";
+        error["job_id"] = job_id;
+        response.send(Pistache::Http::Code::Not_Found, error.dump());
+        return;
+    }
+
+    // Check if job is still running (thread-safe)
+    bool can_cancel = false;
+    std::string current_status_str;
+    {
+        std::lock_guard< std::mutex > lock(job_info->mtx_);
+        can_cancel = (job_info->status == ScrubJobStatus::RUNNING);
+        if (!can_cancel) {
+            // Get status string for error message
+            switch (job_info->status) {
+            case ScrubJobStatus::COMPLETED:
+                current_status_str = "completed";
+                break;
+            case ScrubJobStatus::FAILED:
+                current_status_str = "failed";
+                break;
+            case ScrubJobStatus::CANCELLED:
+                current_status_str = "cancelled";
+                break;
+            default:
+                current_status_str = "unknown";
+            }
+        }
+    }
+
+    if (!can_cancel) {
+        nlohmann::json result;
+        result["job_id"] = job_id;
+        result["message"] = "Job is not running, cannot cancel";
+        result["current_status"] = current_status_str;
+        response.send(Pistache::Http::Code::Bad_Request, result.dump());
+        return;
+    }
+
+    // Cancel the scrub task
+    auto scrub_mgr = ho_.scrub_manager();
+    if (!scrub_mgr) {
+        nlohmann::json error;
+        error["error"] = "Scrub manager not available";
+        response.send(Pistache::Http::Code::Internal_Server_Error, error.dump());
+        return;
+    }
+
+    // Cancel in scrub manager first (this will stop ongoing work)
+    scrub_mgr->cancel_scrub_task(job_info->pg_id);
+
+    // Update job status (thread-safe)
+    job_info->cancel();
+
+    nlohmann::json result;
+    result["job_id"] = job_id;
+    result["message"] = "Scrub job cancelled successfully";
+    response.send(Pistache::Http::Code::Ok, result.dump());
+}
+
 #ifdef _PRERELEASE
 void HttpManager::crash_system(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) {
     std::string crash_type;
diff --git a/src/lib/homestore_backend/hs_http_manager.hpp b/src/lib/homestore_backend/hs_http_manager.hpp
index 9a6ee0b97..2681aeccb 100644
--- a/src/lib/homestore_backend/hs_http_manager.hpp
+++ b/src/lib/homestore_backend/hs_http_manager.hpp
@@ -50,6 +50,9 @@ class HttpManager {
     void get_gc_job_status(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
     void trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id);
     void get_job_status(const std::string& job_id,
nlohmann::json& result);
+    void trigger_pg_scrub(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
+    void get_scrub_job_status(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
+    void cancel_scrub_job(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
 
 #ifdef _PRERELEASE
     void crash_system(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response);
@@ -74,15 +77,71 @@ class HttpManager {
             job_id(id), status(GCJobStatus::RUNNING), pg_id(pgid), chunk_id(cid) {}
     };
 
+    enum class ScrubJobStatus { RUNNING, COMPLETED, FAILED, CANCELLED };
+
+    // Bookkeeping for one scrub job started through the HTTP API. job_id, pg_id and is_deep are set once by the
+    // constructor; the fields declared after mtx_ are read and written while holding it.
+    struct ScrubJobInfo {
+        std::string job_id;
+        uint16_t pg_id;
+        bool is_deep;
+
+        // Mutable fields protected by mutex
+        mutable std::mutex mtx_;
+        ScrubJobStatus status;
+        std::chrono::system_clock::time_point start_time;
+        std::chrono::system_clock::time_point end_time;
+        std::string error_message;
+        nlohmann::json report_summary;
+
+        // Flag to prevent status update after cancellation
+        std::atomic< bool > is_cancelled{false};
+
+        ScrubJobInfo(const std::string& id, uint16_t pgid, bool deep) :
+                job_id(id),
+                pg_id(pgid),
+                is_deep(deep),
+                status(ScrubJobStatus::RUNNING),
+                start_time(std::chrono::system_clock::now()) {}
+
+        // Thread-safe status update - returns false if already cancelled
+        bool try_complete(ScrubJobStatus new_status, const std::string& error_msg = "",
+                          const nlohmann::json& summary = nlohmann::json()) {
+            std::lock_guard< std::mutex > lock(mtx_);
+            if (is_cancelled.load(std::memory_order_acquire)) { return false; } // Already cancelled, reject update
+
+            status = new_status;
+            end_time = std::chrono::system_clock::now();
+            error_message = error_msg;
+            if (!summary.empty()) { report_summary = summary; }
+            return true;
+        }
+
+        // Thread-safe cancel. Only a RUNNING job is moved to CANCELLED: cancel_scrub_job() checks the status and
+        // calls cancel() under two separate lock acquisitions, so the job may finish in between, and a terminal
+        // status (with its end_time and error_message) must not be overwritten.
+        void cancel() {
+            std::lock_guard< std::mutex > lock(mtx_);
+            if (status != ScrubJobStatus::RUNNING) { return; }
+            is_cancelled.store(true, std::memory_order_release);
+            status = ScrubJobStatus::CANCELLED;
+            end_time = std::chrono::system_clock::now();
+            error_message = "Cancelled by user";
+        }
+    };
+
     std::string generate_job_id();
+    nlohmann::json build_scrub_job_json(const std::shared_ptr< ScrubJobInfo >& job_info);
 
 private:
     HSHomeObject& ho_;
     std::atomic< uint64_t > job_counter_{0};
     std::shared_mutex gc_job_mutex_;
+    std::shared_mutex scrub_job_mutex_;
     // we don`t have an external DB to store the job status, so we only keep the status of the lastest 100 jobs for
     // query. or, we can evict the job after it is completed after a timeout period.
     folly::EvictingCacheMap< std::string, std::shared_ptr< GCJobInfo > > gc_jobs_map_{100};
+    folly::EvictingCacheMap< std::string, std::shared_ptr< ScrubJobInfo > > scrub_jobs_map_{100};
 };
 
 } // namespace homeobject
\ No newline at end of file
diff --git a/src/lib/homestore_backend/scrub_manager.cpp b/src/lib/homestore_backend/scrub_manager.cpp
index 5b9452fa7..4908782a4 100644
--- a/src/lib/homestore_backend/scrub_manager.cpp
+++ b/src/lib/homestore_backend/scrub_manager.cpp
@@ -63,7 +63,7 @@ void ScrubManager::scan_pg_for_scrub() {
                 .via(&folly::InlineExecutor::instance())
                 .thenValue([this, pg_id](std::shared_ptr< ShallowScrubReport > report) {
                     if (!report) {
-                        LOGERRORMOD(scrubmgr, "deep scrub failed for pg={}", pg_id);
+                        LOGERRORMOD(scrubmgr, "shallow scrub failed for pg={}", pg_id);
                         return;
                     }
                     LOGINFOMOD(scrubmgr, "shallow scrub is completed for pg={}", pg_id);
@@ -615,7 +615,12 @@ bool ScrubManager::send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id,
     }
 
     // Check if cancelled or incomplete
-    if (scrub_ctx->is_cancelled() || scrub_ctx->peer_sm_map_.size() != scrub_ctx->member_peer_ids_.size()) {
+    bool is_incomplete = false;
+    {
+        std::lock_guard< std::mutex > lock(scrub_ctx->mtx_);
+        is_incomplete = scrub_ctx->peer_sm_map_.size() != scrub_ctx->member_peer_ids_.size();
+    }
+    if (scrub_ctx->is_cancelled() || is_incomplete) {
         SCRUBLOGD(pg_id, task_id, "scrub task is cancelled or incomplete when scrubbing {}!",
scrub_type_name); return false; } @@ -728,10 +733,31 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { } // Merge PG meta scrub results - pg_scrub_report->merge(scrub_ctx->peer_sm_map_); + { + std::lock_guard< std::mutex > lock(scrub_ctx->mtx_); + pg_scrub_report->merge(scrub_ctx->peer_sm_map_); + } SCRUBLOGD(pg_id, task_id, "PG meta scrub completed"); } + // scrubbing probably goes with blob deletion, and thus some of blobs might be not present on some + // peers even if we wait for the same scrub_lsn. Theoretically, without a strong consistent snapshot , there is not + // a mechanism to distinguish whether a blob/shard is missing due to deletion or due to lost, this is the + // predicament we are in now with oure current design: + + // 1 no blob delete lsn, + // 2 no shard sealed lsn + // 3 no snapshot which can provide a strong consistent view of the + // blob/shard existence at the scrub_lsn. + + // we can only rely on the best effort of waiting for all peers to reach the same scrub_lsn, but it is not + // guaranteed. As a result, we might have false positive missing blobs due to deletion!!!! + + // TODO: figure out a solution to mitigate the false positive issue, for example, we can add a "blob delete lsn" and + // "shard sealed lsn". for all the missblobs, if its deletd lsn is after scrub_lsn, then it is a false positive + // missing blob, and we can move it out of missblobs. this can be done by leader when merging all the scrub maps for + // a specific scrub req. + // Step 2: Scrub Shard Range SCRUBLOGD(pg_id, task_id, "Starting shard range {} scrub", is_deep_scrub ? 
"deep" : "shallow"); { @@ -791,7 +817,10 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { } SCRUBLOGD(pg_id, task_id, "Merging shard scrub results for range [{}, {}]", shard_start, shard_end); - pg_scrub_report->merge(scrub_ctx->peer_sm_map_); + { + std::lock_guard< std::mutex > lock(scrub_ctx->mtx_); + pg_scrub_report->merge(scrub_ctx->peer_sm_map_); + } } SCRUBLOGD(pg_id, task_id, "shard scrub completed, total ranges scrubbed: {}", shard_range_count); } @@ -850,7 +879,10 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { } SCRUBLOGD(pg_id, task_id, "Merging blob scrub results for range [{}, {}]", blob_start, blob_end); - pg_scrub_report->merge(scrub_ctx->peer_sm_map_); + { + std::lock_guard< std::mutex > lock(scrub_ctx->mtx_); + pg_scrub_report->merge(scrub_ctx->peer_sm_map_); + } } SCRUBLOGD(pg_id, task_id, "blob scrub completed, total ranges scrubbed: {}", blob_range_count); } @@ -920,7 +952,7 @@ void ScrubManager::save_scrub_superblk(const pg_id_t pg_id, const bool is_deep_s (*sb).create(sizeof(pg_scrub_superblk)); (*sb)->pg_id = pg_id; (*sb)->last_deep_scrub_timestamp = current_time; - (*sb)->last_deep_scrub_timestamp = current_time; + (*sb)->last_shallow_scrub_timestamp = current_time; (*sb).write(); m_pg_scrub_sb_map.emplace(pg_id, std::move(sb)); return; From 3b62d42275aad3eb89a333e74ebb757b56856c00 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 23 Mar 2026 07:48:03 +0800 Subject: [PATCH 3/4] filter out deleted blobs when scrubbing --- src/lib/homestore_backend/hs_blob_manager.cpp | 27 +++- src/lib/homestore_backend/hs_homeobject.hpp | 2 + .../replication_state_machine.cpp | 3 + src/lib/homestore_backend/scrub_manager.cpp | 126 +++++++++++++++--- src/lib/homestore_backend/scrub_manager.hpp | 9 ++ 5 files changed, 143 insertions(+), 24 deletions(-) diff --git a/src/lib/homestore_backend/hs_blob_manager.cpp b/src/lib/homestore_backend/hs_blob_manager.cpp index c89c32ec5..b3526bd54 100644 --- 
a/src/lib/homestore_backend/hs_blob_manager.cpp +++ b/src/lib/homestore_backend/hs_blob_manager.cpp @@ -88,7 +88,7 @@ BlobManager::AsyncResult< blob_id_t > HSHomeObject::_put_blob(ShardInfo const& s return folly::makeUnexpected(BlobErrorCode::SHUTTING_DOWN); } incr_pending_request_num(); - // check user key size + // check user key size if (blob.user_key.size() > BlobHeader::max_user_key_length) { BLOGE(tid, shard.id, 0, "input user key length > max_user_key_length {}", blob.user_key.size(), BlobHeader::max_user_key_length); @@ -167,8 +167,7 @@ BlobManager::AsyncResult< blob_id_t > HSHomeObject::_put_blob(ShardInfo const& s // Set offset of actual data after the blob header and user key (rounded off) req->blob_header()->data_offset = req->blob_header_buf().size(); - RELEASE_ASSERT(req->blob_header()->data_offset == _data_block_size, - "blob header should equals _data_block_size"); + RELEASE_ASSERT(req->blob_header()->data_offset == _data_block_size, "blob header should equals _data_block_size"); // In case blob body is not aligned, create a new aligned buffer and copy the blob body. if (((r_cast< uintptr_t >(blob.body.cbytes()) % io_align) != 0) || ((blob_size % io_align) != 0)) { // If address or size is not aligned, create a separate aligned buffer and do expensive memcpy. 
@@ -367,9 +366,7 @@ BlobManager::AsyncResult< Blob > HSHomeObject::_get_blob_data(const shared< home
                 }
 
                 auto verify_result = do_verify_blob(read_buf.cbytes(), shard_id, 0 /* no blob_id check */);
-                if (!verify_result.hasValue()) {
-                    return folly::makeUnexpected(verify_result.error());
-                }
+                if (!verify_result.hasValue()) { return folly::makeUnexpected(verify_result.error()); }
 
                 std::string user_key = std::move(verify_result.value());
                 BlobHeader const* header = r_cast< BlobHeader const* >(read_buf.cbytes());
@@ -742,4 +739,25 @@ bool HSHomeObject::verify_blob(const void* blob, const shard_id_t shard_id, cons
     auto result = do_verify_blob(blob, shard_id, blob_id);
     return result.hasValue();
 }
+
+// Pre-commit hook for a DEL_BLOB_MSG log entry (dispatched from ReplicationStateMachine::on_pre_commit).
+// Reports the blob being deleted to the scrub manager, when one exists, and returns true. A corrupted
+// message header trips RELEASE_ASSERT.
+bool HSHomeObject::on_blob_del_pre_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
+                                          cintrusive< homestore::repl_req_ctx >& hs_ctx) {
+    auto msg_header = r_cast< ReplicationMessageHeader* >(const_cast< uint8_t* >(header.cbytes()));
+    if (msg_header->corrupted()) {
+        // since log has been appended, we crash here immediately.
+        RELEASE_ASSERT(false, "corrupted header caught in on_blob_del_pre_commit, lsn={}", lsn);
+        return false;
+    }
+
+    const auto& pg_id = msg_header->pg_id;
+    const auto& shard_id = msg_header->shard_id;
+    const auto& blob_id = *r_cast< blob_id_t const* >(key.cbytes());
+    LOGD("Received del_blob pre-commit for pg={}, shard=0x{:x}, blob_id={}, lsn={}", pg_id, shard_id, blob_id, lsn);
+
+    if (scrub_mgr_) { scrub_mgr_->add_pg_deleted_blob(pg_id, {shard_id, blob_id}, lsn); }
+    return true;
+}
 } // namespace homeobject
diff --git a/src/lib/homestore_backend/hs_homeobject.hpp b/src/lib/homestore_backend/hs_homeobject.hpp
index d5a11c78d..c156bb04f 100644
--- a/src/lib/homestore_backend/hs_homeobject.hpp
+++ b/src/lib/homestore_backend/hs_homeobject.hpp
@@ -942,6 +942,8 @@ class HSHomeObject : public HomeObjectImpl {
 
     bool on_shard_message_pre_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
                                      cintrusive< homestore::repl_req_ctx >& hs_ctx);
+    bool on_blob_del_pre_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
+                                cintrusive< homestore::repl_req_ctx >& hs_ctx);
 
     void on_shard_message_rollback(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
                                    cintrusive< homestore::repl_req_ctx >& hs_ctx);
diff --git a/src/lib/homestore_backend/replication_state_machine.cpp b/src/lib/homestore_backend/replication_state_machine.cpp
index 5d022213b..f3c62fa43 100644
--- a/src/lib/homestore_backend/replication_state_machine.cpp
+++ b/src/lib/homestore_backend/replication_state_machine.cpp
@@ -97,6 +97,9 @@ bool ReplicationStateMachine::on_pre_commit(int64_t lsn, sisl::blob const& heade
     case ReplicationMessageType::SEAL_SHARD_MSG: {
         return home_object_->on_shard_message_pre_commit(lsn, header, key, ctx);
     }
+    case ReplicationMessageType::DEL_BLOB_MSG: {
+        return home_object_->on_blob_del_pre_commit(lsn, header, key, ctx);
+    }
     default: {
         break;
     }
diff --git a/src/lib/homestore_backend/scrub_manager.cpp b/src/lib/homestore_backend/scrub_manager.cpp
index
4908782a4..2d8f6947a 100644
--- a/src/lib/homestore_backend/scrub_manager.cpp
+++ b/src/lib/homestore_backend/scrub_manager.cpp
@@ -187,6 +187,17 @@ bool ScrubManager::add_scrub_map(const pg_id_t pg_id, std::shared_ptr< BaseScrub
     return pg_scrub_ctx->add_scrub_map(std::move(bsm));
 }
 
+// Note on pg_id's scrub context, if one exists, that blob_route was deleted at delete_lsn.
+void ScrubManager::add_pg_deleted_blob(const pg_id_t pg_id, const BlobRoute& blob_route, int64_t delete_lsn) {
+    auto ctx_entry = m_pg_scrub_ctx_map.find(pg_id);
+    if (ctx_entry != m_pg_scrub_ctx_map.end()) {
+        ctx_entry->second->add_deleted_blob(blob_route, delete_lsn);
+        return;
+    }
+
+    LOGDEBUGMOD(scrubmgr, "can not find scrub context for pg_id={}, fail to add deleted blob!", pg_id);
+}
+
 void ScrubManager::handle_scrub_req(std::shared_ptr< base_scrub_req > req) {
     if (!req) {
         LOGERRORMOD(scrubmgr, "scrub req is null, can not handle it!");
@@ -649,6 +660,24 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) {
         const pg_id_t& pg_id;
 
         ~scrub_task_guard() {
+            const auto scrub_ctx_it = pg_scrub_ctx_map.find(pg_id);
+            if (scrub_ctx_it != pg_scrub_ctx_map.end()) {
+                // filter out those deleted blobs in the scrub report.
+
+                // we capture all the deleted blobs when scrubbing is ongoing, and filter them out from scrub report at
+                // the end of scrub task to make sure we will not report those deleted blobs as problematic blobs in
+                // scrub report.
+                // 1 we only capture the deleted blobs when scrubbing, since we only care about those
+                // blobs deleted during scrubbing.
+                // 2 on_pre_commit is called after log append. so, if leader switch happens during pg scrub, the delete
+                // blob log will be rolled back, but the scrub task will also be cancelled, and will not give a final
+                // full scrub report for this scrub task.
+                // 3 if leader switch does not happen during scrub, then all the deleted blobs captured in on_pre_commit
+                // should finally be committed.
+ const auto& scrub_ctx = scrub_ctx_it->second; + scrub_report->filter_out_deleted_blobs(scrub_ctx->deleted_blobs_when_scrubbing_); + } + pg_scrub_ctx_map.erase(pg_id); task.scrub_report_promise->setValue(scrub_report); @@ -740,24 +769,6 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { SCRUBLOGD(pg_id, task_id, "PG meta scrub completed"); } - // scrubbing probably goes with blob deletion, and thus some of blobs might be not present on some - // peers even if we wait for the same scrub_lsn. Theoretically, without a strong consistent snapshot , there is not - // a mechanism to distinguish whether a blob/shard is missing due to deletion or due to lost, this is the - // predicament we are in now with oure current design: - - // 1 no blob delete lsn, - // 2 no shard sealed lsn - // 3 no snapshot which can provide a strong consistent view of the - // blob/shard existence at the scrub_lsn. - - // we can only rely on the best effort of waiting for all peers to reach the same scrub_lsn, but it is not - // guaranteed. As a result, we might have false positive missing blobs due to deletion!!!! - - // TODO: figure out a solution to mitigate the false positive issue, for example, we can add a "blob delete lsn" and - // "shard sealed lsn". for all the missblobs, if its deletd lsn is after scrub_lsn, then it is a false positive - // missing blob, and we can move it out of missblobs. this can be done by leader when merging all the scrub maps for - // a specific scrub req. - // Step 2: Scrub Shard Range SCRUBLOGD(pg_id, task_id, "Starting shard range {} scrub", is_deep_scrub ? 
"deep" : "shallow"); { @@ -1907,4 +1918,83 @@ void ScrubManager::DeepScrubReport::merge(const std::map< peer_id_t, std::shared corrupted_pg_metas.size()); } +void ScrubManager::ShallowScrubReport::filter_out_deleted_blobs( + const folly::ConcurrentHashMap< BlobRoute, int64_t >& deleted_blobs_when_scrubbing) { + size_t total_filtered = 0; + + // Filter out deleted blobs from missing_blobs + for (auto& [peer_id, blob_set] : missing_blobs) { + auto it = blob_set.begin(); + while (it != blob_set.end()) { + if (deleted_blobs_when_scrubbing.find(*it) != deleted_blobs_when_scrubbing.end()) { + // This blob was deleted during scrubbing, remove it from missing blobs + it = blob_set.erase(it); + ++total_filtered; + } else { + ++it; + } + } + } + + // Remove peers with no missing blobs + auto peer_it = missing_blobs.begin(); + while (peer_it != missing_blobs.end()) { + if (peer_it->second.empty()) { + peer_it = missing_blobs.erase(peer_it); + } else { + ++peer_it; + } + } + + if (total_filtered > 0) { + LOGINFOMOD(scrubmgr, "[pg={}] Filtered out {} deleted blobs from shallow scrub report", pg_id_, total_filtered); + } +} + +void ScrubManager::DeepScrubReport::filter_out_deleted_blobs( + const folly::ConcurrentHashMap< BlobRoute, int64_t >& deleted_blobs_when_scrubbing) { + size_t total_filtered = 0; + + // First filter the base class missing_blobs + ShallowScrubReport::filter_out_deleted_blobs(deleted_blobs_when_scrubbing); + + // Filter out deleted blobs from corrupted_blobs + for (auto& [peer_id, blob_map] : corrupted_blobs) { + auto it = blob_map.begin(); + while (it != blob_map.end()) { + if (deleted_blobs_when_scrubbing.find(it->first) != deleted_blobs_when_scrubbing.end()) { + it = blob_map.erase(it); + ++total_filtered; + } else { + ++it; + } + } + } + + // Remove peers with no corrupted blobs + auto peer_it = corrupted_blobs.begin(); + while (peer_it != corrupted_blobs.end()) { + if (peer_it->second.empty()) { + peer_it = corrupted_blobs.erase(peer_it); + } else { + 
++peer_it; + } + } + + // Filter out deleted blobs from inconsistent_blobs + auto blob_it = inconsistent_blobs.begin(); + while (blob_it != inconsistent_blobs.end()) { + if (deleted_blobs_when_scrubbing.find(blob_it->first) != deleted_blobs_when_scrubbing.end()) { + blob_it = inconsistent_blobs.erase(blob_it); + ++total_filtered; + } else { + ++blob_it; + } + } + + if (total_filtered > 0) { + LOGINFOMOD(scrubmgr, "[pg={}] Filtered out {} deleted blobs from deep scrub report", pg_id_, total_filtered); + } +} + } // namespace homeobject \ No newline at end of file diff --git a/src/lib/homestore_backend/scrub_manager.hpp b/src/lib/homestore_backend/scrub_manager.hpp index 771ec5263..f6bf37276 100644 --- a/src/lib/homestore_backend/scrub_manager.hpp +++ b/src/lib/homestore_backend/scrub_manager.hpp @@ -260,6 +260,8 @@ class ScrubManager { const auto& get_missing_blobs() const { return missing_blobs; } virtual void merge(const std::map< peer_id_t, std::shared_ptr< BaseScrubMap > >& peer_sm_map); virtual void print() const; + virtual void + filter_out_deleted_blobs(const folly::ConcurrentHashMap< BlobRoute, int64_t >& deleted_blobs_when_scrubbing); public: std::map< peer_id_t, std::set< shard_id_t > > missing_shard_ids; @@ -291,6 +293,8 @@ class ScrubManager { const auto& get_corrupted_pg_metas() const { return corrupted_pg_metas; } void merge(const std::map< peer_id_t, std::shared_ptr< BaseScrubMap > >& peer_sm_map) override; void print() const override; + virtual void filter_out_deleted_blobs( + const folly::ConcurrentHashMap< BlobRoute, int64_t >& deleted_blobs_when_scrubbing) override; private: std::map< peer_id_t, std::map< BlobRoute, ScrubResult > > corrupted_blobs; @@ -389,6 +393,9 @@ class ScrubManager { std::vector< peer_id_t > get_peers_to_retry() const; void cancel(); bool is_cancelled() const { return cancelled.load(); } + void add_deleted_blob(const BlobRoute& blob_route, int64_t delete_lsn) { + deleted_blobs_when_scrubbing_.try_emplace(blob_route, 
delete_lsn); + } public: uint64_t task_id{0}; @@ -397,6 +404,7 @@ class ScrubManager { atomic_uint64_t req_id{0}; mutable std::mutex mtx_; std::map< peer_id_t, std::shared_ptr< BaseScrubMap > > peer_sm_map_; + folly::ConcurrentHashMap< BlobRoute, int64_t > deleted_blobs_when_scrubbing_; private: std::atomic_bool cancelled{false}; @@ -424,6 +432,7 @@ class ScrubManager { std::optional< pg_scrub_superblk > get_scrub_superblk(const pg_id_t pg_id) const; void save_scrub_superblk(const pg_id_t pg_id, const bool is_deep_scrub, bool force_update = true); void add_scrub_req(std::shared_ptr< base_scrub_req > req); + void add_pg_deleted_blob(const pg_id_t pg_id, const BlobRoute& blob_route, int64_t delete_lsn); /*local scrub*/ public: From 40463ac34020dd3f06cf26c8bb80d2dbfe77e9d6 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Fri, 3 Apr 2026 14:36:55 +0800 Subject: [PATCH 4/4] fix bug and add UT --- .../homestore_backend/hs_shard_manager.cpp | 2 +- src/lib/homestore_backend/scrub_manager.cpp | 123 +++-- src/lib/homestore_backend/scrub_manager.hpp | 6 +- .../tests/homeobj_fixture.hpp | 23 +- .../tests/hs_scrubber_tests.cpp | 445 +++++++++++++++++- 5 files changed, 540 insertions(+), 59 deletions(-) diff --git a/src/lib/homestore_backend/hs_shard_manager.cpp b/src/lib/homestore_backend/hs_shard_manager.cpp index 387515b89..a1567849f 100644 --- a/src/lib/homestore_backend/hs_shard_manager.cpp +++ b/src/lib/homestore_backend/hs_shard_manager.cpp @@ -745,6 +745,7 @@ void HSHomeObject::delete_shard_from_map(shard_id_t shard_id) { (shard_id >> homeobject::shard_width), (shard_id & homeobject::shard_mask)); auto hs_shard = d_cast< HS_Shard* >((*shard_iter->second).get()); const auto pg_id = hs_shard->info.placement_group; + auto p_chunk_id = hs_shard->p_chunk_id(); auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id)); RELEASE_ASSERT(hs_pg, "Missing pg info, pg={}", pg_id); @@ -752,7 +753,6 @@ void HSHomeObject::delete_shard_from_map(shard_id_t shard_id) { 
shards.remove_if([shard_id](auto& shard_it) { return (shard_it->info).id == shard_id; }); _shard_map.erase(shard_id); - auto p_chunk_id = hs_shard->p_chunk_id(); chunk_to_shards_map_[p_chunk_id].erase(shard_id); // TODO:: delete shard meta blk } diff --git a/src/lib/homestore_backend/scrub_manager.cpp b/src/lib/homestore_backend/scrub_manager.cpp index 2d8f6947a..92fb77033 100644 --- a/src/lib/homestore_backend/scrub_manager.cpp +++ b/src/lib/homestore_backend/scrub_manager.cpp @@ -260,15 +260,18 @@ void ScrubManager::handle_scrub_req(std::shared_ptr< base_scrub_req > req) { } // 2 send scrub map back to leader - auto flatbuffer = scrub_map->build_flat_buffer(); + auto flatbuffer = std::make_shared< flatbuffers::DetachedBuffer >(scrub_map->build_flat_buffer()); + auto scrub_type_buffer = std::make_shared< SCRUB_TYPE >(scrub_type); + sisl::io_blob_list_t blob_list; - blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false); - blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); + blob_list.emplace_back(reinterpret_cast< const uint8_t* >(scrub_type_buffer.get()), sizeof(scrub_type), false); + blob_list.emplace_back(flatbuffer->data(), flatbuffer->size(), false); // no need to retry, leader will handle retries pg_repl_dev->data_request_unidirectional(remote_peer_id, HSHomeObject::PUSH_SCRUB_MAP, blob_list) .via(&folly::InlineExecutor::instance()) - .thenValue([pg_id, remote_peer_id, scrub_type, task_id](auto&& response) { + .thenValue([pg_id, remote_peer_id, task_id, flatbuffer, scrub_type_buffer](auto&& response) { + const auto scrub_type = *scrub_type_buffer; if (response.hasError()) { SCRUBLOGD(pg_id, task_id, "failed to send scrub map to peer {}, scrub_type:{}, error={}", remote_peer_id, scrub_type, response.error()); @@ -528,12 +531,12 @@ folly::SemiFuture< std::shared_ptr< ScrubManager::ShallowScrubReport > > ScrubManager::submit_scrub_task(const pg_id_t& pg_id, const bool is_deep, const bool force, 
SCRUB_TRIGGER_TYPE trigger_type) { LOGINFOMOD(scrubmgr, "submit a scrub task for pg={}, deep_scrub:{}", pg_id, is_deep); + + // Check if a scrub task is already running for this PG + // Note: There's still a small race window between this check and task execution in handle_pg_scrub_task, + // but the try_emplace there provides the final guard. This check prevents unnecessary task queueing. auto it = m_pg_scrub_ctx_map.find(pg_id); if (it != m_pg_scrub_ctx_map.end()) { - // TODO:: there is case that two thread try to submit scrub task for the same pg at the same time, we can - // optimize it by adding a lock for each pg or using atomic operation to make sure only one scrub task can be - // submitted for each pg, and other threads can get the existing scrub task if they want to submit another scrub - // task for the same pg. LOGWARNMOD(scrubmgr, "a scrub task is already running for pg={}, no need to submit another one!", pg_id); return folly::makeFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); } @@ -588,22 +591,27 @@ void ScrubManager::cancel_scrub_task(const pg_id_t& pg_id) { } // Helper function to send scrub requests to all peers and handle retries -bool ScrubManager::send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, +bool ScrubManager::send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, shared< homestore::ReplDev > pg_repl_dev, const std::unordered_set< peer_id_t >& all_member_peer_ids, - const peer_id_t& my_uuid, shared< homestore::ReplDev > pg_repl_dev, - const sisl::io_blob_list_t& req_blob_list, - std::shared_ptr< PGScrubContext > scrub_ctx, uint32_t max_retries, - std::chrono::seconds timeout, const std::string& scrub_type_name) { + const peer_id_t& my_uuid, + std::shared_ptr< flatbuffers::DetachedBuffer > flat_buffer, + SCRUB_TYPE scrub_type, std::shared_ptr< PGScrubContext > scrub_ctx, + uint32_t max_retries, std::chrono::seconds timeout) { + auto scrub_type_buffer = std::make_shared< SCRUB_TYPE >(scrub_type); + 
sisl::io_blob_list_t blob_list; + blob_list.emplace_back(reinterpret_cast< const uint8_t* >(scrub_type_buffer.get()), sizeof(scrub_type), false); + blob_list.emplace_back(flat_buffer->data(), flat_buffer->size(), false); + // Lambda to send requests to a list of peers - auto send_requests_to_remote_peers = [&](const auto& peer_list, bool is_retry) { + auto send_requests_to_remote_peers = [&, flat_buffer, scrub_type_buffer](const auto& peer_list, bool is_retry) { for (const auto& peer_id : peer_list) { if (peer_id == my_uuid) continue; - pg_repl_dev->data_request_unidirectional(peer_id, HSHomeObject::PUSH_SCRUB_REQ, req_blob_list) + pg_repl_dev->data_request_unidirectional(peer_id, HSHomeObject::PUSH_SCRUB_REQ, blob_list) .via(&folly::InlineExecutor::instance()) - .thenValue([pg_id, peer_id, task_id, scrub_type_name, is_retry](auto&& response) { + .thenValue([pg_id, peer_id, task_id, flat_buffer, scrub_type_buffer, is_retry](auto&& response) { if (response.hasError()) { SCRUBLOGE(pg_id, task_id, "{} to send {} scrub request to peer {}", - is_retry ? "retry failed" : "failed", scrub_type_name, peer_id); + is_retry ? 
"retry failed" : "failed", *scrub_type_buffer, peer_id); } }); } @@ -618,7 +626,7 @@ bool ScrubManager::send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, auto peers_to_retry = scrub_ctx->get_peers_to_retry(); if (peers_to_retry.empty()) break; - SCRUBLOGD(pg_id, task_id, "Retrying {} scrub for {} peers", scrub_type_name, peers_to_retry.size()); + SCRUBLOGD(pg_id, task_id, "Retrying {} scrub for {} peers", scrub_type, peers_to_retry.size()); send_requests_to_remote_peers(peers_to_retry, true); if (scrub_ctx->wait_for_all_req_sms(timeout)) break; @@ -632,7 +640,7 @@ bool ScrubManager::send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, is_incomplete = scrub_ctx->peer_sm_map_.size() != scrub_ctx->member_peer_ids_.size(); } if (scrub_ctx->is_cancelled() || is_incomplete) { - SCRUBLOGD(pg_id, task_id, "scrub task is cancelled or incomplete when scrubbing {}!", scrub_type_name); + SCRUBLOGD(pg_id, task_id, "scrub task is cancelled or incomplete when scrubbing {}!", scrub_type); return false; } return true; @@ -707,6 +715,8 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { RELEASE_ASSERT(all_member_peer_ids.find(my_uuid) != all_member_peer_ids.end(), "my uuid={} is not in the member list of this pg, something is wrong!", my_uuid); + // Use try_emplace for atomic check-and-insert to avoid race condition + // This is safe because try_emplace is atomic in folly::ConcurrentHashMap auto [ctx_it, happened] = m_pg_scrub_ctx_map.try_emplace(pg_id, std::make_shared< PGScrubContext >(task_id, all_member_peer_ids)); if (!happened) { @@ -736,14 +746,11 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { SCRUBLOGD(pg_id, task_id, "Starting PG meta scrub"); auto pg_meta_req = std::make_shared< base_scrub_req >(task_id, scrub_ctx->req_id.fetch_add(1), scrub_lsn, my_uuid, pg_id, true); - // TODO:: add a lock here to protect add_scrub_map when changing current_req. 
- scrub_ctx->current_req = pg_meta_req; - // Send requests to all peers - auto flatbuffer = pg_meta_req->build_flat_buffer(); - sisl::io_blob_list_t req_blob_list; - const auto scrub_type = SCRUB_TYPE::PG_META; - req_blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false); - req_blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); + // Protect current_req update with lock to avoid race with add_scrub_map + { + std::lock_guard lock(scrub_ctx->mtx_); + scrub_ctx->current_req = pg_meta_req; + } // Scrub locally async (runs in parallel with remote requests) m_scrub_req_executor->add([this, pg_meta_req, scrub_ctx, pg_id, task_id]() { @@ -756,8 +763,10 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { }); // Send requests to all peers and wait for responses - if (!send_scrub_req_and_wait(pg_id, task_id, all_member_peer_ids, my_uuid, pg_repl_dev, req_blob_list, - scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT, "PG meta")) { + auto flatbuffer = std::make_shared< flatbuffers::DetachedBuffer >(pg_meta_req->build_flat_buffer()); + const auto scrub_type = SCRUB_TYPE::PG_META; + if (!send_scrub_req_and_wait(pg_id, task_id, pg_repl_dev, all_member_peer_ids, my_uuid, flatbuffer, scrub_type, + scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT)) { return; } @@ -791,17 +800,21 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { // Scrub shard range uint64_t shard_start = 0; - uint64_t shard_end = shard_scrub_range_size; uint64_t shard_range_count = 0; - for (; shard_start <= last_shard_id; - shard_start = shard_end + 1, shard_end = std::min(shard_end + shard_scrub_range_size, last_shard_id)) { + while (shard_start <= last_shard_id) { + // Calculate end of current range (inclusive), ensuring it doesn't exceed last_shard_id + uint64_t shard_end = std::min(shard_start + shard_scrub_range_size - 1, last_shard_id); ++shard_range_count; SCRUBLOGD(pg_id, task_id, "Scrubbing shard range {}: [{}, {}]", shard_range_count, 
shard_start, shard_end); auto shard_req = std::make_shared< shard_scrub_req >(task_id, scrub_ctx->req_id.fetch_add(1), scrub_lsn, my_uuid, pg_id, shard_start, shard_end, is_deep_scrub); scrub_ctx->reset_for_new_req(); - scrub_ctx->current_req = shard_req; + // Protect current_req update with lock to avoid race with add_scrub_map + { + std::lock_guard lock(scrub_ctx->mtx_); + scrub_ctx->current_req = shard_req; + } // scrub locally async (runs in parallel with remote requests) m_scrub_req_executor->add([this, shard_req, scrub_ctx, pg_id, task_id, is_deep_scrub]() { @@ -815,23 +828,23 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { }); // request remote peers to scrub this shard range and wait for responses - auto flatbuffer = shard_req->build_flat_buffer(); - sisl::io_blob_list_t req_blob_list; + auto flatbuffer = std::make_shared< flatbuffers::DetachedBuffer >(shard_req->build_flat_buffer()); const auto scrub_type = is_deep_scrub ? SCRUB_TYPE::DEEP_SHARD : SCRUB_TYPE::SHALLOW_SHARD; - req_blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false); - req_blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); - if (!send_scrub_req_and_wait(pg_id, task_id, all_member_peer_ids, my_uuid, pg_repl_dev, req_blob_list, - scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT, "shard")) { + if (!send_scrub_req_and_wait(pg_id, task_id, pg_repl_dev, all_member_peer_ids, my_uuid, flatbuffer, + scrub_type, scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT)) { SCRUBLOGE(pg_id, task_id, "shard scrub failed or was cancelled"); return; } SCRUBLOGD(pg_id, task_id, "Merging shard scrub results for range [{}, {}]", shard_start, shard_end); { - std::lock_guard< std::mutex > lock(scrub_ctx->mtx_); + std::lock_guard lock(scrub_ctx->mtx_); pg_scrub_report->merge(scrub_ctx->peer_sm_map_); } + + // Move to next range + shard_start = shard_end + 1; } SCRUBLOGD(pg_id, task_id, "shard scrub completed, total ranges scrubbed: {}", 
shard_range_count); } @@ -853,17 +866,21 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { // Scrub blob range uint64_t blob_start = 0; - uint64_t blob_end = blob_scrub_range_size; uint64_t blob_range_count = 0; - for (; blob_start <= last_blob_id; - blob_start = blob_end + 1, blob_end = std::min(blob_end + blob_scrub_range_size, last_blob_id)) { + while (blob_start <= last_blob_id) { + // Calculate end of current range (inclusive), ensuring it doesn't exceed last_blob_id + uint64_t blob_end = std::min(blob_start + blob_scrub_range_size - 1, last_blob_id); ++blob_range_count; SCRUBLOGD(pg_id, task_id, "Scrubbing blob range {}: [{}, {}]", blob_range_count, blob_start, blob_end); auto blob_req = std::make_shared< blob_scrub_req >(task_id, scrub_ctx->req_id.fetch_add(1), scrub_lsn, my_uuid, pg_id, blob_start, blob_end, is_deep_scrub); scrub_ctx->reset_for_new_req(); - scrub_ctx->current_req = blob_req; + // Protect current_req update with lock to avoid race with add_scrub_map + { + std::lock_guard< std::mutex > lock(scrub_ctx->mtx_); + scrub_ctx->current_req = blob_req; + } // locally scrub this blob range async (runs in parallel with remote requests) m_scrub_req_executor->add([this, blob_req, scrub_ctx, pg_id, task_id, is_deep_scrub]() { @@ -877,14 +894,10 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { }); // request remote peers to scrub this blob range and wait for responses - auto flatbuffer = blob_req->build_flat_buffer(); - sisl::io_blob_list_t req_blob_list; + auto flatbuffer = std::make_shared< flatbuffers::DetachedBuffer >(blob_req->build_flat_buffer()); const auto scrub_type = is_deep_scrub ? 
SCRUB_TYPE::DEEP_BLOB : SCRUB_TYPE::SHALLOW_BLOB; - req_blob_list.emplace_back(reinterpret_cast< const uint8_t* >(&scrub_type), sizeof(scrub_type), false); - req_blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); - - if (!send_scrub_req_and_wait(pg_id, task_id, all_member_peer_ids, my_uuid, pg_repl_dev, req_blob_list, - scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT, "blob")) { + if (!send_scrub_req_and_wait(pg_id, task_id, pg_repl_dev, all_member_peer_ids, my_uuid, flatbuffer, + scrub_type, scrub_ctx, MAX_RETRIES, SM_REQUEST_TIMEOUT)) { SCRUBLOGE(pg_id, task_id, "blob scrub failed or was cancelled"); return; } @@ -894,6 +907,9 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { std::lock_guard< std::mutex > lock(scrub_ctx->mtx_); pg_scrub_report->merge(scrub_ctx->peer_sm_map_); } + + // Move to next range + blob_start = blob_end + 1; } SCRUBLOGD(pg_id, task_id, "blob scrub completed, total ranges scrubbed: {}", blob_range_count); } @@ -901,6 +917,11 @@ void ScrubManager::handle_pg_scrub_task(scrub_task task) { // only if pg is successfully scrubbed, we persist scrub metablk. save_scrub_superblk(pg_id, is_deep_scrub, true); SCRUBLOGD(pg_id, task_id, "successfully complete {} scrub task!", is_deep_scrub ? 
"deep" : "shallow"); + +#ifdef _PRERELEASE + // Trigger the callback flip to delete missing blob during scrub if enabled + iomgr_flip::instance()->callback_flip("delete_missing_blob_through_raft"); +#endif } void ScrubManager::add_pg(const pg_id_t pg_id) { diff --git a/src/lib/homestore_backend/scrub_manager.hpp b/src/lib/homestore_backend/scrub_manager.hpp index f6bf37276..c4e79d46b 100644 --- a/src/lib/homestore_backend/scrub_manager.hpp +++ b/src/lib/homestore_backend/scrub_manager.hpp @@ -445,11 +445,11 @@ class ScrubManager { void scan_pg_for_scrub(); void handle_pg_scrub_task(scrub_task task); - bool send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, + bool send_scrub_req_and_wait(pg_id_t pg_id, uint64_t task_id, shared< homestore::ReplDev > pg_repl_dev, const std::unordered_set< peer_id_t >& all_member_peer_ids, const peer_id_t& my_uuid, - shared< homestore::ReplDev > pg_repl_dev, const sisl::io_blob_list_t& req_blob_list, + std::shared_ptr< flatbuffers::DetachedBuffer > flat_buffer, SCRUB_TYPE scrub_type, std::shared_ptr< PGScrubContext > scrub_ctx, uint32_t max_retries, - std::chrono::seconds timeout, const std::string& scrub_type_name); + std::chrono::seconds timeout); bool is_eligible_for_deep_scrub(const pg_id_t& pg_id); bool is_eligible_for_shallow_scrub(const pg_id_t& pg_id); diff --git a/src/lib/homestore_backend/tests/homeobj_fixture.hpp b/src/lib/homestore_backend/tests/homeobj_fixture.hpp index 499968ab3..955884847 100644 --- a/src/lib/homestore_backend/tests/homeobj_fixture.hpp +++ b/src/lib/homestore_backend/tests/homeobj_fixture.hpp @@ -49,7 +49,7 @@ class HomeObjectFixture : public ::testing::Test { HSHomeObject::_hs_chunk_size = SISL_OPTIONS["chunk_size"].as< uint64_t >() * Mi; _obj_inst = std::dynamic_pointer_cast< HSHomeObject >(g_helper->build_new_homeobject()); - + // Used to export metrics, it should be called after init_homeobject if (SISL_OPTIONS["enable_http"].as< bool >()) { g_helper->app->start_http_server(); } if 
(!g_helper->is_current_testcase_restarted()) { @@ -906,6 +906,27 @@ class HomeObjectFixture : public ::testing::Test { LOGINFO("Flip {} set", flip_name); } + void set_callback_flip(const std::string flip_name, std::function< void() > callback, uint32_t count = 1, + uint32_t percent = 100) { + flip::FlipCondition null_cond; + flip::FlipFrequency freq; + freq.set_count(count); + freq.set_percent(percent); + m_fc.inject_callback_flip(flip_name, {null_cond}, freq, callback); + LOGINFO("Flip {} with callback set", flip_name); + } + + template < typename T > + void set_callback_retval_flip(const std::string flip_name, std::function< T() > callback, uint32_t count = 1, + uint32_t percent = 100) { + flip::FlipCondition null_cond; + flip::FlipFrequency freq; + freq.set_count(count); + freq.set_percent(percent); + ASSERT_TRUE(m_fc.inject_callback_retval_flip(flip_name, {null_cond}, freq, callback)); + LOGINFO("Flip {} with callback retval set", flip_name); + } + void remove_flip(const std::string flip_name) { m_fc.remove_flip(flip_name); LOGINFO("Flip {} removed", flip_name); diff --git a/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp b/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp index ab7636180..7b6881e37 100644 --- a/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp +++ b/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp @@ -247,8 +247,6 @@ TEST_F(HomeObjectFixture, BasicScrubTest) { << "Empty PG should have no missing shards in shallow scrub"; }); - g_helper->sync(); - // Create blobs in all shards shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs_per_shard, pg_blob_id); LOGINFO("Created {} blobs per shard, total {} blobs", num_blobs_per_shard, num_shards * num_blobs_per_shard); @@ -289,6 +287,7 @@ TEST_F(HomeObjectFixture, BasicScrubTest) { }); g_helper->sync(); + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; @@ -526,7 +525,6 @@ TEST_F(HomeObjectFixture, 
ScrubSuperblockPersistenceTest) { const uint64_t shard_size = 64 * Mi; create_shard(pg_id, shard_size, "shard_meta"); - auto scrub_mgr = _obj_inst->scrub_manager(); run_on_pg_leader(pg_id, [&]() { @@ -565,5 +563,446 @@ TEST_F(HomeObjectFixture, ScrubSuperblockPersistenceTest) { << "Shallow scrub timestamp should be updated"; }); + g_helper->sync(); +} + +// Test cancel scrub task +TEST_F(HomeObjectFixture, CancelScrubTaskTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t shard_size = 64 * Mi; + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + + const uint64_t num_blobs = 10; + put_blobs(pg_shard_id_vec, num_blobs, pg_blob_id); + g_helper->sync(); + + // Submit a scrub task and then cancel it + run_on_pg_leader(pg_id, [&]() { + auto scrub_future = scrub_mgr->submit_scrub_task(pg_id, true, false, SCRUB_TRIGGER_TYPE::MANUALLY); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + scrub_mgr->cancel_scrub_task(pg_id); + LOGINFO("Cancelled scrub task for pg={}", pg_id); + auto scrub_report = std::move(scrub_future).get(); + + // The report might be null or have partial results due to cancellation + // We just verify that cancel doesn't cause crash + LOGINFO("Scrub task cancelled, report: {}", scrub_report ? 
"present" : "null"); + }); + + // Test canceling when no task is running - should not crash + run_on_pg_leader(pg_id, [&]() { + scrub_mgr->cancel_scrub_task(pg_id); + LOGINFO("Cancel non-existent scrub task for pg={} - should not crash", pg_id); + }); + + g_helper->sync(); +} + +// Test concurrent scrubs on multiple PGs +TEST_F(HomeObjectFixture, ConcurrentScrubsOnMultiplePGsTest) { + const uint64_t num_pgs = 3; + const uint64_t shard_size = 64 * Mi; + + std::vector< pg_id_t > pg_ids; + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + + // Create multiple PGs with shards and blobs + for (uint64_t i = 1; i <= num_pgs; ++i) { + pg_id_t pg_id = i; + pg_ids.push_back(pg_id); + create_pg(pg_id); + auto shard_info = create_shard(pg_id, shard_size, "shard meta " + std::to_string(pg_id)); + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + put_blobs(pg_shard_id_vec, 5, pg_blob_id); + } + + auto scrub_mgr = _obj_inst->scrub_manager(); + + // Submit scrub tasks for all PGs concurrently + std::vector< folly::SemiFuture< std::shared_ptr< ScrubManager::ShallowScrubReport > > > scrub_futures; + + for (const auto& pg_id : pg_ids) { + run_on_pg_leader(pg_id, [&]() { + auto future = scrub_mgr->submit_scrub_task(pg_id, true, false, SCRUB_TRIGGER_TYPE::MANUALLY); + scrub_futures.push_back(std::move(future)); + LOGINFO("Submitted deep scrub for pg={}", pg_id); + }); + } + + // Wait for all scrub tasks to complete + for (size_t i = 0; i < scrub_futures.size(); ++i) { + auto report = std::move(scrub_futures[i]).get(); + if (report) { + LOGINFO("PG {} scrub completed, report present", pg_ids[i]); + } else { + LOGWARN("PG {} scrub returned null report", pg_ids[i]); + } + } + + g_helper->sync(); +} + +// Test deleted blob filter in scrub report +TEST_F(HomeObjectFixture, DeletedBlobFilterTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const 
uint64_t shard_size = 64 * Mi; + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + + std::map< shard_id_t, std::map< blob_id_t, uint64_t > > shard_blob_ids_map; + + // Create some blobs + const uint64_t num_blobs = 10; + shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs, pg_blob_id); + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + const auto shard_id = shard_info.id; + auto& shard_blobs = shard_blob_ids_map[shard_id]; + + // Select blobs to test: + // - missing_blob_to_delete: will be missing from the follower index AND deleted via blob delete + // - missing_blob_not_deleted: will be missing from the follower index but NOT deleted + auto it = shard_blobs.begin(); + const auto missing_blob_to_delete = it->first; // First blob: will be deleted via blob delete + const auto missing_blob_not_deleted = (++it)->first; // Second blob: will NOT be deleted + + // Delete both blobs from index table to simulate missing blobs on followers + run_on_pg_follower(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + delete_blob_from_index(pg_index_table, shard_id, missing_blob_to_delete); + delete_blob_from_index(pg_index_table, shard_id, missing_blob_not_deleted); + LOGINFO("Deleted blobs {} and {} from leader index table", missing_blob_to_delete, missing_blob_not_deleted); + }); + + g_helper->sync(); + + run_on_pg_leader(pg_id, [&]() { + // only the blob that was deleted via blob delete should be filtered out, the other missing blob should be + // reported in the scrub report + std::set< peer_id_t > follower_peer_ids; + const auto& leader_uuid = _obj_inst->our_uuid(); + const auto& members = (hs_pg->pg_info_).members; + for (const auto& member : members) { + if (member.id == leader_uuid) { continue; } + 
follower_peer_ids.insert(member.id); + } + + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false /* shallow */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + auto missing_blobs = scrub_report->get_missing_blobs(); + for (const auto& peer_id : follower_peer_ids) { + auto it = missing_blobs.find(peer_id); + ASSERT_TRUE(it != missing_blobs.end()) << "Missing blob for follower should be reported in scrub report"; + EXPECT_TRUE(it->second.size() == 2) << "There should be two missing blobs for leader in scrub report"; + EXPECT_TRUE(it->second.count(BlobRoute{shard_id, missing_blob_to_delete}) == 1) + << "The missing blob that will be deleted should be reported in scrub report"; + EXPECT_TRUE(it->second.count(BlobRoute{shard_id, missing_blob_not_deleted}) == 1) + << "The missing blob that will NOT be deleted should be reported in scrub report"; + } + +#ifdef _PRERELEASE + set_callback_flip( + "delete_missing_blob_through_raft", std::function< void() >([this, missing_blob_to_delete, shard_id]() { + auto ret = + _obj_inst->blob_manager()->del(shard_id, missing_blob_to_delete, generateRandomTraceId()).get(); + LOGINFO("Blob delete via callback flip completed, ret={}", ret.hasValue()); + })); + + scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false /* shallow */, false /* force */, SCRUB_TRIGGER_TYPE::MANUALLY) + .get(); + + remove_flip("delete_missing_blob_through_raft"); + + // Verify the scrub report + ASSERT_NE(scrub_report, nullptr) << "Scrub report should not be null"; + + missing_blobs = scrub_report->get_missing_blobs(); + for (const auto& peer_id : follower_peer_ids) { + auto it = missing_blobs.find(peer_id); + ASSERT_TRUE(it != missing_blobs.end()) << "Missing blob for follower should be reported in scrub report"; + EXPECT_TRUE(it->second.size() == 1) << "There should be one missing blob for leader in scrub report"; + EXPECT_TRUE(it->second.count(BlobRoute{shard_id, missing_blob_not_deleted}) == 1) + << "The missing blob that was 
not deleted should be reported in scrub report"; + } +#endif + }); + + g_helper->sync(); + LOGINFO("DeletedBlobFilterTest completed successfully"); +} + +// Test add and remove PG from scrub manager +TEST_F(HomeObjectFixture, AddRemovePGScrubTest) { + const pg_id_t pg_id = 1; + const uint64_t shard_size = 64 * Mi; + + // Create PG and verify scrub superblock is created + create_pg(pg_id); + create_shard(pg_id, shard_size, "shard meta"); + + auto scrub_mgr = _obj_inst->scrub_manager(); + + // Verify scrub superblock exists + run_on_pg_leader(pg_id, [&]() { + auto sb = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(sb.has_value()) << "Scrub superblock should exist after PG creation"; + LOGINFO("Scrub superblock created for pg={}", pg_id); + }); + + // Run a scrub to update timestamps + run_on_pg_leader(pg_id, [&]() { + // Get initial timestamp before scrub + auto sb_before = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(sb_before.has_value()) << "Scrub superblock should exist before scrub"; + uint64_t timestamp_before = sb_before->last_shallow_scrub_timestamp; + LOGINFO("Timestamp before scrub: {}", timestamp_before); + + // Wait a bit to ensure timestamp will be different + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + auto report = scrub_mgr->submit_scrub_task(pg_id, false, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(report, nullptr) << "Scrub report should not be null"; + + // Verify timestamp was updated after scrub + auto sb_after = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(sb_after.has_value()) << "Scrub superblock should exist after scrub"; + uint64_t timestamp_after = sb_after->last_shallow_scrub_timestamp; + EXPECT_GT(timestamp_after, timestamp_before) << "Shallow scrub timestamp should be updated after scrub"; + LOGINFO("Timestamp after scrub: {} (updated from {})", timestamp_after, timestamp_before); + }); + + // Now delete the PG - this should cancel any running scrub and remove superblock + 
run_on_pg_leader(pg_id, [&]() { + _obj_inst->pg_manager()->destroy_pg(pg_id); + LOGINFO("Deleted pg={}", pg_id); + }); + + // Submit a scrub for the deleted PG - it should return a null report + run_on_pg_leader(pg_id, [&]() { + auto report = scrub_mgr->submit_scrub_task(pg_id, false, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_EQ(report, nullptr) << "Scrub report should be null after PG deletion"; + LOGINFO("Scrub task for deleted pg={} returned null report as expected", pg_id); + }); + + // Wait for PG to be deleted + std::this_thread::sleep_for(std::chrono::seconds(2)); + g_helper->sync(); + + // Verify scrub superblock is cleaned up - get_scrub_superblk should return nullopt + // Note: This might not be directly testable without internal access, so we just verify no crash + LOGINFO("PG deleted, scrub manager should have cleaned up"); +} + +// Test local scrub methods +TEST_F(HomeObjectFixture, LocalScrubMethodsTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t shard_size = 64 * Mi; + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + + // Create blobs first + const uint64_t num_blobs = 10; + auto shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs, pg_blob_id); + LOGINFO("Created {} blobs for local scrub test", num_blobs); + + g_helper->sync(); + + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + const auto shard_id = shard_info.id; + auto& shard_blobs = shard_blob_ids_map[shard_id]; + + // Select blobs to corrupt + auto it = shard_blobs.begin(); + const auto corrupted_blob_id = it->first; + + // Corrupt blobs on the local node + run_on_pg_leader(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + + // Make corrupted_blob_id corrupted 
(corrupt data) + corrupt_blob_data(pg_index_table, shard_id, corrupted_blob_id); + LOGINFO("Corrupted blob {} on leader", corrupted_blob_id); + }); + + g_helper->sync(); + + run_on_pg_leader(pg_id, [&]() { + // Create a shard scrub request + auto shard_req = + std::make_shared< ScrubManager::shard_scrub_req >(1, 1, 0, _obj_inst->our_uuid(), pg_id, 0, 100, false); + + // Test local_scrub_shard (shallow) + auto shallow_shard_map = scrub_mgr->local_scrub_shard(shard_req); + ASSERT_NE(shallow_shard_map, nullptr); + EXPECT_EQ(shallow_shard_map->get_scrub_type(), SCRUB_TYPE::SHALLOW_SHARD); + LOGINFO("Shallow shard scrub returned map with {} shards", shallow_shard_map->shards.size()); + + // Create a deep shard scrub request + auto deep_shard_req = + std::make_shared< ScrubManager::shard_scrub_req >(1, 1, 0, _obj_inst->our_uuid(), pg_id, 0, 100, true); + + // Test local_scrub_shard (deep) + auto deep_shard_map = scrub_mgr->local_scrub_shard(deep_shard_req); + ASSERT_NE(deep_shard_map, nullptr); + EXPECT_EQ(deep_shard_map->get_scrub_type(), SCRUB_TYPE::DEEP_SHARD); + LOGINFO("Deep shard scrub returned map with {} shards", deep_shard_map->shards.size()); + + // Test scrub_pg_meta + auto pg_meta_req = + std::make_shared< ScrubManager::base_scrub_req >(1, 1, 0, _obj_inst->our_uuid(), pg_id, true); + auto pg_meta_map = scrub_mgr->scrub_pg_meta(pg_meta_req); + ASSERT_NE(pg_meta_map, nullptr); + EXPECT_EQ(pg_meta_map->get_scrub_type(), SCRUB_TYPE::PG_META); + LOGINFO("PG meta scrub completed"); + + // Test local_scrub_blob (shallow) + auto shallow_blob_req = + std::make_shared< ScrubManager::blob_scrub_req >(1, 1, 0, _obj_inst->our_uuid(), pg_id, 0, 100, false); + auto shallow_blob_map = scrub_mgr->local_scrub_blob(shallow_blob_req); + // May be null if no blobs exist in range + if (shallow_blob_map) { + EXPECT_EQ(shallow_blob_map->get_scrub_type(), SCRUB_TYPE::SHALLOW_BLOB); + LOGINFO("Shallow blob scrub completed"); + } + + // Test local_scrub_blob (deep) - should detect 
corrupted and inconsistent blobs + auto deep_blob_req = + std::make_shared< ScrubManager::blob_scrub_req >(1, 1, 0, _obj_inst->our_uuid(), pg_id, 0, 100, true); + auto deep_blob_map = scrub_mgr->local_scrub_blob(deep_blob_req); + ASSERT_NE(deep_blob_map, nullptr); + EXPECT_EQ(deep_blob_map->get_scrub_type(), SCRUB_TYPE::DEEP_BLOB); + auto deep_blob_map_cast = std::dynamic_pointer_cast< ScrubManager::DeepBlobScrubMap >(deep_blob_map); + LOGINFO("Deep blob scrub completed, found {} blobs", deep_blob_map_cast->blobs.size()); + + // Check for corrupted blob + auto corrupted_it = deep_blob_map_cast->blobs.find(BlobRoute{shard_id, corrupted_blob_id}); + EXPECT_TRUE(corrupted_it != deep_blob_map_cast->blobs.end()) << "Corrupted blob should be in deep scrub result"; + if (corrupted_it != deep_blob_map_cast->blobs.end()) { + auto result = std::get_if< ScrubResult >(&corrupted_it->second); + ASSERT_TRUE(result != nullptr) << "Corrupted blob result should be ScrubResult"; + EXPECT_EQ(*result, ScrubResult::MISMATCH) << "Corrupted blob should have MISMATCH result"; + LOGINFO("Deep scrub correctly detected corrupted blob {}", corrupted_blob_id); + } + }); + + g_helper->sync(); +} + +// Test scrub request serialization and deserialization +TEST_F(HomeObjectFixture, ScrubRequestSerializationTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t shard_size = 64 * Mi; + create_shard(pg_id, shard_size, "shard meta"); + run_on_pg_leader(pg_id, [&]() { + auto my_uuid = _obj_inst->our_uuid(); + + // Test base_scrub_req serialization + { + auto req = std::make_shared< ScrubManager::base_scrub_req >(1, 1, 100, my_uuid, pg_id, true); + + // Serialize + auto buffer = req->build_flat_buffer(); + EXPECT_GT(buffer.size(), 0) << "Serialized buffer should not be empty"; + + // Deserialize + auto req_loaded = std::make_shared< ScrubManager::base_scrub_req >(); + bool load_success = req_loaded->load(buffer.data(), buffer.size()); + 
EXPECT_TRUE(load_success) << "Deserialization should succeed"; + + // Verify fields + EXPECT_EQ(req_loaded->pg_id, pg_id); + EXPECT_EQ(req_loaded->task_id, 1); + EXPECT_EQ(req_loaded->req_id, 1); + EXPECT_EQ(req_loaded->scrub_lsn, 100); + + LOGINFO("base_scrub_req serialization test passed"); + } + + // Test blob_scrub_req serialization + { + auto req = std::make_shared< ScrubManager::blob_scrub_req >(1, 2, 200, my_uuid, pg_id, 100, 200, true); + + // Serialize + auto buffer = req->build_flat_buffer(); + EXPECT_GT(buffer.size(), 0); + + // Deserialize + auto req_loaded = std::make_shared< ScrubManager::blob_scrub_req >(); + bool load_success = req_loaded->load(buffer.data(), buffer.size()); + EXPECT_TRUE(load_success); + + // Verify fields + EXPECT_EQ(req_loaded->pg_id, pg_id); + EXPECT_EQ(req_loaded->task_id, 1); + EXPECT_EQ(req_loaded->req_id, 2); + EXPECT_EQ(req_loaded->scrub_lsn, 200); + EXPECT_EQ(req_loaded->start, 100); + EXPECT_EQ(req_loaded->end, 200); + EXPECT_TRUE(req_loaded->is_deep_scrub()); + EXPECT_EQ(req_loaded->get_scrub_type(), SCRUB_TYPE::DEEP_BLOB); + + LOGINFO("blob_scrub_req serialization test passed"); + } + + // Test shard_scrub_req serialization + { + auto req = std::make_shared< ScrubManager::shard_scrub_req >(1, 3, 300, my_uuid, pg_id, 0, 100, false); + + // Serialize + auto buffer = req->build_flat_buffer(); + EXPECT_GT(buffer.size(), 0); + + // Deserialize + auto req_loaded = std::make_shared< ScrubManager::shard_scrub_req >(); + bool load_success = req_loaded->load(buffer.data(), buffer.size()); + EXPECT_TRUE(load_success); + + // Verify fields + EXPECT_EQ(req_loaded->pg_id, pg_id); + EXPECT_EQ(req_loaded->task_id, 1); + EXPECT_EQ(req_loaded->req_id, 3); + EXPECT_EQ(req_loaded->scrub_lsn, 300); + EXPECT_EQ(req_loaded->start, 0); + EXPECT_EQ(req_loaded->end, 100); + EXPECT_FALSE(req_loaded->is_deep_scrub()); + EXPECT_EQ(req_loaded->get_scrub_type(), SCRUB_TYPE::SHALLOW_SHARD); + + LOGINFO("shard_scrub_req serialization test passed"); + } 
+ }); + g_helper->sync(); } \ No newline at end of file